diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 00000000..514c0c48 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,54 @@ +name: Bug report +description: Report a bug. +title: "" +labels: [bug] + +body: + - type: markdown + attributes: + value: > + Thank you for taking the time to file a bug report. + Please check that the code is pointing to the HEAD of develop + or any commit in master which is tagged with a version number. + + - type: textarea + attributes: + label: "Describe the issue:" + description: > + Describe the issue and any previous attempt to solve it. + validations: + required: true + + - type: textarea + attributes: + label: "Code example:" + description: > + If relevant, show how to reproduce the issue using a minimal working + example. + placeholder: | + << your code here >> + render: shell + validations: + required: false + + - type: textarea + attributes: + label: "Target platform:" + description: > + Give a description of the target platform (CPU, network, compiler). + Please give the full CPU part description, using for example + `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) + or `sysctl machdep.cpu.brand_string` (macOS) and the full output of + the `--version` option of your compiler. + validations: + required: true + + - type: textarea + attributes: + label: "Configure options:" + description: > + Please give the exact configure command used and attach + `config.log`, `grid.config.summary` and the output of `make V=1`. + render: shell + validations: + required: true diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h index 4bd1edd0..015e19d1 100644 --- a/Grid/DisableWarnings.h +++ b/Grid/DisableWarnings.h @@ -44,14 +44,22 @@ directory #ifdef __NVCC__ //disables nvcc specific warning in json.hpp #pragma clang diagnostic ignored "-Wdeprecated-register" + +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ + //disables nvcc specific warning in json.hpp +#pragma nv_diag_suppress unsigned_compare_with_zero +#pragma nv_diag_suppress cast_to_qualified_type + //disables nvcc specific warning in many files +#pragma nv_diag_suppress esa_on_defaulted_function_ignored +#pragma nv_diag_suppress extra_semicolon +#else + //disables nvcc specific warning in json.hpp #pragma diag_suppress unsigned_compare_with_zero #pragma diag_suppress cast_to_qualified_type - //disables nvcc specific warning in many files #pragma diag_suppress esa_on_defaulted_function_ignored #pragma diag_suppress extra_semicolon - -//Eigen only +#endif #endif // Disable vectorisation in Eigen on the Power8/9 and PowerPC diff --git a/Grid/GridCore.h b/Grid/GridCore.h index 2209f960..41c64ef6 100644 --- a/Grid/GridCore.h +++ b/Grid/GridCore.h @@ -44,9 +44,10 @@ Author: paboyle #include #include #include -#include +//#include #include #include +#include #include #include #include diff --git a/Grid/GridQCDcore.h b/Grid/GridQCDcore.h index cae6f43f..065b62cd 100644 --- a/Grid/GridQCDcore.h +++ b/Grid/GridQCDcore.h @@ -36,6 +36,7 @@ Author: paboyle #include #include #include +#include #include #include NAMESPACE_CHECK(GridQCDCore); diff --git a/Grid/GridStd.h b/Grid/GridStd.h index 28f6bc46..d0a8124a 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include
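Editor's note between patches: DisableWarnings.h above gates its diagnostic pragmas on __NVCC_DIAG_PRAGMA_SUPPORT__, and Grid_Eigen_Dense.h below repeats the same dispatch. A minimal standalone sketch of the pattern, not part of the patch, assuming (as I understand the toolchain) that nvcc defines __NVCC_DIAG_PRAGMA_SUPPORT__ once the nv_-prefixed diagnostic pragmas replaced the legacy spellings, around CUDA 11.5:

// Sketch only: suppress one nvcc diagnostic on both old and new CUDA toolchains.
#ifdef __NVCC__
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress code_is_unreachable  // newer nvcc spelling
#else
#pragma diag_suppress code_is_unreachable     // legacy nvcc spelling
#endif
#endif

diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index c62d9cdb..bdd39a65 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -14,7 +14,11 @@ /* NVCC save and restore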
compile environment*/ #ifdef __NVCC__ #pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress code_is_unreachable +#else #pragma diag_suppress code_is_unreachable +#endif #pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__NVCC__") #pragma push_macro("__CUDACC__") diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 7f27784b..9eaf89f3 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -54,6 +54,8 @@ NAMESPACE_CHECK(BiCGSTAB); #include #include #include +#include +#include #include #include #include diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 2fd187ff..7008008c 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -262,7 +262,7 @@ public: autoView( Tnp_v , (*Tnp), AcceleratorWrite); autoView( Tnm_v , (*Tnm), AcceleratorWrite); const int Nsimd = CComplex::Nsimd(); - accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { + accelerator_for(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); }); @@ -324,9 +324,9 @@ public: GridBase* _cbgrid; int hermitian; - CartesianStencil Stencil; - CartesianStencil StencilEven; - CartesianStencil StencilOdd; + CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; std::vector Aeven; @@ -358,7 +358,7 @@ public: autoView( in_v , in, AcceleratorRead); autoView( out_v , out, AcceleratorWrite); autoView( Stencil_v , Stencil, AcceleratorRead); - auto& geom_v = geom; + int npoint = geom.npoint; typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -380,7 +380,7 @@ public: int ptype; StencilEntry *SE; - for(int point=0;point Aview; Vector AcceleratorViewContainer; @@ -454,7 +454,7 @@ public: int ptype; StencilEntry *SE; - for(int p=0;p &st, CoarseMatrix &a, + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, const CoarseVector &in, CoarseVector &out, int dag) { int point = geom.npoint-1; autoView( out_v, out, AcceleratorWrite); @@ -694,7 +694,7 @@ public: } } - void DhopInternal(CartesianStencil &st, std::vector &a, + void DhopInternal(CartesianStencil &st, std::vector &a, const CoarseVector &in, CoarseVector &out, int dag) { SimpleCompressor compressor; @@ -784,9 +784,9 @@ public: _cbgrid(new GridRedBlackCartesian(&CoarseGrid)), geom(CoarseGrid._ndimension), hermitian(hermitian_), - Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0), + Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements), A(geom.npoint,&CoarseGrid), Aeven(geom.npoint,_cbgrid), Aodd(geom.npoint,_cbgrid), @@ -804,9 +804,9 @@ public: _cbgrid(&CoarseRBGrid), geom(CoarseGrid._ndimension), hermitian(hermitian_), - Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0), - StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0), + Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), + StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements), + 
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements), A(geom.npoint,&CoarseGrid), Aeven(geom.npoint,&CoarseRBGrid), Aodd(geom.npoint,&CoarseRBGrid), diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index 815226d2..5096231d 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -52,6 +52,7 @@ public: virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; virtual void HermOp(const Field &in, Field &out)=0; + virtual ~LinearOperatorBase(){}; }; @@ -507,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase { virtual void MpcDag (const Field &in, Field &out){ Mpc(in,out); } - virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { + virtual void MpcDagMpc(const Field &in, Field &out) { assert(0);// Never need with staggered } }; @@ -525,6 +526,7 @@ public: (*this)(Linop,in[k],out[k]); } }; + virtual ~OperatorFunction(){}; }; template class LinearFunction { @@ -540,6 +542,7 @@ public: (*this)(in[i], out[i]); } } + virtual ~LinearFunction(){}; }; template class IdentityLinearFunction : public LinearFunction { @@ -585,6 +588,7 @@ class HermOpOperatorFunction : public OperatorFunction { template class PlainHermOp : public LinearFunction { public: + using LinearFunction::operator(); LinearOperatorBase &_Linop; PlainHermOp(LinearOperatorBase& linop) : _Linop(linop) @@ -598,6 +602,7 @@ public: template class FunctionHermOp : public LinearFunction { public: + using LinearFunction::operator(); OperatorFunction & _poly; LinearOperatorBase &_Linop; diff --git a/Grid/algorithms/Preconditioner.h b/Grid/algorithms/Preconditioner.h index 65ac3cf9..a95dad7c 100644 --- a/Grid/algorithms/Preconditioner.h +++ b/Grid/algorithms/Preconditioner.h @@ -30,13 +30,19 @@ Author: Azusa Yamaguchi NAMESPACE_BEGIN(Grid); -template class Preconditioner : public LinearFunction { +template using Preconditioner = LinearFunction ; + +/* +template class Preconditioner : public LinearFunction { + using LinearFunction::operator(); virtual void operator()(const Field &src, Field & psi)=0; }; +*/ template class TrivialPrecon : public Preconditioner { public: - void operator()(const Field &src, Field & psi){ + using Preconditioner::operator(); + virtual void operator()(const Field &src, Field & psi){ psi = src; } TrivialPrecon(void){}; diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index 8a265b3f..b4bab20c 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -48,6 +48,7 @@ public: virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void MdirAll (const Field &in, std::vector &out)=0; + virtual ~SparseMatrixBase() {}; }; ///////////////////////////////////////////////////////////////////////////////////////////// @@ -72,7 +73,7 @@ public: virtual void MeooeDag (const Field &in, Field &out)=0; virtual void MooeeDag (const Field &in, Field &out)=0; virtual void MooeeInvDag (const Field &in, Field &out)=0; - + virtual ~CheckerBoardedSparseMatrixBase() {}; }; NAMESPACE_END(Grid); diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 584ed1d5..1d6984f3 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -258,26 +258,12 @@ public: for(int n=2;nView(); - auto Tnp_v = Tnp->View(); - auto Tnm_v = Tnm->View(); - constexpr 
int Nsimd = vector_type::Nsimd(); - accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, { - coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); - coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); - }); - if ( Coeffs[n] != 0.0) { - axpy(out,Coeffs[n],*Tnp,out); - } -#else axpby(y,xscale,mscale,y,(*Tn)); axpby(*Tnp,2.0,-1.0,y,(*Tnm)); if ( Coeffs[n] != 0.0) { axpy(out,Coeffs[n],*Tnp,out); } -#endif + // Cycle pointers to avoid copies Field *swizzle = Tnm; Tnm =Tn; diff --git a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h index d9b2a9d7..c34035a8 100644 --- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h +++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h @@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid); template::value == 2, int>::type = 0, typename std::enable_if< getPrecision::value == 1, int>::type = 0> class MixedPrecisionBiCGSTAB : public LinearFunction { - public: + public: + using LinearFunction::operator(); RealD Tolerance; RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed Integer MaxInnerIterations; diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 14f3d306..3308d8fe 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -58,6 +58,7 @@ public: void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) { + GRID_TRACE("ConjugateGradient"); psi.Checkerboard() = src.Checkerboard(); conformable(psi, src); @@ -117,9 +118,13 @@ public: GridStopWatch MatrixTimer; GridStopWatch SolverTimer; + RealD usecs = -usecond(); SolverTimer.Start(); int k; for (k = 1; k <= MaxIterations; k++) { + + GridStopWatch IterationTimer; + IterationTimer.Start(); c = cp; MatrixTimer.Start(); @@ -152,31 +157,41 @@ public: LinearCombTimer.Stop(); LinalgTimer.Stop(); - std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k + IterationTimer.Stop(); + if ( (k % 500) == 0 ) { + std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; + } else { + std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k + << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl; + } // Stopping condition if (cp <= rsq) { + usecs +=usecond(); SolverTimer.Stop(); Linop.HermOpAndNorm(psi, mmp, d, qq); p = mmp - src; - + GridBase *grid = src.Grid(); + RealD DwfFlops = (1452. )*grid->gSites()*4*k + + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra RealD srcnorm = std::sqrt(norm2(src)); RealD resnorm = std::sqrt(norm2(p)); RealD true_residual = resnorm / srcnorm; - std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << "\tComputed residual " << std::sqrt(cp / ssq) << "\tTrue residual " << true_residual << "\tTarget " << Tolerance << std::endl; - std::cout << GridLogIterative << "Time breakdown "<::value == 2, int>::type = 0, typename std::enable_if< getPrecision::value == 1, int>::type = 0> class MixedPrecisionConjugateGradient : public LinearFunction { - public: + public: + using LinearFunction::operator(); RealD Tolerance; RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed Integer MaxInnerIterations; @@ -48,6 +49,7 @@ NAMESPACE_BEGIN(Grid); Integer TotalInnerIterations; //Number of inner CG iterations Integer TotalOuterIterations; //Number of restarts Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step + RealD TrueResidual; //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess LinearFunction *guesser; @@ -67,6 +69,7 @@ NAMESPACE_BEGIN(Grid); } void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl; TotalInnerIterations = 0; GridStopWatch TotalTimer; @@ -96,6 +99,7 @@ NAMESPACE_BEGIN(Grid); FieldF sol_f(SinglePrecGrid); sol_f.Checkerboard() = cb; + std::cout< CG_f(inner_tol, MaxInnerIterations); CG_f.ErrorOnNoConverge = false; @@ -104,7 +108,10 @@ NAMESPACE_BEGIN(Grid); GridStopWatch PrecChangeTimer; Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count - + + precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid); + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ //Compute double precision rsd and also new RHS vector. Linop_d.HermOp(sol_d, tmp_d); @@ -119,7 +126,7 @@ NAMESPACE_BEGIN(Grid); while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? PrecChangeTimer.Start(); - precisionChange(src_f, src_d); + precisionChange(src_f, src_d, pc_wk_dp_to_sp); PrecChangeTimer.Stop(); sol_f = Zero(); @@ -129,6 +136,7 @@ NAMESPACE_BEGIN(Grid); (*guesser)(src_f, sol_f); //Inner CG + std::cout< CG_d(Tolerance, MaxInnerIterations); CG_d(Linop_d, src_d_in, sol_d); TotalFinalStepIterations = CG_d.IterationsToComplete; + TrueResidual = CG_d.TrueResidual; TotalTimer.Stop(); std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H +#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H + +NAMESPACE_BEGIN(Grid); + +//Mixed precision restarted defect correction CG +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class MixedPrecisionConjugateGradientBatched : public LinearFunction { +public: + using LinearFunction::operator(); + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + Integer MaxPatchupIterations; + GridBase* SinglePrecGrid; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + + //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess + LinearFunction *guesser; + bool updateResidual; + + MixedPrecisionConjugateGradientBatched(RealD tol, + Integer maxinnerit, + Integer maxouterit, + Integer maxpatchit, + GridBase* _sp_grid, + LinearOperatorBase &_Linop_f, + LinearOperatorBase &_Linop_d, + bool _updateResidual=true) : + Linop_f(_Linop_f), Linop_d(_Linop_d), + Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), + OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; + + void useGuesser(LinearFunction &g){ + guesser = &g; + } + + void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::vector srcs_d_in{src_d_in}; + std::vector sols_d{sol_d}; + + (*this)(srcs_d_in,sols_d); + + sol_d = sols_d[0]; + } + + void operator() (const std::vector &src_d_in, std::vector &sol_d){ + assert(src_d_in.size() == sol_d.size()); + int NBatch = src_d_in.size(); + + std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; + + Integer TotalOuterIterations = 0; //Number of restarts + std::vector TotalInnerIterations(NBatch,0); //Number of inner CG iterations + std::vector TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step + + GridStopWatch TotalTimer; + TotalTimer.Start(); + + GridStopWatch InnerCGtimer; + GridStopWatch PrecChangeTimer; + + int cb = src_d_in[0].Checkerboard(); + + std::vector src_norm; + std::vector norm; + std::vector stop; + + GridBase* DoublePrecGrid = src_d_in[0].Grid(); + FieldD tmp_d(DoublePrecGrid); + tmp_d.Checkerboard() = cb; + + FieldD tmp2_d(DoublePrecGrid); + tmp2_d.Checkerboard() = cb; + + std::vector src_d; + std::vector src_f; + std::vector sol_f; + + for (int i=0; i CG_f(inner_tol, MaxInnerIterations); + CG_f.ErrorOnNoConverge = false; + + Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count + + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; + + bool allConverged = true; + + for (int i=0; i OuterLoopNormMult * stop[i]) { + allConverged = false; + } + } + if (allConverged) break; + + if (updateResidual) { + RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); + RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); + while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? 
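// (Editor's annotation: the while-loop above doubles inner_tol until
// normMax * inner_tol^2 >= stopMax, i.e. it settles near
// inner_tol ~ sqrt(stopMax/normMax), as the trailing comment suggests:
// the single-precision inner solve only needs to reach the outer
// stopping criterion, so its tolerance is relaxed accordingly.)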
+ CG_f.Tolerance = inner_tol; + } + + //Optionally improve inner solver guess (eg using known eigenvectors) + if(guesser != NULL) { + (*guesser)(src_f, sol_f); + } + + for (int i=0; i CG_d(Tolerance, MaxPatchupIterations); + CG_d(Linop_d, src_d_in[i], sol_d[i]); + TotalFinalStepIterations[i] += CG_d.IterationsToComplete; + } + + TotalTimer.Stop(); + + std::cout << GridLogMessage << std::endl; + for (int i=0; i::operator(); - RealD Tolerance; + // RealD Tolerance; Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion std::vector IterationsToCompleteShift; // Iterations for this shift @@ -52,7 +52,7 @@ public: MultiShiftFunction shifts; std::vector TrueResidualShift; - ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : + ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : MaxIterations(maxit), shifts(_shifts) { @@ -84,6 +84,7 @@ public: void operator() (LinearOperatorBase &Linop, const Field &src, std::vector &psi) { + GRID_TRACE("ConjugateGradientMultiShift"); GridBase *grid = src.Grid(); @@ -182,6 +183,9 @@ public: for(int s=0;s +Author: Peter Boyle +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. +//The residual is stored in single precision, but the search directions and solution are stored in double precision. +//Every update_freq iterations the residual is corrected in double precision. +//For safety the a final regular CG is applied to clean up if necessary + +//PB Pure single, then double fixup + +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction, + public OperatorFunction +{ +public: + + using OperatorFunction::operator(); + + RealD Tolerance; + Integer MaxIterationsMshift; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. 
Filled in upon completion + std::vector IterationsToCompleteShift; // Iterations for this shift + int verbose; + MultiShiftFunction shifts; + std::vector TrueResidualShift; + + int ReliableUpdateFreq; //number of iterations between reliable updates + + GridBase* SinglePrecGrid; //Grid for single-precision fields + LinearOperatorBase &Linop_f; //single precision + + ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts, + GridBase* _SinglePrecGrid, LinearOperatorBase &_Linop_f, + int _ReliableUpdateFreq) : + MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq), + MaxIterations(20000) + { + verbose=1; + IterationsToCompleteShift.resize(_shifts.order); + TrueResidualShift.resize(_shifts.order); + } + + void operator() (LinearOperatorBase &Linop, const FieldD &src, FieldD &psi) + { + GridBase *grid = src.Grid(); + int nshift = shifts.order; + std::vector results(nshift,grid); + (*this)(Linop,src,results,psi); + } + void operator() (LinearOperatorBase &Linop, const FieldD &src, std::vector &results, FieldD &psi) + { + int nshift = shifts.order; + + (*this)(Linop,src,results); + + psi = shifts.norm*src; + for(int i=0;i &Linop_d, const FieldD &src_d, std::vector &psi_d) + { + GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup"); + GridBase *DoublePrecGrid = src_d.Grid(); + + //////////////////////////////////////////////////////////////////////// + // Convenience references to the info stored in "MultiShiftFunction" + //////////////////////////////////////////////////////////////////////// + int nshift = shifts.order; + + std::vector &mass(shifts.poles); // Make references to array in "shifts" + std::vector &mresidual(shifts.tolerances); + std::vector alpha(nshift,1.0); + + //Double precision search directions + FieldD p_d(DoublePrecGrid); + std::vector ps_f (nshift, SinglePrecGrid);// Search directions (single precision) + std::vector psi_f(nshift, SinglePrecGrid);// solutions (single precision) + + FieldD tmp_d(DoublePrecGrid); + FieldD r_d(DoublePrecGrid); + FieldF r_f(SinglePrecGrid); + FieldD mmp_d(DoublePrecGrid); + + assert(psi_d.size()==nshift); + assert(mass.size()==nshift); + assert(mresidual.size()==nshift); + + // dynamic sized arrays on stack; 2d is a pain with vector + RealD bs[nshift]; + RealD rsq[nshift]; + RealD rsqf[nshift]; + RealD z[nshift][2]; + int converged[nshift]; + + const int primary =0; + + //Primary shift fields CG iteration + RealD a,b,c,d; + RealD cp,bp,qq; //prev + + // Matrix mult fields + FieldF p_f(SinglePrecGrid); + FieldF mmp_f(SinglePrecGrid); + + // Check lightest mass + for(int s=0;s= mass[primary] ); + converged[s]=0; + } + + // Wire guess to zero + // Residuals "r" are src + // First search direction "p" is also src + cp = norm2(src_d); + + // Handle trivial case of zero src. + if( cp == 0. 
){ + for(int s=0;s= rsq[s]){ + CleanupTimer.Start(); + std::cout< Linop_shift_d(Linop_d, mass[s]); + ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop Linop_shift_f(Linop_f, mass[s]); + + MixedPrecisionConjugateGradient cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); + cg(src_d, psi_d[s]); + + TrueResidualShift[s] = cg.TrueResidual; + CleanupTimer.Stop(); + } + } + + std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"< +Author: Peter Boyle +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H +#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H + +NAMESPACE_BEGIN(Grid); + +//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision. +//The residual is stored in single precision, but the search directions and solution are stored in double precision. +//Every update_freq iterations the residual is corrected in double precision. + +//For safety the a final regular CG is applied to clean up if necessary + +//Linop to add shift to input linop, used in cleanup CG +namespace ConjugateGradientMultiShiftMixedPrecSupport{ +template +class ShiftedLinop: public LinearOperatorBase{ +public: + LinearOperatorBase &linop_base; + RealD shift; + + ShiftedLinop(LinearOperatorBase &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){} + + void OpDiag (const Field &in, Field &out){ assert(0); } + void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); } + void OpDirAll (const Field &in, std::vector &out){ assert(0); } + + void Op (const Field &in, Field &out){ assert(0); } + void AdjOp (const Field &in, Field &out){ assert(0); } + + void HermOp(const Field &in, Field &out){ + linop_base.HermOp(in, out); + axpy(out, shift, in, out); + } + + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + HermOp(in,out); + ComplexD dot = innerProduct(in,out); + n1=real(dot); + n2=norm2(out); + } +}; +}; + + +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction, + public OperatorFunction +{ +public: + + using OperatorFunction::operator(); + + RealD Tolerance; + Integer MaxIterationsMshift; + Integer MaxIterations; + Integer IterationsToComplete; //Number of iterations the CG took to finish. 
Filled in upon completion + std::vector IterationsToCompleteShift; // Iterations for this shift + int verbose; + MultiShiftFunction shifts; + std::vector TrueResidualShift; + + int ReliableUpdateFreq; //number of iterations between reliable updates + + GridBase* SinglePrecGrid; //Grid for single-precision fields + LinearOperatorBase &Linop_f; //single precision + + ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts, + GridBase* _SinglePrecGrid, LinearOperatorBase &_Linop_f, + int _ReliableUpdateFreq) : + MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq), + MaxIterations(20000) + { + verbose=1; + IterationsToCompleteShift.resize(_shifts.order); + TrueResidualShift.resize(_shifts.order); + } + + void operator() (LinearOperatorBase &Linop, const FieldD &src, FieldD &psi) + { + GridBase *grid = src.Grid(); + int nshift = shifts.order; + std::vector results(nshift,grid); + (*this)(Linop,src,results,psi); + } + void operator() (LinearOperatorBase &Linop, const FieldD &src, std::vector &results, FieldD &psi) + { + int nshift = shifts.order; + + (*this)(Linop,src,results); + + psi = shifts.norm*src; + for(int i=0;i &Linop_d, const FieldD &src_d, std::vector &psi_d) + { + GRID_TRACE("ConjugateGradientMultiShiftMixedPrec"); + GridBase *DoublePrecGrid = src_d.Grid(); + + precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid); + precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid); + + //////////////////////////////////////////////////////////////////////// + // Convenience references to the info stored in "MultiShiftFunction" + //////////////////////////////////////////////////////////////////////// + int nshift = shifts.order; + + std::vector &mass(shifts.poles); // Make references to array in "shifts" + std::vector &mresidual(shifts.tolerances); + std::vector alpha(nshift,1.0); + + //Double precision search directions + FieldD p_d(DoublePrecGrid); + std::vector ps_d(nshift, DoublePrecGrid);// Search directions (double precision) + + FieldD tmp_d(DoublePrecGrid); + FieldD r_d(DoublePrecGrid); + FieldD mmp_d(DoublePrecGrid); + + assert(psi_d.size()==nshift); + assert(mass.size()==nshift); + assert(mresidual.size()==nshift); + + // dynamic sized arrays on stack; 2d is a pain with vector + RealD bs[nshift]; + RealD rsq[nshift]; + RealD rsqf[nshift]; + RealD z[nshift][2]; + int converged[nshift]; + + const int primary =0; + + //Primary shift fields CG iteration + RealD a,b,c,d; + RealD cp,bp,qq; //prev + + // Matrix mult fields + FieldF p_f(SinglePrecGrid); + FieldF mmp_f(SinglePrecGrid); + + // Check lightest mass + for(int s=0;s= mass[primary] ); + converged[s]=0; + } + + // Wire guess to zero + // Residuals "r" are src + // First search direction "p" is also src + cp = norm2(src_d); + + // Handle trivial case of zero src. + if( cp == 0. 
){ + for(int s=0;s= rsq[s]){ + CleanupTimer.Start(); + std::cout< Linop_shift_d(Linop_d, mass[s]); + ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop Linop_shift_f(Linop_f, mass[s]); + + MixedPrecisionConjugateGradient cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d); + cg(src_d, psi_d[s]); + + TrueResidualShift[s] = cg.TrueResidual; + CleanupTimer.Stop(); + } + } + + std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"< &Linop_f; LinearOperatorBase &Linop_d; GridBase* SinglePrecGrid; - RealD Delta; //reliable update parameter + RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single LinearOperatorBase *Linop_fallback; @@ -65,7 +65,9 @@ public: ErrorOnNoConverge(err_on_no_conv), DoFinalCleanup(true), Linop_fallback(NULL) - {}; + { + assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); + }; void setFallbackLinop(LinearOperatorBase &_Linop_fallback, const RealD _fallback_transition_tol){ Linop_fallback = &_Linop_fallback; @@ -73,6 +75,7 @@ public: } void operator()(const FieldD &src, FieldD &psi) { + GRID_TRACE("ConjugateGradientReliableUpdate"); LinearOperatorBase *Linop_f_use = &Linop_f; bool using_fallback = false; @@ -115,9 +118,12 @@ public: } //Single prec initialization + precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid); + precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid()); + FieldF r_f(SinglePrecGrid); r_f.Checkerboard() = r.Checkerboard(); - precisionChange(r_f, r); + precisionChange(r_f, r, pc_wk_dp_to_sp); FieldF psi_f(r_f); psi_f = Zero(); @@ -133,7 +139,8 @@ public: GridStopWatch LinalgTimer; GridStopWatch MatrixTimer; GridStopWatch SolverTimer; - + GridStopWatch PrecChangeTimer; + SolverTimer.Start(); int k = 0; int l = 0; @@ -172,7 +179,9 @@ public: // Stopping condition if (cp <= rsq) { //Although not written in the paper, I assume that I have to add on the final solution - precisionChange(mmp, psi_f); + PrecChangeTimer.Start(); + precisionChange(mmp, psi_f, pc_wk_sp_to_dp); + PrecChangeTimer.Stop(); psi = psi + mmp; @@ -193,7 +202,10 @@ public: std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() < class ZeroGuesser: public LinearFunction { public: + using LinearFunction::operator(); virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }; }; template class DoNothingGuesser: public LinearFunction { public: + using LinearFunction::operator(); virtual void operator()(const Field &src, Field &guess) { }; }; template class SourceGuesser: public LinearFunction { public: + using LinearFunction::operator(); virtual void operator()(const Field &src, Field &guess) { guess = src; }; }; @@ -57,6 +60,7 @@ private: const unsigned int N; public: + using LinearFunction::operator(); DeflatedGuesser(const std::vector & _evec,const std::vector & _eval) : DeflatedGuesser(_evec, _eval, _evec.size()) @@ -87,6 +91,7 @@ private: const std::vector &eval_coarse; public: + using LinearFunction::operator(); LocalCoherenceDeflatedGuesser(const std::vector &_subspace, const std::vector &_evec_coarse, const std::vector &_eval_coarse) @@ -108,7 +113,43 @@ public: blockPromote(guess_coarse,guess,subspace); guess.Checkerboard() = src.Checkerboard(); }; -}; + + void operator()(const std::vector 
&src,std::vector &guess) { + int Nevec = (int)evec_coarse.size(); + int Nsrc = (int)src.size(); + // make temp variables + std::vector src_coarse(Nsrc,evec_coarse[0].Grid()); + std::vector guess_coarse(Nsrc,evec_coarse[0].Grid()); + //Preprocessing + std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl; + for (int j=0;j +Author: Yong-Chull Jang +Author: Chulwoo Jung + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_IRBL_H +#define GRID_IRBL_H + +#include //memset +#ifdef USE_LAPACK +#include +#endif + +#undef USE_LAPACK +#define Glog std::cout << GridLogMessage + +#ifdef GRID_CUDA +#include "cublas_v2.h" +#endif + +#if 0 +#define CUDA_COMPLEX cuDoubleComplex +#define CUDA_FLOAT double +#define MAKE_CUDA_COMPLEX make_cuDoubleComplex +#define CUDA_GEMM cublasZgemm +#else +#define CUDA_COMPLEX cuComplex +#define CUDA_FLOAT float +#define MAKE_CUDA_COMPLEX make_cuComplex +#define CUDA_GEMM cublasCgemm +#endif + +namespace Grid { + +//////////////////////////////////////////////////////////////////////////////// +// Helper class for sorting the eigenvalues AND eigenvectors by Field +// Use pointer swizzle on vectors SHOULD GET RID OF IT SOON! +//////////////////////////////////////////////////////////////////////////////// +template +class SortEigen { + private: + static bool less_lmd(RealD left,RealD right){ + return left > right; + } + static bool less_pair(std::pair& left, + std::pair& right){ + return left.first > (right.first); + } + + public: + void push(std::vector& lmd,std::vector& evec,int N) { + + //////////////////////////////////////////////////////////////////////// + // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set.
+ // : The vector reorder should be done by pointer swizzle somehow + //////////////////////////////////////////////////////////////////////// + std::vector cpy(lmd.size(),evec[0].Grid()); + for(int i=0;i > emod(lmd.size()); + + for(int i=0;i(lmd[i],&cpy[i]); + + partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair); + + typename std::vector >::iterator it = emod.begin(); + for(int i=0;ifirst; + evec[i]=*(it->second); + ++it; + } + } + void push(std::vector& lmd,int N) { + std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd); + } + bool saturated(RealD lmd, RealD thrs) { + return fabs(lmd) > fabs(thrs); + } +}; + +enum class LanczosType { irbl, rbl }; + +enum IRBLdiagonalisation { + IRBLdiagonaliseWithDSTEGR, + IRBLdiagonaliseWithQR, + IRBLdiagonaliseWithEigen +}; + +///////////////////////////////////////////////////////////// +// Implicitly restarted block lanczos +///////////////////////////////////////////////////////////// +template +class ImplicitlyRestartedBlockLanczos { + +private: + + std::string cname = std::string("ImplicitlyRestartedBlockLanczos"); + int MaxIter; // Max iterations + int Nstop; // Number of evecs checked for convergence + int Nu; // Number of vecs in the unit block + int Nk; // Number of converged sought + int Nm; // total number of vectors + int Nblock_k; // Nk/Nu + int Nblock_m; // Nm/Nu + int Nconv_test_interval; // Number of skipped vectors when checking a convergence + RealD eresid; + IRBLdiagonalisation diagonalisation; + //////////////////////////////////// + // Embedded objects + //////////////////////////////////// + SortEigen _sort; + LinearOperatorBase &_Linop; + LinearOperatorBase &_SLinop;//for split + OperatorFunction &_poly; + GridRedBlackCartesian * f_grid; + GridRedBlackCartesian * sf_grid; + int mrhs; + ///////////////////////// + // BLAS objects + ///////////////////////// +#ifdef GRID_CUDA + cudaError_t cudaStat; + CUDA_COMPLEX *w_acc, *evec_acc, *c_acc; +#endif + int Nevec_acc; // Number of eigenvectors stored in the buffer evec_acc + + ///////////////////////// + // Constructor + ///////////////////////// +public: + int split_test; //test split in the first iteration + ImplicitlyRestartedBlockLanczos(LinearOperatorBase &Linop, // op + LinearOperatorBase &SLinop, // op + GridRedBlackCartesian * FrbGrid, + GridRedBlackCartesian * SFrbGrid, + int _mrhs, + OperatorFunction & poly, // polynomial + int _Nstop, // really sought vecs + int _Nconv_test_interval, // conv check interval + int _Nu, // vecs in the unit block + int _Nk, // sought vecs + int _Nm, // total vecs + RealD _eresid, // resid in lmd deficit + int _MaxIter, // Max iterations + IRBLdiagonalisation _diagonalisation = IRBLdiagonaliseWithEigen) + : _Linop(Linop), _SLinop(SLinop), _poly(poly),sf_grid(SFrbGrid),f_grid(FrbGrid), + Nstop(_Nstop), Nconv_test_interval(_Nconv_test_interval), mrhs(_mrhs), + Nu(_Nu), Nk(_Nk), Nm(_Nm), + Nblock_m(_Nm/_Nu), Nblock_k(_Nk/_Nu), + //eresid(_eresid), MaxIter(10), + eresid(_eresid), MaxIter(_MaxIter), + diagonalisation(_diagonalisation),split_test(0), + Nevec_acc(_Nu) + { assert( (Nk%Nu==0) && (Nm%Nu==0) ); }; + + //////////////////////////////// + // Helpers + //////////////////////////////// + static RealD normalize(Field& v, int if_print=0) + { + RealD nn = norm2(v); + nn = sqrt(nn); + v = v * (1.0/nn); + return nn; + } + + void orthogonalize(Field& w, std::vector& evec, int k, int if_print=0) + { + typedef typename Field::scalar_type MyComplex; +// MyComplex ip; + ComplexD ip; + + for(int j=0; j 1e-14) + Glog<<"orthogonalize 
before: "< 1e-14) + Glog<<"orthogonalize after: "<& evec, int k) + { + orthogonalize(w, evec, k,1); + } + + void orthogonalize(std::vector& w, int _Nu, std::vector& evec, int k, int if_print=0) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; +// ComplexD ip; + + for(int j=0; j& w, std::vector& evec, int R, int do_print=0) + { +#ifdef GRID_CUDA + Glog << "cuBLAS orthogonalize" << std::endl; + + typedef typename Field::vector_object vobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + typedef typename Field::scalar_type MyComplex; + + GridBase *grid = w[0].Grid(); + const uint64_t sites = grid->lSites(); + + int Nbatch = R/Nevec_acc; + assert( R%Nevec_acc == 0 ); +// Glog << "nBatch, Nevec_acc, R, Nu = " +// << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl; + + for (int col=0; col(&w_v[0]); +// Glog << "col= "<(&evec_v[0]); +// Glog << "col= "<& evec, int k, int Nu) + { + typedef typename Field::scalar_type MyComplex; + MyComplex ip; + + for(int j=0; j& eval, + std::vector& evec, + const std::vector& src, int& Nconv, LanczosType Impl) + { +#ifdef GRID_CUDA + GridBase *grid = src[0].Grid(); + grid->show_decomposition(); + +// printf("GRID_CUDA\n"); + + // set eigenvector buffers for the cuBLAS calls + //const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites = grid->lSites(); + + cudaStat = cudaMallocManaged((void **)&w_acc, Nu*sites*12*sizeof(CUDA_COMPLEX)); +// Glog << "w_acc= "<& eval, + std::vector& evec, + const std::vector& src, int& Nconv) + { + std::string fname = std::string(cname+"::calc_irbl()"); + GridBase *grid = evec[0].Grid(); + assert(grid == src[0].Grid()); + assert( Nu = src.size() ); + + Glog << std::string(74,'*') << std::endl; + Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; + Glog << std::string(74,'*') << std::endl; + Glog <<" -- seek Nk = "<< Nk <<" vectors"<< std::endl; + Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl; + Glog <<" -- total Nm = "<< Nm <<" vectors"<< std::endl; + Glog <<" -- size of eval = "<< eval.size() << std::endl; + Glog <<" -- size of evec = "<< evec.size() << std::endl; + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + Glog << "Diagonalisation is Eigen "<< std::endl; +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + Glog << "Diagonalisation is LAPACK "<< std::endl; +#endif + } else { + abort(); + } + Glog << std::string(74,'*') << std::endl; + + assert(Nm == evec.size() && Nm == eval.size()); + + std::vector> lmd(Nu,std::vector(Nm,0.0)); + std::vector> lme(Nu,std::vector(Nm,0.0)); + std::vector> lmd2(Nu,std::vector(Nm,0.0)); + std::vector> lme2(Nu,std::vector(Nm,0.0)); + std::vector eval2(Nm); + std::vector resid(Nk); + + Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm); + Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); + + std::vector Iconv(Nm); + std::vector B(Nm,grid); // waste of space replicating + + std::vector f(Nu,grid); + std::vector f_copy(Nu,grid); + Field v(grid); + + Nconv = 0; + + RealD beta_k; + + // set initial vector + for (int i=0; i& eval, + std::vector& evec, + const std::vector& src, int& Nconv) + { + std::string fname = std::string(cname+"::calc_rbl()"); + GridBase *grid = evec[0].Grid(); + assert(grid == src[0].Grid()); + assert( Nu = src.size() ); + + int Np = (Nm-Nk); + if (Np > 0 && MaxIter > 1) Np /= MaxIter; + int Nblock_p = Np/Nu; + for(int i=0;i< evec.size();i++) evec[0].Advise()=AdviseInfrequentUse; + + Glog << std::string(74,'*') << 
std::endl; + Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; + Glog << std::string(74,'*') << std::endl; + Glog <<" -- seek (min) Nk = "<< Nk <<" vectors"<< std::endl; + Glog <<" -- seek (inc) Np = "<< Np <<" vectors"<< std::endl; + Glog <<" -- seek (max) Nm = "<< Nm <<" vectors"<< std::endl; + Glog <<" -- accept Nstop = "<< Nstop <<" vectors"<< std::endl; + Glog <<" -- size of eval = "<< eval.size() << std::endl; + Glog <<" -- size of evec = "<< evec.size() << std::endl; + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + Glog << "Diagonalisation is Eigen "<< std::endl; +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + Glog << "Diagonalisation is LAPACK "<< std::endl; +#endif + } else { + abort(); + } + Glog << std::string(74,'*') << std::endl; + + assert(Nm == evec.size() && Nm == eval.size()); + + std::vector> lmd(Nu,std::vector(Nm,0.0)); + std::vector> lme(Nu,std::vector(Nm,0.0)); + std::vector> lmd2(Nu,std::vector(Nm,0.0)); + std::vector> lme2(Nu,std::vector(Nm,0.0)); + std::vector eval2(Nk); + std::vector resid(Nm); + + Eigen::MatrixXcd Qt = Eigen::MatrixXcd::Zero(Nm,Nm); + Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); + + std::vector Iconv(Nm); +// int Ntest=Nu; +// std::vector B(Nm,grid); // waste of space replicating + std::vector B(1,grid); // waste of space replicating + + std::vector f(Nu,grid); + std::vector f_copy(Nu,grid); + Field v(grid); + + Nconv = 0; + +// RealD beta_k; + + // set initial vector + for (int i=0; i Btmp(Nstop,grid); // waste of space replicating + + for(int i=0; i>& lmd, + std::vector>& lme, + std::vector& evec, + std::vector& w, + std::vector& w_copy, + int b) + { + const RealD tiny = 1.0e-20; + + int Nu = w.size(); + int Nm = evec.size(); + assert( b < Nm/Nu ); +// GridCartesian *grid = evec[0]._grid; + + // converts block index to full indicies for an interval [L,R) + int L = Nu*b; + int R = Nu*(b+1); + + Real beta; + + Glog << "Using split grid"<< std::endl; +// LatticeGaugeField s_Umu(SGrid); + assert((Nu%mrhs)==0); + std::vector in(mrhs,f_grid); + + Field s_in(sf_grid); + Field s_out(sf_grid); + // unnecessary copy. Can or should it be avoided? 
+ int k_start = 0; +while ( k_start < Nu) { + Glog << "k_start= "<0) { + for (int u=0; u& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, // Nm x Nm + GridBase *grid) + { + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); + + for ( int u=0; u eigensolver(BlockTriDiag); + + for (int i = 0; i < Nk; i++) { + eval[Nk-1-i] = eigensolver.eigenvalues()(i); + } + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { + Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i); + //Qt(Nk-1-i,j) = eigensolver.eigenvectors()(i,j); + //Qt(i,j) = eigensolver.eigenvectors()(i,j); + } + } + } + +#ifdef USE_LAPACK + void diagonalize_lapack(std::vector& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, // Nm x Nm + GridBase *grid) + { + Glog << "diagonalize_lapack: Nu= "<_Nprocessors; + int node = grid->_processor; + int interval = (NN/total)+1; + double vl = 0.0, vu = 0.0; + MKL_INT il = interval*node+1 , iu = interval*(node+1); + if (iu > NN) iu=NN; + Glog << "node "<= il-1; i--){ + evals_tmp[i] = evals_tmp[i - (il-1)]; + if (il>1) evals_tmp[i-(il-1)]=0.; + for (int j = 0; j< NN; j++){ + evec_tmp[i*NN+j] = evec_tmp[(i - (il-1))*NN+j]; + if (il>1) { + evec_tmp[(i-(il-1))*NN+j].imag=0.; + evec_tmp[(i-(il-1))*NN+j].real=0.; + } + } + } + } + { + grid->GlobalSumVector(evals_tmp,NN); + grid->GlobalSumVector((double*)evec_tmp,2*NN*NN); + } + } + for (int i = 0; i < Nk; i++) + eval[Nk-1-i] = evals_tmp[i]; + for (int i = 0; i < Nk; i++) { + for (int j = 0; j < Nk; j++) { +// Qt(j,Nk-1-i) = eigensolver.eigenvectors()(j,i); + Qt(j,Nk-1-i)=std::complex + ( evec_tmp[i*Nk+j].real, + evec_tmp[i*Nk+j].imag); +// ( evec_tmp[(Nk-1-j)*Nk+Nk-1-i].real, +// evec_tmp[(Nk-1-j)*Nk+Nk-1-i].imag); + + } + } + +if (1){ + Eigen::SelfAdjointEigenSolver eigensolver(BlockTriDiag); + + for (int i = 0; i < Nk; i++) { + Glog << "eval = "<& eval, + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nk, int Nm, + Eigen::MatrixXcd & Qt, + GridBase *grid) + { + Qt = Eigen::MatrixXcd::Identity(Nm,Nm); + if ( diagonalisation == IRBLdiagonaliseWithEigen ) { + diagonalize_Eigen(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); +#ifdef USE_LAPACK + } else if ( diagonalisation == IRBLdiagonaliseWithLAPACK ) { + diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); +#endif + } else { + assert(0); + } + } + + + void unpackHermitBlockTriDiagMatToEigen( + std::vector>& lmd, + std::vector>& lme, + int Nu, int Nb, int Nk, int Nm, + Eigen::MatrixXcd& M) + { + //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + M = Eigen::MatrixXcd::Zero(Nk,Nk); + + // rearrange + for ( int u=0; u>& lmd, + std::vector>& lme, + int Nu, int Nb, int Nk, int Nm, + Eigen::MatrixXcd& M) + { + //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; + assert( Nk%Nu == 0 && Nm%Nu == 0 ); + assert( Nk <= Nm ); + + // rearrange + for ( int u=0; u QRD(Mtmp); + Q = QRD.householderQ(); + R = QRD.matrixQR(); // upper triangular part is the R matrix. + // lower triangular part used to represent series + // of Q sequence. 
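// (Editor's annotation: Eigen's HouseholderQR stores R in the upper
// triangle of matrixQR() and the essential parts of the Householder
// reflectors below the diagonal; householderQ() reconstitutes the
// unitary Q from that compact representation.)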
+ + // equivalent operation of Qprod *= Q + //M = Eigen::MatrixXcd::Zero(Nm,Nm); + + //for (int i=0; i Nm) kmax = Nm; + for (int k=i; ki) M(i,j) = conj(M(j,i)); + // if (i-j > Nu || j-i > Nu) M(i,j) = 0.; + // } + //} + + //Glog << "shiftedQRDecompEigen() end" << endl; + } + + void exampleQRDecompEigen(void) + { + Eigen::MatrixXd A = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd Q = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd R = Eigen::MatrixXd::Zero(3,3); + Eigen::MatrixXd P = Eigen::MatrixXd::Zero(3,3); + + A(0,0) = 12.0; + A(0,1) = -51.0; + A(0,2) = 4.0; + A(1,0) = 6.0; + A(1,1) = 167.0; + A(1,2) = -68.0; + A(2,0) = -4.0; + A(2,1) = 24.0; + A(2,2) = -41.0; + + Glog << "matrix A before ColPivHouseholder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Eigen::ColPivHouseholderQR QRD(A); + + Glog << "matrix A after ColPivHouseholder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence length = nonzeroPivots" << std::endl; + Q = QRD.householderQ().setLength(QRD.nonzeroPivots()); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence length = 1" << std::endl; + Q = QRD.householderQ().setLength(1); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "HouseholderQ with sequence length = 2" << std::endl; + Q = QRD.householderQ().setLength(2); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "matrixR" << std::endl; + R = QRD.matrixR(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "R[" << i << "," << j << "] = " << R(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "rank = " << QRD.rank() << std::endl; + Glog << "threshold = " << QRD.threshold() << std::endl; + + Glog << "matrixP" << std::endl; + P = QRD.colsPermutation(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "P[" << i << "," << j << "] = " << P(i,j) << '\n'; + } + } + Glog << std::endl; + + + Glog << "QR decomposition without column pivoting" << std::endl; + + A(0,0) = 12.0; + A(0,1) = -51.0; + A(0,2) = 4.0; + A(1,0) = 6.0; + A(1,1) = 167.0; + A(1,2) = -68.0; + A(2,0) = -4.0; + A(2,1) = 24.0; + A(2,2) = -41.0; + + Glog << "matrix A before Householder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + + Eigen::HouseholderQR QRDplain(A); + + Glog << "HouseholderQ" << std::endl; + Q = QRDplain.householderQ(); + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "Q[" << i << "," << j << "] = " << Q(i,j) << '\n'; + } + } + Glog << std::endl; + + Glog << "matrix A after Householder" << std::endl; + for ( int i=0; i<3; i++ ) { + for ( int j=0; j<3; j++ ) { + Glog << "A[" << i << "," << j << "] = " << A(i,j) << '\n'; + } + } + Glog << std::endl; + } + + }; +} +#undef Glog +#undef USE_LAPACK +#undef CUDA_COMPLEX +#undef CUDA_FLOAT +#undef MAKE_CUDA_COMPLEX +#undef CUDA_GEMM +#endif
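Editor's note between patches: the orthogonalize() helpers in ImplicitlyRestartedBlockLanczos.h above perform classical Gram-Schmidt against the stored eigenvectors. A minimal self-contained sketch of that step, with plain std::vector standing in for Grid's Field and my own hypothetical names (dot, gramSchmidtStep); illustrative only, not the library's API:

#include <cmath>
#include <complex>
#include <vector>

using Vec = std::vector<std::complex<double>>;

// <a,b> with conjugation on the left argument, as innerProduct does for Fields.
static std::complex<double> dot(const Vec &a, const Vec &b) {
  std::complex<double> s{0.0, 0.0};
  for (size_t i = 0; i < a.size(); i++) s += std::conj(a[i]) * b[i];
  return s;
}

// Project w orthogonal to the first k (orthonormal) vectors, then normalise:
// the same classical Gram-Schmidt sweep orthogonalize() performs on Fields.
static void gramSchmidtStep(Vec &w, const std::vector<Vec> &evec, int k) {
  for (int j = 0; j < k; j++) {
    std::complex<double> ip = dot(evec[j], w);  // <e_j, w>
    for (size_t i = 0; i < w.size(); i++) w[i] -= ip * evec[j][i];
  }
  double nn = std::sqrt(std::real(dot(w, w)));
  for (auto &x : w) x *= (1.0 / nn);
}

diff --git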
a/Grid/algorithms/iterative/LocalCoherenceLanczos.h b/Grid/algorithms/iterative/LocalCoherenceLanczos.h index 9c945565..344a785a 100644 --- a/Grid/algorithms/iterative/LocalCoherenceLanczos.h +++ b/Grid/algorithms/iterative/LocalCoherenceLanczos.h @@ -44,6 +44,7 @@ public: int, MinRes); // Must restart }; +//This class is the input parameter class for some testing programs struct LocalCoherenceLanczosParams : Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, @@ -67,6 +68,7 @@ public: template class ProjectedHermOp : public LinearFunction > > { public: + using LinearFunction > >::operator(); typedef iVector CoarseSiteVector; typedef Lattice CoarseField; typedef Lattice CoarseScalar; // used for inner products on fine field @@ -97,6 +99,7 @@ public: template class ProjectedFunctionHermOp : public LinearFunction > > { public: + using LinearFunction > >::operator(); typedef iVector CoarseSiteVector; typedef Lattice CoarseField; typedef Lattice CoarseScalar; // used for inner products on fine field @@ -143,16 +146,24 @@ public: LinearOperatorBase &_Linop; RealD _coarse_relax_tol; std::vector &_subspace; + + int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator + //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult + //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these + //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1) + //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, OperatorFunction &smoother, LinearOperatorBase &Linop, std::vector &subspace, - RealD coarse_relax_tol=5.0e3) + RealD coarse_relax_tol=5.0e3, + int largestEvalIdxForReport=-1) : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace), - _coarse_relax_tol(coarse_relax_tol) + _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport) { }; + //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection) int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) { CoarseField v(B); @@ -175,12 +186,26 @@ <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv < nbasis ) eresid = eresid*_coarse_relax_tol; + std::cout.precision(13); std::cout< nbasis ) eresid = eresid*_coarse_relax_tol; if( (vv on the coarse grid. This function orthonormalizes the fine-grid subspace + //vectors under the block inner product. This step must be performed after computing the fine grid + //eigenvectors and before computing the coarse grid + //eigenvectors.
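// (Editor's annotation: the two "Gramm-Schmidt" passes below are the usual
// remedy for the loss of orthogonality classical Gram-Schmidt suffers in
// finite precision; a second sweep restores orthonormality to working
// accuracy.)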
void Orthogonalise(void ) { CoarseScalar InnerProd(_CoarseGrid); std::cout << GridLogMessage <<" Gram-Schmidt pass 1"< Cheby(cheby_op); - ProjectedHermOp Op(_FineOp,subspace); - ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,subspace); + Chebyshev Cheby(cheby_op); //Chebyshev of fine operator on fine grid + ProjectedHermOp Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion + ProjectedFunctionHermOp ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion ////////////////////////////////////////////////////////////////////////////////////////////////// // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL ////////////////////////////////////////////////////////////////////////////////////////////////// - Chebyshev ChebySmooth(cheby_smooth); - ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); + Chebyshev ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors + ImplicitlyRestartedLanczosSmoothedTester ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1); evals_coarse.resize(Nm); evec_coarse.resize(Nm,_CoarseGrid); CoarseField src(_CoarseGrid); src=1.0; + //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array ImplicitlyRestartedLanczos IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); int Nconv=0; IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); @@ -403,6 +440,14 @@ public: std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl; } } + + //Get the fine eigenvector 'i' by reconstruction + void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{ + blockPromote(evec_coarse[i],evec,subspace); + eval = evals_coarse[i]; + } + + }; NAMESPACE_END(Grid); diff --git a/Grid/algorithms/iterative/PowerMethod.h b/Grid/algorithms/iterative/PowerMethod.h index 6aa8e923..027ea68c 100644 --- a/Grid/algorithms/iterative/PowerMethod.h +++ b/Grid/algorithms/iterative/PowerMethod.h @@ -29,6 +29,8 @@ template class PowerMethod RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. 
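      // Rayleigh quotient na = <v, H v> / <v, v> of the current iterate: as i grows,
      // v = H^i src aligns with the dominant eigenvector, so na approaches the largest eigenvalue.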
RealD vden = norm2(src_n); RealD na = vnum/vden; + + std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl; if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { evalMaxApprox = na; diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h index bf454ade..feceb21f 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h @@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid); template class PrecGeneralisedConjugateResidual : public LinearFunction { public: - + using LinearFunction::operator(); RealD Tolerance; Integer MaxIterations; int verbose; diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h index 22b7725e..181df320 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h @@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid); template class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction { public: - + using LinearFunction::operator(); RealD Tolerance; Integer MaxIterations; int verbose; @@ -119,7 +119,8 @@ public: RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD cp; - ComplexD a, b, zAz; + ComplexD a, b; + // ComplexD zAz; RealD zAAz; ComplexD rq; @@ -146,7 +147,7 @@ public: ////////////////////////////////// MatTimer.Start(); Linop.Op(psi,Az); - zAz = innerProduct(Az,psi); + // zAz = innerProduct(Az,psi); zAAz= norm2(Az); MatTimer.Stop(); @@ -170,7 +171,7 @@ public: LinalgTimer.Start(); - zAz = innerProduct(Az,psi); + // zAz = innerProduct(Az,psi); zAAz= norm2(Az); //p[0],q[0],qq[0] @@ -212,7 +213,7 @@ public: MatTimer.Start(); Linop.Op(z,Az); MatTimer.Stop(); - zAz = innerProduct(Az,psi); + // zAz = innerProduct(Az,psi); zAAz= norm2(Az); LinalgTimer.Start(); diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index 30be510b..a9e5c9b4 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid); /*Allocation types, saying which pointer cache should be used*/ #define Cpu (0) -#define CpuSmall (1) -#define Acc (2) -#define AccSmall (3) -#define Shared (4) -#define SharedSmall (5) +#define CpuHuge (1) +#define CpuSmall (2) +#define Acc (3) +#define AccHuge (4) +#define AccSmall (5) +#define Shared (6) +#define SharedHuge (7) +#define SharedSmall (8) #undef GRID_MM_VERBOSE uint64_t total_shared; uint64_t total_device; @@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void) } +uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } +uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; } + ////////////////////////////////////////////////////////////////////// // Data tables for recently freed pointer caches ////////////////////////////////////////////////////////////////////// MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; int MemoryManager::Victim[MemoryManager::NallocType]; -int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 }; +int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 }; uint64_t 
MemoryManager::CacheBytes[MemoryManager::NallocType]; ////////////////////////////////////////////////////////////////////// // Actual allocation and deallocation utils @@ -159,7 +165,6 @@ void MemoryManager::Init(void) char * str; int Nc; - int NcS; str= getenv("GRID_ALLOC_NCACHE_LARGE"); if ( str ) { @@ -171,6 +176,16 @@ void MemoryManager::Init(void) } } + str= getenv("GRID_ALLOC_NCACHE_HUGE"); + if ( str ) { + Nc = atoi(str); + if ( (Nc>=0) && (Nc < NallocCacheMax)) { + Ncache[CpuHuge]=Nc; + Ncache[AccHuge]=Nc; + Ncache[SharedHuge]=Nc; + } + } + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) { Nc = atoi(str); @@ -191,7 +206,9 @@ void MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]); #else return ptr; @@ -233,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif + if (ncache == 0) return ptr; + void * ret = NULL; int v = -1; @@ -272,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries void *MemoryManager::Lookup(size_t bytes,int type) { #ifdef ALLOCATION_CACHE - bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); - int cache = type+small; + int cache; + if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; + else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); #else return NULL; @@ -282,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index eafcd83f..0dc78f04 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -35,6 +35,12 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? 
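// The pool selection used by Insert()/Lookup() above, restated as a standalone
// sketch (illustrative only, not part of the patch): each base type (Cpu/Acc/Shared)
// owns three pools, with Huge at +1 and Small at +2 of the base index, and the
// huge pools default to depth 0 (disabled) until GRID_ALLOC_NCACHE_HUGE enables them.
//   static int CacheBucket(size_t bytes,int type) {
//     if (bytes <  GRID_ALLOC_SMALL_LIMIT) return type + 2; // e.g. Acc -> AccSmall
//     if (bytes >= GRID_ALLOC_HUGE_LIMIT ) return type + 1; // e.g. Acc -> AccHuge
//     return type;                                          // default pool
//   }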
#define GRID_ALLOC_SMALL_LIMIT (4096) +#define GRID_ALLOC_HUGE_LIMIT (2147483648) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__) +#define AUDIT(a) MemoryManager::Audit(FILE_LINE) /*Pinning pages is costly*/ //////////////////////////////////////////////////////////////////////////// @@ -65,6 +71,21 @@ enum ViewMode { CpuWriteDiscard = 0x10 // same for now }; +struct MemoryStatus { + uint64_t DeviceBytes; + uint64_t DeviceLRUBytes; + uint64_t DeviceMaxBytes; + uint64_t HostToDeviceBytes; + uint64_t DeviceToHostBytes; + uint64_t HostToDeviceXfer; + uint64_t DeviceToHostXfer; + uint64_t DeviceEvictions; + uint64_t DeviceDestroy; + uint64_t DeviceAllocCacheBytes; + uint64_t HostAllocCacheBytes; +}; + + class MemoryManager { private: @@ -78,7 +99,7 @@ private: } AllocationCacheEntry; static const int NallocCacheMax=128; - static const int NallocType=6; + static const int NallocType=9; static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; static int Victim[NallocType]; static int Ncache[NallocType]; @@ -92,8 +113,9 @@ private: static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ; - static void PrintBytes(void); public: + static void PrintBytes(void); + static void Audit(std::string s); static void Init(void); static void InitMessage(void); static void *AcceleratorAllocate(size_t bytes); @@ -113,7 +135,28 @@ private: static uint64_t DeviceToHostBytes; static uint64_t HostToDeviceXfer; static uint64_t DeviceToHostXfer; - + static uint64_t DeviceEvictions; + static uint64_t DeviceDestroy; + + static uint64_t DeviceCacheBytes(); + static uint64_t HostCacheBytes(); + + static MemoryStatus GetFootprint(void) { + MemoryStatus stat; + stat.DeviceBytes = DeviceBytes; + stat.DeviceLRUBytes = DeviceLRUBytes; + stat.DeviceMaxBytes = DeviceMaxBytes; + stat.HostToDeviceBytes = HostToDeviceBytes; + stat.DeviceToHostBytes = DeviceToHostBytes; + stat.HostToDeviceXfer = HostToDeviceXfer; + stat.DeviceToHostXfer = DeviceToHostXfer; + stat.DeviceEvictions = DeviceEvictions; + stat.DeviceDestroy = DeviceDestroy; + stat.DeviceAllocCacheBytes = DeviceCacheBytes(); + stat.HostAllocCacheBytes = HostCacheBytes(); + return stat; + }; + private: #ifndef GRID_UVM ////////////////////////////////////////////////////////////////////// @@ -170,6 +213,8 @@ private: public: static void Print(void); + static void PrintAll(void); + static void PrintState( void* CpuPtr); static int isOpen (void* CpuPtr); static void ViewClose(void* CpuPtr,ViewMode mode); static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 72111dbd..e758ac2f 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -3,8 +3,13 @@ #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); -//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); -#define dprintf(...) + +#define MAXLINE 512 +static char print_buffer [ MAXLINE ]; + +#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +//#define dprintf(...) 
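// Note: as defined, mprintf/dprintf expand to two statements, so an unbraced
// "if (cond) dprintf(...);" would stream the buffer unconditionally. The usual
// guard is a do-while(0) wrapper (sketch, not part of the patch):
//   #define dprintf(...) do { snprintf(print_buffer,MAXLINE,__VA_ARGS__); \
//                             std::cout << GridLogMemory << print_buffer; } while(0)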
//////////////////////////////////////////////////////////// @@ -23,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; //////////////////////////////////// // Priority ordering for unlocked entries @@ -104,15 +111,17 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceDestroy++; DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + AccCache.AccPtr=(uint64_t) NULL; } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -121,26 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::Evict(AcceleratorViewEntry &AccCache) { /////////////////////////////////////////////////////////////////////////// - // Make CPU consistent, remove from Accelerator, remove entry - // Cannot be locked. If allocated must be in LRU pool. + // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry + // Cannot be accLocked. If allocated must be in LRU pool. + // + // Nov 2022... Felix issue: allocating two CpuPtrs can leave an entry in the LRU-q with cpuLock set, + // and require eviction of the AccPtr copy. Eviction was a mistake in CpuViewOpen, + // but there is a weakness where cpuLocked entries are attempted for erase. + // Take these OUT of the LRU queue when CPU locked? + // Cannot take them out of the table, as the cpuLock data is important. 
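  // State sketch for an unlocked entry (as implemented below):
  //   AccDirty --Flush--> Consistent --free AccPtr--> CpuDirty (CPU copy is primary);
  // entries with cpuLock or accLock set are now skipped rather than asserted on.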
/////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); - assert(AccCache.accLock==0); - assert(AccCache.cpuLock==0); + mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", + (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, + (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); + if (AccCache.accLock!=0) return; + if (AccCache.cpuLock!=0) return; if(AccCache.state==AccDirty) { Flush(AccCache); } - assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); - DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + AccCache.state=CpuDirty; // CPU primary now + DeviceBytes -=AccCache.bytes; + dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + AccCache.AccPtr=(uint64_t)NULL; } - uint64_t CpuPtr = AccCache.CpuPtr; + // uint64_t CpuPtr = AccCache.CpuPtr; + DeviceEvictions++; + // EntryErase(CpuPtr); } void MemoryManager::Flush(AcceleratorViewEntry &AccCache) { @@ -150,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -165,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -191,6 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::ViewClose(void* Ptr,ViewMode mode) { if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr); } else if( (mode==CpuRead)||(mode==CpuWrite)){ CpuViewClose((uint64_t)Ptr); @@ -202,6 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis { uint64_t CpuPtr = (uint64_t)_CpuPtr; if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); } else if( (mode==CpuRead)||(mode==CpuWrite)){ return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); @@ -212,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis } void MemoryManager::EvictVictims(uint64_t bytes) { + assert(bytes<DeviceMaxBytes); while(bytes+DeviceLRUBytes > DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ assert(LRU.size()>0); - uint64_t victim = LRU.back(); + uint64_t victim = LRU.back(); // From the LRU auto 
AccCacheIterator = EntryLookup(victim); auto & AccCache = AccCacheIterator->second; Evict(AccCache); + } else { + return; } } } @@ -241,11 +265,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { - dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", (uint64_t)AccCache.CpuPtr, (uint64_t)CpuPtr, (uint64_t)AccCache.bytes, - (uint64_t)bytes); + (uint64_t)bytes, + (uint64_t)AccCache.accLock); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -280,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; + dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -292,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); } else { assert(0); } - // If view is opened on device remove from LRU + assert(AccCache.accLock>0); + // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU + dprintf("AccCache entry removed from LRU \n"); LRUremove(AccCache); } @@ -334,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) assert(AccCache.accLock>0); AccCache.accLock--; - // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { + dprintf("AcceleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); + } else { + dprintf("AcceleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t CpuPtr) @@ -374,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - if (!AccCache.AccPtr) { - EvictVictims(bytes); - } + // CPU doesn't need to free space + // if (!AccCache.AccPtr) { + // EvictVictims(bytes); + // } assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error @@ -430,20 +461,28 @@ void MemoryManager::NotifyDeletion(void *_ptr) void MemoryManager::Print(void) { PrintBytes(); - std::cout 
<< GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "Memory Manager " << std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; - std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; - std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; - std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; - std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; - std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; @@ -453,13 +492,13 @@ void MemoryManager::Print(void) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - std::cout << GridLogDebug << "0x"<second; + LruBytes2+=AccCache.bytes; + assert(AccCache.LRU_valid==1); + assert(AccCache.LRU_entry==it); + } + std::cout << " Memory Manager::Audit() LRU queue matches table entries "<second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + CpuBytes+=AccCache.bytes; + if( AccCache.AccPtr ) AccBytes+=AccCache.bytes; + if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes; + if( AccCache.LRU_valid ) LruCnt++; + + if ( AccCache.cpuLock || AccCache.accLock ) { + assert(AccCache.LRU_valid==0); + + std::cout << GridLogError << s<< "\n\t 0x"<second; + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + if ( AccCache.state==EvictNext) str = std::string("EvictNext"); + + std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<Device memory movement not currently managed by Grid." << std::endl; +}; void MemoryManager::Print(void){}; +void MemoryManager::PrintAll(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; NAMESPACE_END(Grid); diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index ffcfe37a..b98424a1 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -53,10 +53,11 @@ public: // Communicator should know nothing of the physics grid, only processor grid. //////////////////////////////////////////// int _Nprocessors; // How many in all - Coordinate _processors; // Which dimensions get relayed out over processors lanes. int _processor; // linear processor rank - Coordinate _processor_coor; // linear processor coordinate unsigned long _ndimension; + Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes. 
+ Coordinate _processors; // Which dimensions get relayed out over processors lanes. + Coordinate _processor_coor; // linear processor coordinate static Grid_MPI_Comm communicator_world; Grid_MPI_Comm communicator; std::vector communicator_halo; @@ -97,14 +98,16 @@ public: int BossRank(void) ; int ThisRank(void) ; const Coordinate & ThisProcessorCoor(void) ; + const Coordinate & ShmGrid(void) { return _shm_processors; } ; const Coordinate & ProcessorGrid(void) ; - int ProcessorCount(void) ; + int ProcessorCount(void) ; //////////////////////////////////////////////////////////////////////////////// // very VERY rarely (Log, serial RNG) we need world without a grid //////////////////////////////////////////////////////////////////////////////// static int RankWorld(void) ; static void BroadcastWorld(int root,void* data, int bytes); + static void BarrierWorld(void); //////////////////////////////////////////////////////////// // Reduction @@ -128,7 +131,7 @@ public: template void GlobalSum(obj &o){ typedef typename obj::scalar_type scalar_type; int words = sizeof(obj)/sizeof(scalar_type); - scalar_type * ptr = (scalar_type *)& o; + scalar_type * ptr = (scalar_type *)& o; // Safe alias GlobalSumVector(ptr,words); } @@ -142,17 +145,17 @@ public: int bytes); double StencilSendToRecvFrom(void *xmit, - int xmit_to_rank, + int xmit_to_rank,int do_xmit, void *recv, - int recv_from_rank, + int recv_from_rank,int do_recv, int bytes,int dir); double StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int xmit_to_rank, + int xmit_to_rank,int do_xmit, void *recv, - int recv_from_rank, - int bytes,int dir); + int recv_from_rank,int do_recv, + int xbytes,int rbytes,int dir); void StencilSendToRecvFromComplete(std::vector &waitall,int i); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 305a3a9b..e7d7a96d 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) // Remap using the shared memory optimising routine // The remap creates a comm which must be freed //////////////////////////////////////////////////// - GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm); + GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors); InitFromMPICommunicator(processors,optimal_comm); SetCommunicator(optimal_comm); /////////////////////////////////////////////////// @@ -124,12 +124,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); Coordinate parent_processor_coor(_ndimension,0); Coordinate parent_processors (_ndimension,1); - + Coordinate shm_processors (_ndimension,1); // Can make 5d grid from 4d etc... 
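  // e.g. embedding a 4d parent {2,2,2,2} in a 5d child: pad==1, so the new leading
  // dimension keeps the initialised value 1 and parent_processors becomes {1,2,2,2,2}.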
int pad = _ndimension-parent_ndimension; for(int d=0;d list; - double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); StencilSendToRecvFromComplete(list,dir); return offbytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int dest, + int dest,int dox, void *recv, - int from, - int bytes,int dir) + int from,int dor, + int xbytes,int rbytes,int dir) { int ncomm =communicator_halo.size(); int commdir=dir%ncomm; @@ -370,29 +372,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv); - assert(shm!=NULL); - acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); - } - - if ( CommunicatorPolicy == CommunicatorPolicySequential ) { - this->StencilSendToRecvFromComplete(list,dir); + + if (dox) { + if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { + tag= dir+_processor*32; + ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); + assert(ierr==0); + list.push_back(xrq); + off_node_bytes+=xbytes; + } else { + void *shm = (void *) this->ShmBufferTranslate(dest,recv); + assert(shm!=NULL); + acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); + } } return off_node_bytes; @@ -404,7 +405,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector status(nreq); - acceleratorCopySynchronise(); int ierr = MPI_Waitall(nreq,&list[0],&status[0]); assert(ierr==0); list.resize(0); @@ -435,6 +435,10 @@ int CartesianCommunicator::RankWorld(void){ MPI_Comm_rank(communicator_world,&r); return r; } +void CartesianCommunicator::BarrierWorld(void){ + int ierr = MPI_Barrier(communicator_world); + assert(ierr==0); +} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { int ierr= MPI_Bcast(data, diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index beb2cc97..a06e054d 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) : CartesianCommunicator(processors) { + _shm_processors = Coordinate(processors.size(),1); srank=0; SetCommunicator(communicator_world); } CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) { + _shm_processors = Coordinate(processors.size(),1); _processors = processors; _ndimension = processors.size(); assert(_ndimension>=1); _processor_coor.resize(_ndimension); @@ -102,6 +104,7 @@ int CartesianCommunicator::RankWorld(void){return 0;} void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } +void CartesianCommunicator::BarrierWorld(void) { } int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;} void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; } void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) @@ -111,21 +114,21 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest } double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, - int xmit_to_rank, + int xmit_to_rank,int dox, void *recv, - int recv_from_rank, + int recv_from_rank,int dor, 
int bytes, int dir) { return 2.0*bytes; } double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, void *xmit, - int xmit_to_rank, + int xmit_to_rank,int dox, void *recv, - int recv_from_rank, - int bytes, int dir) + int recv_from_rank,int dor, + int xbytes,int rbytes, int dir) { - return 2.0*bytes; + return xbytes+rbytes; } void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc index de10da3d..ec42dd87 100644 --- a/Grid/communicator/SharedMemory.cc +++ b/Grid/communicator/SharedMemory.cc @@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void) //std::cerr << "ShmBufferSelf "< IntShmDims; + GridCmdOptionIntVector(std::string(str),IntShmDims); + assert(IntShmDims.size() == WorldDims.size()); + long ShmSize = 1; + for (int dim=0;dim primes({2,3,5}); + + int dim = 0; + int last_dim = ndimension - 1; + int AutoShmSize = 1; + while(AutoShmSize != WorldShmSize) { + int p; + for(p=0;p *************************************************************************************/ /* END LEGAL */ +#define Mheader "SharedMemoryMpi: " + #include #include @@ -36,12 +38,120 @@ Author: Christoph Lehner #ifdef GRID_HIP #include #endif -#ifdef GRID_SYCl - +#ifdef GRID_SYCL +#define GRID_SYCL_LEVEL_ZERO_IPC +#include +#define SHM_SOCKETS #endif +#include +#include + NAMESPACE_BEGIN(Grid); -#define header "SharedMemoryMpi: " + +#ifdef SHM_SOCKETS + +/* + * Barbaric extra intranode communication route in case we need sockets to pass FDs + * Forced by level_zero not being nicely designed + */ +static int sock; +static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d"; +static char sock_path[256]; +class UnixSockets { +public: + static void Open(int rank) + { + int errnum; + + sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0); + + struct sockaddr_un sa_un = { 0 }; + sa_un.sun_family = AF_UNIX; + snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank); + unlink(sa_un.sun_path); + if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) { + perror("bind failure"); + exit(EXIT_FAILURE); + } + } + + static int RecvFileDescriptor(void) + { + int n; + int fd; + char buf[1]; + struct iovec iov; + struct msghdr msg; + struct cmsghdr *cmsg; + char cms[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + + memset(&msg, 0, sizeof msg); + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + msg.msg_control = (caddr_t)cms; + msg.msg_controllen = sizeof cms; + + if((n=recvmsg(sock, &msg, 0)) < 0) { + perror("recvmsg failed"); + return -1; + } + if(n == 0){ + perror("recvmsg returned 0"); + return -1; + } + cmsg = CMSG_FIRSTHDR(&msg); + + memmove(&fd, CMSG_DATA(cmsg), sizeof(int)); + + return fd; + } + + static void SendFileDescriptor(int fildes,int xmit_to_rank) + { + struct msghdr msg; + struct iovec iov; + struct cmsghdr *cmsg = NULL; + char ctrl[CMSG_SPACE(sizeof(int))]; + char data = ' '; + + memset(&msg, 0, sizeof(struct msghdr)); + memset(ctrl, 0, CMSG_SPACE(sizeof(int))); + iov.iov_base = &data; + iov.iov_len = sizeof(data); + + sprintf(sock_path,sock_path_fmt,xmit_to_rank); + + struct sockaddr_un sa_un = { 0 }; + sa_un.sun_family = AF_UNIX; + snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank); + + msg.msg_name = (void *)&sa_un; + msg.msg_namelen = sizeof(sa_un); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_controllen = CMSG_SPACE(sizeof(int)); + msg.msg_control = ctrl; + + cmsg = 
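    // CMSG_FIRSTHDR points at the first control-message header in msg_control;
    // tagging it SOL_SOCKET/SCM_RIGHTS below is what instructs the kernel to
    // duplicate the descriptor placed in CMSG_DATA into the receiving process.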
CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + *((int *) CMSG_DATA(cmsg)) = fildes; + + sendmsg(sock, &msg, 0); + }; +}; +#endif + + /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) { @@ -64,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) MPI_Comm_size(WorldShmComm ,&WorldShmSize); if ( WorldRank == 0) { - std::cout << header " World communicator of size " < IntShmDims; - GridCmdOptionIntVector(std::string(str),IntShmDims); - assert(IntShmDims.size() == WorldDims.size()); - long ShmSize = 1; - for (int dim=0;dim primes({2,3,5}); - - int dim = 0; - int last_dim = ndimension - 1; - int AutoShmSize = 1; - while(AutoShmSize != WorldShmSize) { - int p; - for(p=0;p(theGridAccelerator->get_device()); auto zeContext = cl::sycl::get_native(theGridAccelerator->get_context()); ze_ipc_mem_handle_t ihandle; clone_mem_t handle; - + if ( r==WorldShmRank ) { auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); if ( err != ZE_RESULT_SUCCESS ) { - std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "< void Scatter_plane_merge(Lattice &rhs,ExtractPointerA } } +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + +template +T iDivUp(T a, T b) // Round a / b to nearest higher integer value +{ return (a % b != 0) ? (a / b + 1) : (a / b); } + +template +__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) +{ + int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx >= e1*e2) return; + + int n, b, o; + + n = idx / e2; + b = idx % e2; + o = n*stride + b; + + vector[2*idx + 0] = lo + o; + vector[2*idx + 1] = ro + o; +} + +#endif + ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -321,12 +345,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ +#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) + ent = e1*e2; + dim3 blockSize(acceleratorThreads()); + dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); + populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o,ro+o); } } +#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); + accelerator_barrier(); +#else for(int n=0;n(lo+o+b,ro+o+b); }} +#endif } else { for(int n=0;n. SPDX-License-Identifier: MIT -Copyright (c) 2013-2018 Niels Lohmann . +Copyright (c) 2013-2022 Niels Lohmann . Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,588 +27,56 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef NLOHMANN_JSON_HPP -#define NLOHMANN_JSON_HPP +/****************************************************************************\ + * Note on documentation: The source files contain links to the online * + * documentation of the public API at https://json.nlohmann.me. This URL * + * contains the most recent documentation and should also be applicable to * + * previous versions; documentation for deprecated functions is not * + * removed, but marked deprecated. See "Generate documentation" section in * + * file doc/README.md. 
* +\****************************************************************************/ + +#ifndef INCLUDE_NLOHMANN_JSON_HPP_ +#define INCLUDE_NLOHMANN_JSON_HPP_ #define NLOHMANN_JSON_VERSION_MAJOR 3 -#define NLOHMANN_JSON_VERSION_MINOR 2 -#define NLOHMANN_JSON_VERSION_PATCH 0 +#define NLOHMANN_JSON_VERSION_MINOR 10 +#define NLOHMANN_JSON_VERSION_PATCH 5 #include // all_of, find, for_each -#include // assert -#include // and, not, or #include // nullptr_t, ptrdiff_t, size_t #include // hash, less #include // initializer_list -#include // istream, ostream -#include // iterator_traits, random_access_iterator_tag +#ifndef JSON_NO_IO + #include // istream, ostream +#endif // JSON_NO_IO +#include // random_access_iterator_tag +#include // unique_ptr #include // accumulate #include // string, stoi, to_string #include // declval, forward, move, pair, swap - -// #include -#ifndef NLOHMANN_JSON_FWD_HPP -#define NLOHMANN_JSON_FWD_HPP - -#include // int64_t, uint64_t -#include // map -#include // allocator -#include // string #include // vector -/*! -@brief namespace for Niels Lohmann -@see https://github.com/nlohmann -@since version 1.0.0 -*/ -namespace nlohmann -{ -/*! -@brief default JSONSerializer template argument - -This serializer ignores the template arguments and uses ADL -([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl)) -for serialization. -*/ -template -struct adl_serializer; - -template class ObjectType = - std::map, - template class ArrayType = std::vector, - class StringType = std::string, class BooleanType = bool, - class NumberIntegerType = std::int64_t, - class NumberUnsignedType = std::uint64_t, - class NumberFloatType = double, - template class AllocatorType = std::allocator, - template class JSONSerializer = - adl_serializer> -class basic_json; - -/*! -@brief JSON Pointer - -A JSON pointer defines a string syntax for identifying a specific value -within a JSON document. It can be used with functions `at` and -`operator[]`. Furthermore, JSON pointers are the base for JSON patches. - -@sa [RFC 6901](https://tools.ietf.org/html/rfc6901) - -@since version 2.0.0 -*/ -template -class json_pointer; - -/*! -@brief default JSON class - -This type is the default specialization of the @ref basic_json class which -uses the standard template types. 
- -@since version 1.0.0 -*/ -using json = basic_json<>; -} - -#endif - -// #include - - -// This file contains all internal macro definitions -// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them - -// exclude unsupported compilers -#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK) - #if defined(__clang__) - #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400 - #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER)) - #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800 - #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers" - #endif - #endif -#endif - -// disable float-equal warnings on GCC/clang -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - -// disable documentation warnings on clang -#if defined(__clang__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wdocumentation" -#endif - -// allow for portable deprecation warnings -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #define JSON_DEPRECATED __attribute__((deprecated)) -#elif defined(_MSC_VER) - #define JSON_DEPRECATED __declspec(deprecated) -#else - #define JSON_DEPRECATED -#endif - -// allow to disable exceptions -#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) - #define JSON_THROW(exception) throw exception - #define JSON_TRY try - #define JSON_CATCH(exception) catch(exception) - #define JSON_INTERNAL_CATCH(exception) catch(exception) -#else - #define JSON_THROW(exception) std::abort() - #define JSON_TRY if(true) - #define JSON_CATCH(exception) if(false) - #define JSON_INTERNAL_CATCH(exception) if(false) -#endif - -// override exception macros -#if defined(JSON_THROW_USER) - #undef JSON_THROW - #define JSON_THROW JSON_THROW_USER -#endif -#if defined(JSON_TRY_USER) - #undef JSON_TRY - #define JSON_TRY JSON_TRY_USER -#endif -#if defined(JSON_CATCH_USER) - #undef JSON_CATCH - #define JSON_CATCH JSON_CATCH_USER - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_CATCH_USER -#endif -#if defined(JSON_INTERNAL_CATCH_USER) - #undef JSON_INTERNAL_CATCH - #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER -#endif - -// manual branch prediction -#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) - #define JSON_LIKELY(x) __builtin_expect(!!(x), 1) - #define JSON_UNLIKELY(x) __builtin_expect(!!(x), 0) -#else - #define JSON_LIKELY(x) x - #define JSON_UNLIKELY(x) x -#endif - -// C++ language standard detection -#if (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464 - #define JSON_HAS_CPP_17 - #define JSON_HAS_CPP_14 -#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1) - #define JSON_HAS_CPP_14 -#endif - -// Ugly macros to avoid uglier copy-paste when specializing basic_json. They -// may be removed in the future once the class is split. 
- -#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \ - template class ObjectType, \ - template class ArrayType, \ - class StringType, class BooleanType, class NumberIntegerType, \ - class NumberUnsignedType, class NumberFloatType, \ - template class AllocatorType, \ - template class JSONSerializer> - -#define NLOHMANN_BASIC_JSON_TPL \ - basic_json - -// #include - - -#include // not -#include // size_t -#include // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type - -namespace nlohmann -{ -namespace detail -{ -// alias templates to reduce boilerplate -template -using enable_if_t = typename std::enable_if::type; - -template -using uncvref_t = typename std::remove_cv::type>::type; - -// implementation of C++14 index_sequence and affiliates -// source: https://stackoverflow.com/a/32223343 -template -struct index_sequence -{ - using type = index_sequence; - using value_type = std::size_t; - static constexpr std::size_t size() noexcept - { - return sizeof...(Ints); - } -}; - -template -struct merge_and_renumber; - -template -struct merge_and_renumber, index_sequence> - : index_sequence < I1..., (sizeof...(I1) + I2)... > {}; - -template -struct make_index_sequence - : merge_and_renumber < typename make_index_sequence < N / 2 >::type, - typename make_index_sequence < N - N / 2 >::type > {}; - -template<> struct make_index_sequence<0> : index_sequence<> {}; -template<> struct make_index_sequence<1> : index_sequence<0> {}; - -template -using index_sequence_for = make_index_sequence; - -// dispatch utility (taken from ranges-v3) -template struct priority_tag : priority_tag < N - 1 > {}; -template<> struct priority_tag<0> {}; - -// taken from ranges-v3 -template -struct static_const -{ - static constexpr T value{}; -}; - -template -constexpr T static_const::value; -} -} - -// #include - - -#include // not -#include // numeric_limits -#include // false_type, is_constructible, is_integral, is_same, true_type -#include // declval - -// #include - -// #include - -// #include +// #include #include - -// #include - - -namespace nlohmann -{ -namespace detail -{ -template struct make_void -{ - using type = void; -}; -template using void_t = typename make_void::type; -} -} - - -// http://en.cppreference.com/w/cpp/experimental/is_detected -namespace nlohmann -{ -namespace detail -{ -struct nonesuch -{ - nonesuch() = delete; - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; - void operator=(nonesuch const&) = delete; -}; - -template class Op, - class... Args> -struct detector -{ - using value_t = std::false_type; - using type = Default; -}; - -template class Op, class... Args> -struct detector>, Op, Args...> -{ - using value_t = std::true_type; - using type = Op; -}; - -template