diff --git a/.gitignore b/.gitignore index 5338acb9..40156f9d 100644 --- a/.gitignore +++ b/.gitignore @@ -88,6 +88,7 @@ Thumbs.db # build directory # ################### build*/* +Documentation/_build # IDE related files # ##################### diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 3a0e1e35..00000000 --- a/.travis.yml +++ /dev/null @@ -1,56 +0,0 @@ -language: cpp - -cache: - directories: - - clang - -matrix: - include: - - os: osx - osx_image: xcode8.3 - compiler: clang - -before_install: - - export GRIDDIR=`pwd` - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi - -install: - - export CWD=`pwd` - - echo $CWD - - export CC=$CC$VERSION - - export CXX=$CXX$VERSION - - echo $PATH - - which autoconf - - autoconf --version - - which automake - - automake --version - - which $CC - - $CC --version - - which $CXX - - $CXX --version - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi - -script: - - ./bootstrap.sh - - mkdir build - - cd build - - mkdir lime - - cd lime - - mkdir build - - cd build - - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz - - tar xf lime-1.3.2.tar.gz - - cd lime-1.3.2 - - ./configure --prefix=$CWD/build/lime/install - - make -j4 - - make install - - cd $CWD/build - - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} - - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - - make check diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h index 8ea219fb..015e19d1 100644 --- a/Grid/DisableWarnings.h +++ b/Grid/DisableWarnings.h @@ -37,19 +37,29 @@ directory #endif //disables and intel compiler specific warning (in json.hpp) +#ifdef __ICC #pragma warning disable 488 +#endif #ifdef __NVCC__ //disables nvcc specific warning in json.hpp #pragma clang diagnostic ignored "-Wdeprecated-register" + +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ + //disables nvcc specific warning in json.hpp +#pragma nv_diag_suppress unsigned_compare_with_zero +#pragma nv_diag_suppress cast_to_qualified_type + //disables nvcc specific warning in many files +#pragma nv_diag_suppress esa_on_defaulted_function_ignored +#pragma nv_diag_suppress extra_semicolon +#else + //disables nvcc specific warning in json.hpp #pragma diag_suppress unsigned_compare_with_zero #pragma diag_suppress cast_to_qualified_type - //disables nvcc specific warning in many files #pragma diag_suppress esa_on_defaulted_function_ignored #pragma diag_suppress extra_semicolon - -//Eigen only +#endif #endif // Disable vectorisation in Eigen on the Power8/9 and PowerPC diff --git a/Grid/GridStd.h b/Grid/GridStd.h index 28f6bc46..d0a8124a 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index c62d9cdb..bdd39a65 
100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -14,7 +14,11 @@ /* NVCC save and restore compile environment*/ #ifdef __NVCC__ #pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress code_is_unreachable +#else #pragma diag_suppress code_is_unreachable +#endif #pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__NVCC__") #pragma push_macro("__CUDACC__") diff --git a/Grid/Makefile.am b/Grid/Makefile.am index f1fa462e..7c3c151b 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -21,6 +21,7 @@ if BUILD_HDF5 extra_headers+=serialisation/Hdf5Type.h endif + all: version-cache Version.h version-cache: @@ -53,6 +54,19 @@ Version.h: version-cache include Make.inc include Eigen.inc +extra_sources+=$(WILS_FERMION_FILES) +extra_sources+=$(STAG_FERMION_FILES) +if BUILD_ZMOBIUS + extra_sources+=$(ZWILS_FERMION_FILES) +endif +if BUILD_GPARITY + extra_sources+=$(GP_FERMION_FILES) +endif +if BUILD_FERMION_REPS + extra_sources+=$(ADJ_FERMION_FILES) + extra_sources+=$(TWOIND_FERMION_FILES) +endif + lib_LIBRARIES = libGrid.a CCFILES += $(extra_sources) diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 7f27784b..ff3da17d 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB); #include #include #include +#include #include #include #include diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 8d184aea..ba4abecd 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -31,6 +31,7 @@ Author: paboyle #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H +#include // needed for Dagger(Yes|No), Inverse(Yes|No) NAMESPACE_BEGIN(Grid); @@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice &CoarseInner, class Geometry { public: int npoint; + int base; std::vector directions ; std::vector displacements; + std::vector points_dagger; Geometry(int _d) { - int base = (_d==5) ? 1:0; + base = (_d==5) ? 
1:0; // make coarse grid stencil for 4d , not 5d if ( _d==5 ) _d=4; @@ -72,16 +75,51 @@ public: npoint = 2*_d+1; directions.resize(npoint); displacements.resize(npoint); + points_dagger.resize(npoint); for(int d=0;d<_d;d++){ directions[d ] = d+base; directions[d+_d] = d+base; displacements[d ] = +1; displacements[d+_d]= -1; + points_dagger[d ] = d+_d; + points_dagger[d+_d] = d; } directions [2*_d]=0; displacements[2*_d]=0; + points_dagger[2*_d]=2*_d; } + int point(int dir, int disp) { + assert(disp == -1 || disp == 0 || disp == 1); + assert(base+0 <= dir && dir < base+4); + + // directions faster index = new indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 1 2 3 0 1 2 3 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 2 3 4 1 2 3 4 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + + // displacements faster index = old indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 0 1 1 2 2 3 3 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 1 2 2 3 3 4 4 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + + if(dir == 0 and disp == 0) + return 8; + else // New indexing + return (1 - disp) / 2 * 4 + dir - base; + // else // Old indexing + // return (4 * (dir - base) + 1 - disp) / 2; + } }; template @@ -224,7 +262,7 @@ public: autoView( Tnp_v , (*Tnp), AcceleratorWrite); autoView( Tnm_v , (*Tnm), AcceleratorWrite); const int Nsimd = CComplex::Nsimd(); - accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { + accelerator_for(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); }); @@ -258,7 +296,7 @@ public: // Fine Object == (per site) type of fine field // nbasis == number of deflation vectors template -class CoarsenedMatrix : public SparseMatrixBase > > { +class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase > > { public: typedef iVector siteVector; @@ -268,33 +306,59 @@ public: typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; + typedef CoarseVector FermionField; + + // enrich interface, use default implementation as in FermionOperator /////// + void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } + void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; } + void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; + void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; //////////////////// // Data members //////////////////// Geometry geom; GridBase * _grid; + GridBase* _cbgrid; int hermitian; CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; - + std::vector Aeven; + std::vector Aodd; + + CoarseMatrix AselfInv; + CoarseMatrix AselfInvEven; + CoarseMatrix AselfInvOdd; + + Vector dag_factor; + /////////////////////// // Interface /////////////////////// GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know + GridBase * RedBlackGrid() { return _cbgrid; }; + + int ConstEE() { return 0; } void M (const CoarseVector &in, CoarseVector &out) { conformable(_grid,in.Grid()); 
conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); SimpleCompressor compressor; Stencil.HaloExchange(in,compressor); autoView( in_v , in, AcceleratorRead); autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + int npoint = geom.npoint; typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -316,14 +380,14 @@ public: int ptype; StencilEntry *SE; - for(int point=0;point_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -344,12 +408,74 @@ public: return M(in,out); } else { // corresponds to Galerkin coarsening - CoarseVector tmp(Grid()); - G5C(tmp, in); - M(tmp, out); - G5C(out, out); + return MdagNonHermitian(in, out); } }; + + void MdagNonHermitian(const CoarseVector &in, CoarseVector &out) + { + conformable(_grid,in.Grid()); + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + int npoint = geom.npoint; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + + for(int p=0;poSites(); + + Vector points(geom.npoint, 0); + for(int p=0; poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; @@ -359,6 +485,7 @@ public: { conformable(_grid,in.Grid()); conformable(_grid,out.Grid()); + out.Checkerboard() = in.Checkerboard(); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -367,6 +494,7 @@ public: autoView( out_v , out, AcceleratorWrite); autoView( in_v , in, AcceleratorRead); + autoView( Stencil_v , Stencil, AcceleratorRead); const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; @@ -380,12 +508,12 @@ public: int ptype; StencilEntry *SE; - SE=Stencil.GetEntry(ptype,point,ss); + SE=Stencil_v.GetEntry(ptype,point,ss); if(SE->_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -413,34 +541,7 @@ public: this->MdirComms(in); - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 - // etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 - // etc.. 
- auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; - } else { - return (4 * (dir-1) + 1 - disp) / 2; - } - }(); - - MdirCalc(in,out,point); - + MdirCalc(in,out,geom.point(dir,disp)); }; void Mdiag(const CoarseVector &in, CoarseVector &out) @@ -449,23 +550,298 @@ public: MdirCalc(in, out, point); // No comms }; - - CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) : + void Mooee(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseNo); + } + void MooeeInv(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseYes); + } + + void MooeeDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseNo); + } + + void MooeeInvDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseYes); + } + + void Meooe(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } + } + + void MeooeDag(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } + } + + void Dhop(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, A, in, out, dag); + } + + void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, Aodd, in, out, dag); + } + + void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, Aeven, in, out, dag); + } + + void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { + out.Checkerboard() = in.Checkerboard(); + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + + CoarseMatrix *Aself = nullptr; + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1]; + DselfInternal(StencilOdd, *Aself, in, out, dag); + } else { + Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1]; + DselfInternal(StencilEven, *Aself, in, out, dag); + } + } else { + Aself = (inv) ? 
&AselfInv : &A[geom.npoint-1]; + DselfInternal(Stencil, *Aself, in, out, dag); + } + assert(Aself != nullptr); + } + + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + const CoarseVector &in, CoarseVector &out, int dag) { + int point = geom.npoint-1; + autoView( out_v, out, AcceleratorWrite); + autoView( in_v, in, AcceleratorRead); + autoView( st_v, st, AcceleratorRead); + autoView( a_v, a, AcceleratorRead); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + RealD* dag_factor_p = &dag_factor[0]; + + if(dag) { + accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb &st, std::vector &a, + const CoarseVector &in, CoarseVector &out, int dag) { + SimpleCompressor compressor; + + st.HaloExchange(in,compressor); + autoView( in_v, in, AcceleratorRead); + autoView( out_v, out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + typedef LatticeView Aview; + + // determine in what order we need the points + int npoint = geom.npoint-1; + Vector points(npoint, 0); + for(int p=0; p AcceleratorViewContainer; + for(int p=0;poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb > &linop, Aggregation & Subspace) { typedef Lattice FineComplexField; typedef typename Fobj::scalar_type scalar_type; + std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl; + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); @@ -496,11 +872,13 @@ public: CoarseScalar InnerProd(Grid()); + std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl; // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); // Compute the matrix elements of linop between this orthonormal // set of vectors. 
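// Aside (illustrative, not part of the patch): the dag kernels above multiply each
// link element A(b,bb) by dag_factor[b*nbasis+bb] instead of composing
// Mdag = G5C * M * G5C as the old code did. Assuming the coarse gamma_5 used by
// G5C acts as diag(+1,...,+1,-1,...,-1) on the nbasis components, one has
//   (g5 A g5)(b,bb) = s_b * s_bb * A(b,bb),  s_b = (b < nbasis/2) ? +1 : -1,
// so the conjugation reduces to a precomputed sign table (all +1 in the hermitian
// case, where Mdag == M). A minimal sketch of filling that table; the helper name
// fillDagFactor is illustrative only:
static void fillDagFactor(Vector<RealD> &dag_factor, int nbasis) {
  dag_factor.resize(nbasis*nbasis);
  for(int b=0;b<nbasis;b++){
    RealD sb = (b < nbasis/2) ? 1.0 : -1.0;
    for(int bb=0;bb<nbasis;bb++){
      RealD sbb = (bb < nbasis/2) ? 1.0 : -1.0;
      dag_factor[b*nbasis+bb] = sb*sbb; // +1 on diagonal blocks, -1 on off-diagonal blocks
    }
  }
}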
+ std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl; int self_stencil=-1; for(int p=0;poSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); + if ( hermitian && (disp==-1) ) { + for(int pp=0;pp = * + int dirp = geom.directions[pp]; + int dispp = geom.displacements[pp]; + if ( (dirp==dir) && (dispp==1) ){ + auto sft = conjugate(Cshift(oZProj,dir,1)); + autoView( sft_v , sft , AcceleratorWrite); + autoView( A_pp , A[pp], AcceleratorWrite); + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); }); + } + } + } } } @@ -606,28 +996,54 @@ public: } if(hermitian) { std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); + + typedef typename Cobj::scalar_object scalar_object; + + autoView(Aself_v, A[geom.npoint-1], CpuRead); + autoView(AselfInv_v, AselfInv, CpuWrite); + thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke + Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + + scalar_object selfLink = Zero(); + scalar_object selfLinkInv = Zero(); + + Coordinate lcoor; + + Grid()->LocalIndexToLocalCoor(site, lcoor); + peekLocalSite(selfLink, Aself_v, lcoor); + + for (int i = 0; i < nbasis; ++i) + for (int j = 0; j < nbasis; ++j) + selfLinkEigen(i, j) = static_cast(TensorRemove(selfLink(i, j))); + + selfLinkInvEigen = selfLinkEigen.inverse(); + + for(int i = 0; i < nbasis; ++i) + for(int j = 0; j < nbasis; ++j) + selfLinkInv(i, j) = selfLinkInvEigen(i, j); + + pokeLocalSite(selfLinkInv, AselfInv_v, lcoor); + }); + } + + void FillHalfCbs() { + std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl; + for(int p = 0; p < geom.npoint; ++p) { + pickCheckerboard(Even, Aeven[p], A[p]); + pickCheckerboard(Odd, Aodd[p], A[p]); } + pickCheckerboard(Even, AselfInvEven, AselfInv); + pickCheckerboard(Odd, AselfInvOdd, AselfInv); } }; diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index ad42f049..29f0ec4b 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -136,7 +136,7 @@ public: flops=0; usec =0; Coordinate layout(Nd,1); - sgrid = new GridCartesian(dimensions,layout,processors); + sgrid = new GridCartesian(dimensions,layout,processors,*grid); }; ~FFT ( void) { @@ -182,7 +182,7 @@ public: pencil_gd[dim] = G*processors[dim]; // Pencil global vol LxLxGxLxL per node - GridCartesian pencil_g(pencil_gd,layout,processors); + GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid); // Construct pencils typedef typename vobj::scalar_object sobj; diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index 1add212c..b1cf4d97 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -52,6 +52,7 @@ public: virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; virtual void HermOp(const Field &in, Field &out)=0; + virtual ~LinearOperatorBase(){}; }; @@ -507,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase { virtual void MpcDag (const Field &in, Field &out){ Mpc(in,out); } - virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { + virtual void MpcDagMpc(const Field &in, Field &out) { assert(0);// Never need with staggered } }; @@ -530,6 +531,16 @@ public: template class LinearFunction { public: virtual void operator() (const Field &in, Field &out) = 0; + + 
virtual void operator() (const std::vector &in, std::vector &out) + { + assert(in.size() == out.size()); + + for (unsigned int i = 0; i < in.size(); ++i) + { + (*this)(in[i], out[i]); + } + } }; template class IdentityLinearFunction : public LinearFunction { @@ -575,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction { template class PlainHermOp : public LinearFunction { public: + using LinearFunction::operator(); LinearOperatorBase &_Linop; PlainHermOp(LinearOperatorBase& linop) : _Linop(linop) @@ -588,6 +600,7 @@ public: template class FunctionHermOp : public LinearFunction { public: + using LinearFunction::operator(); OperatorFunction & _poly; LinearOperatorBase &_Linop; diff --git a/Grid/algorithms/Preconditioner.h b/Grid/algorithms/Preconditioner.h index 65ac3cf9..a95dad7c 100644 --- a/Grid/algorithms/Preconditioner.h +++ b/Grid/algorithms/Preconditioner.h @@ -30,13 +30,19 @@ Author: Azusa Yamaguchi NAMESPACE_BEGIN(Grid); -template class Preconditioner : public LinearFunction { +template using Preconditioner = LinearFunction ; + +/* +template class Preconditioner : public LinearFunction { + using LinearFunction::operator(); virtual void operator()(const Field &src, Field & psi)=0; }; +*/ template class TrivialPrecon : public Preconditioner { public: - void operator()(const Field &src, Field & psi){ + using Preconditioner::operator(); + virtual void operator()(const Field &src, Field & psi){ psi = src; } TrivialPrecon(void){}; diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index 8a265b3f..b4bab20c 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -48,6 +48,7 @@ public: virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void MdirAll (const Field &in, std::vector &out)=0; + virtual ~SparseMatrixBase() {}; }; ///////////////////////////////////////////////////////////////////////////////////////////// @@ -72,7 +73,7 @@ public: virtual void MeooeDag (const Field &in, Field &out)=0; virtual void MooeeDag (const Field &in, Field &out)=0; virtual void MooeeInvDag (const Field &in, Field &out)=0; - + virtual ~CheckerBoardedSparseMatrixBase() {}; }; NAMESPACE_END(Grid); diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 584ed1d5..7c93f0b8 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -264,7 +264,7 @@ public: auto Tnp_v = Tnp->View(); auto Tnm_v = Tnm->View(); constexpr int Nsimd = vector_type::Nsimd(); - accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, { + accelerator_for(ss, in.Grid()->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); }); diff --git a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h index d9b2a9d7..c34035a8 100644 --- a/Grid/algorithms/iterative/BiCGSTABMixedPrec.h +++ b/Grid/algorithms/iterative/BiCGSTABMixedPrec.h @@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid); template::value == 2, int>::type = 0, typename std::enable_if< getPrecision::value == 1, int>::type = 0> class MixedPrecisionBiCGSTAB : public LinearFunction { - public: + public: + using LinearFunction::operator(); RealD Tolerance; RealD InnerTolerance; // Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed Integer MaxInnerIterations; diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h index 08942097..cd2e4374 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrec.h @@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid); typename std::enable_if< getPrecision::value == 2, int>::type = 0, typename std::enable_if< getPrecision::value == 1, int>::type = 0> class MixedPrecisionConjugateGradient : public LinearFunction { - public: + public: + using LinearFunction::operator(); RealD Tolerance; RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed Integer MaxInnerIterations; diff --git a/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h new file mode 100644 index 00000000..93f5c677 --- /dev/null +++ b/Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h @@ -0,0 +1,213 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h + + Copyright (C) 2015 + + Author: Raoul Hodgson + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H +#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H + +NAMESPACE_BEGIN(Grid); + +//Mixed precision restarted defect correction CG +template::value == 2, int>::type = 0, + typename std::enable_if< getPrecision::value == 1, int>::type = 0> +class MixedPrecisionConjugateGradientBatched : public LinearFunction { +public: + using LinearFunction::operator(); + RealD Tolerance; + RealD InnerTolerance; //Initial tolerance for inner CG. 
Defaults to Tolerance but can be changed + Integer MaxInnerIterations; + Integer MaxOuterIterations; + Integer MaxPatchupIterations; + GridBase* SinglePrecGrid; //Grid for single-precision fields + RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance + LinearOperatorBase &Linop_f; + LinearOperatorBase &Linop_d; + + //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess + LinearFunction *guesser; + bool updateResidual; + + MixedPrecisionConjugateGradientBatched(RealD tol, + Integer maxinnerit, + Integer maxouterit, + Integer maxpatchit, + GridBase* _sp_grid, + LinearOperatorBase &_Linop_f, + LinearOperatorBase &_Linop_d, + bool _updateResidual=true) : + Linop_f(_Linop_f), Linop_d(_Linop_d), + Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid), + OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { }; + + void useGuesser(LinearFunction &g){ + guesser = &g; + } + + void operator() (const FieldD &src_d_in, FieldD &sol_d){ + std::vector srcs_d_in{src_d_in}; + std::vector sols_d{sol_d}; + + (*this)(srcs_d_in,sols_d); + + sol_d = sols_d[0]; + } + + void operator() (const std::vector &src_d_in, std::vector &sol_d){ + assert(src_d_in.size() == sol_d.size()); + int NBatch = src_d_in.size(); + + std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; + + Integer TotalOuterIterations = 0; //Number of restarts + std::vector TotalInnerIterations(NBatch,0); //Number of inner CG iterations + std::vector TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step + + GridStopWatch TotalTimer; + TotalTimer.Start(); + + GridStopWatch InnerCGtimer; + GridStopWatch PrecChangeTimer; + + int cb = src_d_in[0].Checkerboard(); + + std::vector src_norm; + std::vector norm; + std::vector stop; + + GridBase* DoublePrecGrid = src_d_in[0].Grid(); + FieldD tmp_d(DoublePrecGrid); + tmp_d.Checkerboard() = cb; + + FieldD tmp2_d(DoublePrecGrid); + tmp2_d.Checkerboard() = cb; + + std::vector src_d; + std::vector src_f; + std::vector sol_f; + + for (int i=0; i CG_f(inner_tol, MaxInnerIterations); + CG_f.ErrorOnNoConverge = false; + + Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count + + for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl; + + bool allConverged = true; + + for (int i=0; i OuterLoopNormMult * stop[i]) { + allConverged = false; + } + } + if (allConverged) break; + + if (updateResidual) { + RealD normMax = *std::max_element(std::begin(norm), std::end(norm)); + RealD stopMax = *std::max_element(std::begin(stop), std::end(stop)); + while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? 
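+ // Loosen the inner tolerance as the outer residual falls: keep
+ // normMax * inner_tol^2 >= stopMax, i.e. inner_tol ~ sqrt(stop/norm),
+ // so each single-precision restart is only asked for the accuracy the
+ // outer defect-correction step can actually absorb.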
+ CG_f.Tolerance = inner_tol; + } + + //Optionally improve inner solver guess (eg using known eigenvectors) + if(guesser != NULL) { + (*guesser)(src_f, sol_f); + } + + for (int i=0; i CG_d(Tolerance, MaxPatchupIterations); + CG_d(Linop_d, src_d_in[i], sol_d[i]); + TotalFinalStepIterations[i] += CG_d.IterationsToComplete; + } + + TotalTimer.Stop(); + + std::cout << GridLogMessage << std::endl; + for (int i=0; i class ZeroGuesser: public LinearFunction { public: + using LinearFunction::operator(); virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }; }; template class DoNothingGuesser: public LinearFunction { public: + using LinearFunction::operator(); virtual void operator()(const Field &src, Field &guess) { }; }; template class SourceGuesser: public LinearFunction { public: + using LinearFunction::operator(); virtual void operator()(const Field &src, Field &guess) { guess = src; }; }; @@ -54,15 +57,24 @@ class DeflatedGuesser: public LinearFunction { private: const std::vector &evec; const std::vector &eval; + const unsigned int N; public: + using LinearFunction::operator(); - DeflatedGuesser(const std::vector & _evec,const std::vector & _eval) : evec(_evec), eval(_eval) {}; + DeflatedGuesser(const std::vector & _evec,const std::vector & _eval) + : DeflatedGuesser(_evec, _eval, _evec.size()) + {} + + DeflatedGuesser(const std::vector & _evec, const std::vector & _eval, const unsigned int _N) + : evec(_evec), eval(_eval), N(_N) + { + assert(evec.size()==eval.size()); + assert(N <= evec.size()); + } virtual void operator()(const Field &src,Field &guess) { guess = Zero(); - assert(evec.size()==eval.size()); - auto N = evec.size(); for (int i=0;i &eval_coarse; public: + using LinearFunction::operator(); LocalCoherenceDeflatedGuesser(const std::vector &_subspace, const std::vector &_evec_coarse, const std::vector &_eval_coarse) @@ -100,7 +113,43 @@ public: blockPromote(guess_coarse,guess,subspace); guess.Checkerboard() = src.Checkerboard(); }; -}; + + void operator()(const std::vector &src,std::vector &guess) { + int Nevec = (int)evec_coarse.size(); + int Nsrc = (int)src.size(); + // make temp variables + std::vector src_coarse(Nsrc,evec_coarse[0].Grid()); + std::vector guess_coarse(Nsrc,evec_coarse[0].Grid()); + //Preporcessing + std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl; + for (int j=0;j class ProjectedHermOp : public LinearFunction > > { public: + using LinearFunction > >::operator(); typedef iVector CoarseSiteVector; typedef Lattice CoarseField; typedef Lattice CoarseScalar; // used for inner products on fine field @@ -97,6 +98,7 @@ public: template class ProjectedFunctionHermOp : public LinearFunction > > { public: + using LinearFunction > >::operator(); typedef iVector CoarseSiteVector; typedef Lattice CoarseField; typedef Lattice CoarseScalar; // used for inner products on fine field diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h index bf454ade..feceb21f 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h @@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid); template class PrecGeneralisedConjugateResidual : public LinearFunction { public: - + using LinearFunction::operator(); RealD Tolerance; Integer MaxIterations; int verbose; diff --git a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h 
b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h index 22b7725e..181df320 100644 --- a/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h +++ b/Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h @@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid); template class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction { public: - + using LinearFunction::operator(); RealD Tolerance; Integer MaxIterations; int verbose; @@ -119,7 +119,8 @@ public: RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD cp; - ComplexD a, b, zAz; + ComplexD a, b; + // ComplexD zAz; RealD zAAz; ComplexD rq; @@ -146,7 +147,7 @@ public: ////////////////////////////////// MatTimer.Start(); Linop.Op(psi,Az); - zAz = innerProduct(Az,psi); + // zAz = innerProduct(Az,psi); zAAz= norm2(Az); MatTimer.Stop(); @@ -170,7 +171,7 @@ public: LinalgTimer.Start(); - zAz = innerProduct(Az,psi); + // zAz = innerProduct(Az,psi); zAAz= norm2(Az); //p[0],q[0],qq[0] @@ -212,7 +213,7 @@ public: MatTimer.Start(); Linop.Op(z,Az); MatTimer.Stop(); - zAz = innerProduct(Az,psi); + // zAz = innerProduct(Az,psi); zAAz= norm2(Az); LinalgTimer.Start(); diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h index d0b133a3..d97e4993 100644 --- a/Grid/algorithms/iterative/SchurRedBlack.h +++ b/Grid/algorithms/iterative/SchurRedBlack.h @@ -132,6 +132,31 @@ namespace Grid { (*this)(_Matrix,in,out,guess); } + void RedBlackSource(Matrix &_Matrix, const std::vector &in, std::vector &src_o) + { + GridBase *grid = _Matrix.RedBlackGrid(); + Field tmp(grid); + int nblock = in.size(); + for(int b=0;b &in, const std::vector &sol_o, std::vector &out) + { + GridBase *grid = _Matrix.RedBlackGrid(); + Field tmp(grid); + int nblock = in.size(); + for(int b=0;b void operator()(Matrix &_Matrix, const std::vector &in, std::vector &out,Guesser &guess) { @@ -150,24 +175,29 @@ namespace Grid { //////////////////////////////////////////////// // Prepare RedBlack source //////////////////////////////////////////////// - for(int b=0;b -#include - -NAMESPACE_BEGIN(Grid); - -MemoryStats *MemoryProfiler::stats = nullptr; -bool MemoryProfiler::debug = false; - -void check_huge_pages(void *Buf,uint64_t BYTES) -{ -#ifdef __linux__ - int fd = open("/proc/self/pagemap", O_RDONLY); - assert(fd >= 0); - const int page_size = 4096; - uint64_t virt_pfn = (uint64_t)Buf / page_size; - off_t offset = sizeof(uint64_t) * virt_pfn; - uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; - uint64_t ret = lseek(fd, offset, SEEK_SET); - assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); - assert(ret == sizeof(uint64_t) * npages); - int nhugepages = npages / 512; - int n4ktotal, nnothuge; - n4ktotal = 0; - nnothuge = 0; - for (int i = 0; i < nhugepages; ++i) { - uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; - for (int j = 0; j < 512; ++j) { - uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; - ++n4ktotal; - if (pageaddr != baseaddr + j * page_size) - ++nnothuge; - } - } - int rank = CartesianCommunicator::RankWorld(); - printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); -#endif -} - -std::string sizeString(const size_t bytes) -{ - constexpr unsigned int bufSize = 256; - const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; - char buf[256]; - size_t s = 0; - double count = bytes; - - while (count >= 1024 && s < 7) - { - s++; 
- count /= 1024; - } - if (count - floor(count) == 0.0) - { - snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); - } - else - { - snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); - } - - return std::string(buf); -} - -NAMESPACE_END(Grid); - diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 4b357523..91622789 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -173,7 +173,8 @@ template using cshiftAllocator = devAllocator; template using cshiftAllocator = std::allocator; #endif -template using Vector = std::vector >; +template using Vector = std::vector >; +template using stencilVector = std::vector >; template using commVector = std::vector >; template using cshiftVector = std::vector >; diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index c1a4d93a..a9e5c9b4 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -4,106 +4,156 @@ NAMESPACE_BEGIN(Grid); /*Allocation types, saying which pointer cache should be used*/ #define Cpu (0) -#define CpuSmall (1) -#define Acc (2) -#define AccSmall (3) -#define Shared (4) -#define SharedSmall (5) +#define CpuHuge (1) +#define CpuSmall (2) +#define Acc (3) +#define AccHuge (4) +#define AccSmall (5) +#define Shared (6) +#define SharedHuge (7) +#define SharedSmall (8) +#undef GRID_MM_VERBOSE uint64_t total_shared; uint64_t total_device; uint64_t total_host;; void MemoryManager::PrintBytes(void) { - std::cout << " MemoryManager : "<>20)<<" shared Mbytes "<>20)<<" accelerator Mbytes "<>20) <<" cpu Mbytes "<>20) <<" cpu cache Mbytes "<>20) <<" acc cache Mbytes "<>20) <<" shared cache Mbytes "<=0) && (Nc < NallocCacheMax)) { + Ncache[CpuHuge]=Nc; + Ncache[AccHuge]=Nc; + Ncache[SharedHuge]=Nc; + } + } + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) { Nc = atoi(str); @@ -147,7 +206,9 @@ void MemoryManager::InitMessage(void) { std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + + return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]); #else return ptr; #endif } -void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) +void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif + if (ncache == 0) return ptr; + void * ret = NULL; int v = -1; @@ -211,6 +276,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries if ( entries[v].valid ) { ret = entries[v].address; + cacheBytes -= entries[v].bytes; entries[v].valid = 0; entries[v].address = NULL; entries[v].bytes = 0; @@ -219,6 +285,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries entries[v].address=ptr; entries[v].bytes =bytes; entries[v].valid =1; + cacheBytes += bytes; return ret; } @@ -226,23 +293,26 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries void *MemoryManager::Lookup(size_t bytes,int type) { #ifdef ALLOCATION_CACHE - bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); - int cache = type+small; - return Lookup(bytes,Entries[cache],Ncache[cache]); + int cache; + if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2; + else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1; + else cache = type; + + return 
Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); #else return NULL; #endif } -void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) +void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) { - assert(ncache>0); #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif for(int e=0;e - #ifndef GRID_UVM #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); -#define dprintf(...) + +#define MAXLINE 512 +static char print_buffer [ MAXLINE ]; + +#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; +//#define dprintf(...) + //////////////////////////////////////////////////////////// // For caching copies of data on device @@ -22,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; //////////////////////////////////// // Priority ordering for unlocked entries @@ -103,15 +111,17 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); + DeviceDestroy++; DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + AccCache.AccPtr=(uint64_t) NULL; + dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -120,26 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::Evict(AcceleratorViewEntry &AccCache) { /////////////////////////////////////////////////////////////////////////// - // Make CPU consistent, remove from Accelerator, remove entry - // Cannot be locked. If allocated must be in LRU pool. + // Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry + // Cannot be acclocked. If allocated must be in LRU pool. + // + // Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock. + // and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen + // but there is a weakness where CpuLock entries are attempted for erase + // Take these OUT LRU queue when CPU locked? + // Cannot take out the table as cpuLock data is important. 
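+ // Net effect below: flush AccDirty data to host, free the device copy,
+ // remove the entry from the LRU queue and demote it to CpuDirty, but keep
+ // the table entry (and hence any cpuLock state) alive.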
/////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); - assert(AccCache.accLock==0); - assert(AccCache.cpuLock==0); + mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n", + (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, + (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); + assert(AccCache.accLock==0); // Cannot evict so logic bomb + assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.state==AccDirty) { Flush(AccCache); } - assert(AccCache.CpuPtr!=(uint64_t)NULL); if(AccCache.AccPtr) { AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); - DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + AccCache.AccPtr=(uint64_t)NULL; + AccCache.state=CpuDirty; // CPU primary now + DeviceBytes -=AccCache.bytes; + dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } - uint64_t CpuPtr = AccCache.CpuPtr; - EntryErase(CpuPtr); + // uint64_t CpuPtr = AccCache.CpuPtr; + DeviceEvictions++; + // EntryErase(CpuPtr); } void MemoryManager::Flush(AcceleratorViewEntry &AccCache) { @@ -149,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -164,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -190,6 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::ViewClose(void* Ptr,ViewMode mode) { if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr); } else if( (mode==CpuRead)||(mode==CpuWrite)){ CpuViewClose((uint64_t)Ptr); @@ -201,6 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis { uint64_t CpuPtr = (uint64_t)_CpuPtr; if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); } else if( (mode==CpuRead)||(mode==CpuWrite)){ return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); @@ -211,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis } void MemoryManager::EvictVictims(uint64_t bytes) { + assert(bytes DeviceMaxBytes){ if ( DeviceLRUBytes > 0){ assert(LRU.size()>0); - uint64_t victim = LRU.back(); + 
uint64_t victim = LRU.back(); // From the LRU auto AccCacheIterator = EntryLookup(victim); auto & AccCache = AccCacheIterator->second; Evict(AccCache); + } else { + return; } } } @@ -227,18 +252,25 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) { + EvictVictims(bytes); + } assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { + dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", + (uint64_t)AccCache.CpuPtr, + (uint64_t)CpuPtr, + (uint64_t)AccCache.bytes, + (uint64_t)bytes, + (uint64_t)AccCache.accLock); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -273,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // Empty + AccRead => Consistent } AccCache.accLock= 1; + dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); } else if(AccCache.state==CpuDirty ){ if(mode==AcceleratorWriteDiscard) { CpuDiscard(AccCache); @@ -285,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); } else { assert(0); } - // If view is opened on device remove from LRU + assert(AccCache.accLock>0); + // If view is opened on device must remove from LRU if(AccCache.LRU_valid==1){ // must possibly remove from LRU as now locked on GPU + dprintf("AccCache entry removed from LRU \n"); LRUremove(AccCache); } @@ -327,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) assert(AccCache.accLock>0); AccCache.accLock--; - // Move to LRU queue if not locked and close on device if(AccCache.accLock==0) { + dprintf("AcceleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); LRUinsert(AccCache); + } else { + dprintf("AcceleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); } } void MemoryManager::CpuViewClose(uint64_t CpuPtr) @@ -361,13 +398,17 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if
present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + + // CPU doesn't need to free space + // if (!AccCache.AccPtr) { + // EvictVictims(bytes); + // } + assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error @@ -419,20 +460,29 @@ void MemoryManager::NotifyDeletion(void *_ptr) } void MemoryManager::Print(void) { - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "Memory Manager " << std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl; - std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl; - std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl; - std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl; - std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl; - std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl; - std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; - std::cout << GridLogDebug << "--------------------------------------------" << std::endl; - std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<second; @@ -442,13 +492,13 @@ void MemoryManager::Print(void) if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); if ( AccCache.state==Consistent)str = std::string("Consistent"); - std::cout << GridLogDebug << "0x"<second; + LruBytes2+=AccCache.bytes; + assert(AccCache.LRU_valid==1); + assert(AccCache.LRU_entry==it); + } + std::cout << " Memory Manager::Audit() LRU queue matches table entries "<second; + + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + + CpuBytes+=AccCache.bytes; + if( AccCache.AccPtr ) AccBytes+=AccCache.bytes; + if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes; + if( AccCache.LRU_valid ) LruCnt++; + + if ( AccCache.cpuLock || AccCache.accLock ) { + assert(AccCache.LRU_valid==0); + std::cout << GridLogError << s<< "\n\t 0x"<second; + std::string str; + if ( AccCache.state==Empty ) str = std::string("Empty"); + if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty"); + if ( AccCache.state==AccDirty ) str = std::string("AccDirty"); + if ( AccCache.state==Consistent)str = std::string("Consistent"); + if ( AccCache.state==EvictNext) str = std::string("EvictNext"); + + std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "< #ifdef GRID_UVM -#warning "Grid is assuming unified virtual memory address space" NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // View management is 1:1 address space mapping @@ -13,11 +12,19 @@ uint64_t MemoryManager::HostToDeviceBytes; uint64_t 
MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; +uint64_t MemoryManager::DeviceEvictions; +uint64_t MemoryManager::DeviceDestroy; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} +void MemoryManager::PrintState(void* CpuPtr) +{ +std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl; +}; void MemoryManager::Print(void){}; +void MemoryManager::PrintAll(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; NAMESPACE_END(Grid); diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index b71981f5..092d4910 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -36,7 +36,7 @@ static const int CbBlack=1; static const int Even =CbRed; static const int Odd =CbBlack; -accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) +accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk) { int nd=rdim.size(); Coordinate coor(nd); diff --git a/Grid/communicator/Communicator_base.cc b/Grid/communicator/Communicator_base.cc index dfa4846b..79efb90c 100644 --- a/Grid/communicator/Communicator_base.cc +++ b/Grid/communicator/Communicator_base.cc @@ -33,6 +33,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); +bool Stencil_force_mpi = true; + /////////////////////////////////////////////////////////////// // Info that is setup once and indept of cartesian layout /////////////////////////////////////////////////////////////// diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index bb06d43f..ffcfe37a 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -36,6 +35,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); +extern bool Stencil_force_mpi ; + class CartesianCommunicator : public SharedMemory { public: @@ -108,6 +109,8 @@ public: //////////////////////////////////////////////////////////// // Reduction //////////////////////////////////////////////////////////// + void GlobalMax(RealD &); + void GlobalMax(RealF &); void GlobalSum(RealF &); void GlobalSumVector(RealF *,int N); void GlobalSum(RealD &); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index c6543851..280f5c4e 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); assert(ierr==0); } +void CartesianCommunicator::GlobalMax(float &f) +{ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalMax(double &d) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); + assert(ierr==0); +} void CartesianCommunicator::GlobalSum(float &f){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); @@ -360,7 +370,7 @@ double 
@@ -360,7 +370,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
   int ierr;
   int gdest = ShmRanks[dest];
   int gfrom = ShmRanks[from];
-  if ( gfrom ==MPI_UNDEFINED ) {
+  if ( gfrom ==MPI_UNDEFINED || Stencil_force_mpi ) {
     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from*32+tag,communicator_halo[commdir],&rrq);
     assert(ierr==0);
     list.push_back(rrq);
@@ ... @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-  if ( gdest == MPI_UNDEFINED ) {
+  if ( gdest == MPI_UNDEFINED || Stencil_force_mpi ) {
     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor*32+tag,communicator_halo[commdir],&xrq);
     assert(ierr==0);
     list.push_back(xrq);
     off_node_bytes+=bytes;
+  } else {
+    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+    assert(shm!=NULL);
+    //    std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
+    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
   }

-  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-    this->StencilSendToRecvFromComplete(list,dir);
-  }
+  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  //    this->StencilSendToRecvFromComplete(list,dir);
+  //  }

   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
+  //  std::cout << "Copy Synchronised\n"<<std::endl;
+  acceleratorCopySynchronise();
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -7,6 +7,7 @@
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>

     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -35,6 +36,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
+#ifdef GRID_SYCL
+#define GRID_SYCL_LEVEL_ZERO_IPC
+#endif
+
 NAMESPACE_BEGIN(Grid);
 #define header "SharedMemoryMpi: "
@@ -69,6 +74,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   WorldNodes = WorldSize/WorldShmSize;
   assert( (WorldNodes * WorldShmSize) == WorldSize );

+  // FIXME: Check all WorldShmSize are the same ?
   /////////////////////////////////////////////////////////////////////
@@ -169,6 +175,23 @@ static inline int divides(int a,int b)
 }
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
+  ////////////////////////////////////////////////////////////////
+  // Allow user to configure through environment variable
+  ////////////////////////////////////////////////////////////////
+  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
+  if ( str ) {
+    std::vector<int> IntShmDims;
+    GridCmdOptionIntVector(std::string(str),IntShmDims);
+    assert(IntShmDims.size() == WorldDims.size());
+    long ShmSize = 1;
+    for (int dim=0;dim<WorldDims.size();dim++) {
+      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
+      assert(divides(ShmDims[dim],WorldDims[dim]));
+    }
+    assert(ShmSize == WorldShmSize);
+    return;
+  }
+
@@ ... @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+#ifdef GRID_SYCL_LEVEL_ZERO_IPC
+  typedef struct { int fd; pid_t pid ; } clone_mem_t;
+
+  auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
+  auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
+
+  ze_ipc_mem_handle_t ihandle;
+  clone_mem_t handle;
+
+  if ( r==WorldShmRank ) {
+    auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
+    if ( err != ZE_RESULT_SUCCESS ) {
+      std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
@@ ... @@
   std::vector<int> ranks(size);
   for(int r=0;r<size;r++) ranks[r]=r;
diff --git a/Grid/communicator/SharedMemoryNone.cc b/Grid/communicator/SharedMemoryNone.cc
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -29,6 +29,7 @@
 #include <Grid/GridCore.h>
 #include <pwd.h>

 NAMESPACE_BEGIN(Grid);
+#define header "SharedMemoryNone: "

 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended, use anonymous mmap
 ////////////////////////////////////////////////////////////////////////////////////////////
+#if 1
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
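Illustrative use of the GRID_SHM_DIMS_<nd> override added in GetShmDims above (the concrete values are examples, not defaults); the dot-separated format follows Grid's usual coordinate options, and the product must equal the ranks-per-node count:

  // In a launch wrapper, before Grid_init: pin an 8-rank node to a 2x2x2x1 split.
  #include <cstdlib>
  setenv("GRID_SHM_DIMS_4", "2.2.2.1", 1);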
diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ ... @@ Gather_plane_extract(const Lattice<vobj> &rhs,
   int n1=rhs.Grid()->_slice_stride[dimension];

   if ( cbmask ==0x3){
-#ifdef ACCELERATOR_CSHIFT
+#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;
 	int o      = n*n1;
 	int offset = b+n*e2;
@@ -135,7 +137,9 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
     std::cout << " Dense packed buffer WARNING " <<std::endl;
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;
@@ ... @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<sobj> &pointers,int dimension,int plane,int cbmask)
   int _slice_block = rhs.Grid()->_slice_block[dimension];
 #ifdef ACCELERATOR_CSHIFT
   autoView( rhs_v , rhs, AcceleratorWrite);
-  accelerator_for2d(n,e1,b,e2,1,{
+  accelerator_for(nn,e1*e2,1,{
+      int n = nn%e1;
+      int b = nn/e1;
       int o      = n*_slice_stride;
       int offset = b+n*_slice_block;
       merge(rhs_v[so+o+b],pointers,offset);
@@ -274,7 +280,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<sobj> &pointers,int dimension,int plane,int cbmask)
     // Case of SIMD split AND checker dim cannot currently be hit, except in
     // Test_cshift_red_black code.
-    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
@@ ... @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<sobj> &pointers,int dimension,int plane,int cbmask)
   }
 }

+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+
+template <typename T>
+T iDivUp(T a, T b) // Round a / b to nearest higher integer value
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
+{
+  int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  if (idx >= e1*e2) return;
+
+  int n, b, o;
+
+  n = idx / e2;
+  b = idx % e2;
+  o = n*stride + b;
+
+  vector[2*idx + 0] = lo + o;
+  vector[2*idx + 1] = ro + o;
+}
+
+#endif
+
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -315,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int ent=0;

   if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	int o = n*stride+b;
 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
+#endif
   } else {
     for(int n=0;n<e1;n++){
@@ ... @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int ent=0;

   if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+    ent = e1*e2;
+    dim3 blockSize(acceleratorThreads());
+    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+    accelerator_barrier();
+#else
     for(int n=0;n<e1;n++){
     for(int b=0;b<e2;b++){
       int o = n*stride;
       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
     }}
+#endif
   } else {
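The populate_Cshift_table kernel above fills the shift table on device, one thread per (n,b) pair, avoiding a host-side loop plus host-to-device copy per shift. iDivUp rounds the launch grid up so all e1*e2 indices are covered, and the idx >= e1*e2 early return masks the surplus threads. A standalone check of the rounding (illustrative, host-side only):

  #include <cassert>
  template <typename T>
  T iDivUp(T a, T b) { return (a % b != 0) ? (a / b + 1) : (a / b); }

  int main() {
    assert(iDivUp(1000u, 128u) == 8u); // 8*128 = 1024 threads cover 1000 items
    assert(iDivUp(1024u, 128u) == 8u); // exact multiple needs no extra block
    return 0;
  }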
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ ... @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
   assert(shift<fd);
   assert(shift>=0);

   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf(buffer_size);
-  cshiftVector<vobj> recv_buf(buffer_size);
+  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);

   int cb= (cbmask==0x2)? Odd : Even;
   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -198,8 +198,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
   //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
   scalar_object *  recv_buf_extract_mpi;
   scalar_object *  send_buf_extract_mpi;
@@ -294,8 +294,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
   assert(shift<fd);
   assert(shift>=0);

   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf_v(buffer_size);
-  cshiftVector<vobj> recv_buf_v(buffer_size);
+  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
   vobj *send_buf;
   vobj *recv_buf;
   {
@@ -381,8 +381,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
   //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
   scalar_object *  recv_buf_extract_mpi;
   scalar_object *  send_buf_extract_mpi;
   {
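These hunks all apply one pattern: the comms buffers become function-local statics, so steady-state Cshift calls reuse a single allocation, and resize() only reallocates when a larger volume than previously seen is requested. In isolation (a sketch, with plain std::vector standing in for cshiftVector):

  #include <vector>

  void comms_step(size_t buffer_size) {
    static std::vector<double> send_buf;  // persists across calls
    send_buf.resize(buffer_size);         // no reallocation unless capacity must grow
    // ... pack, exchange, unpack via send_buf.data() ...
  }

The trade-off is that the memory is never returned, and concurrent resizing from multiple host threads would race.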
diff --git a/Grid/json/json.hpp b/Grid/json/json.hpp
index 618aa7a1..cb27e058 100644
--- a/Grid/json/json.hpp
+++ b/Grid/json/json.hpp
@@ -1,13 +1,12 @@
-#ifndef __NVCC__
 /*
     __ _____ _____ _____
  __|  |   __|     |   | |  JSON for Modern C++
-|  |  |__   |  |  | | | |  version 3.2.0
+|  |  |__   |  |  | | | |  version 3.10.5
 |_____|_____|_____|_|___|  https://github.com/nlohmann/json

 Licensed under the MIT License <http://opensource.org/licenses/MIT>.
 SPDX-License-Identifier: MIT
-Copyright (c) 2013-2018 Niels Lohmann <http://nlohmann.me>.
+Copyright (c) 2013-2022 Niels Lohmann <https://nlohmann.me>.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,588 +27,56 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

-#ifndef NLOHMANN_JSON_HPP
-#define NLOHMANN_JSON_HPP
+/****************************************************************************\
+ * Note on documentation: The source files contain links to the online     *
+ * documentation of the public API at https://json.nlohmann.me. This URL   *
+ * contains the most recent documentation and should also be applicable to *
+ * previous versions; documentation for deprecated functions is not        *
+ * removed, but marked deprecated. See "Generate documentation" section in *
+ * file doc/README.md.                                                     *
+\****************************************************************************/
+
+#ifndef INCLUDE_NLOHMANN_JSON_HPP_
+#define INCLUDE_NLOHMANN_JSON_HPP_

 #define NLOHMANN_JSON_VERSION_MAJOR 3
-#define NLOHMANN_JSON_VERSION_MINOR 2
-#define NLOHMANN_JSON_VERSION_PATCH 0
+#define NLOHMANN_JSON_VERSION_MINOR 10
+#define NLOHMANN_JSON_VERSION_PATCH 5

 #include <algorithm> // all_of, find, for_each
-#include <cassert> // assert
-#include <ciso646> // and, not, or
 #include <cstddef> // nullptr_t, ptrdiff_t, size_t
 #include <functional> // hash, less
 #include <initializer_list> // initializer_list
-#include <iosfwd> // istream, ostream
-#include <iterator> // iterator_traits, random_access_iterator_tag
+#ifndef JSON_NO_IO
+    #include <iosfwd> // istream, ostream
+#endif  // JSON_NO_IO
+#include <iterator> // random_access_iterator_tag
+#include <memory> // unique_ptr
 #include <numeric> // accumulate
 #include <string> // string, stoi, to_string
 #include <utility> // declval, forward, move, pair, swap
+#include <vector> // vector

-// #include <nlohmann/json_fwd.hpp>
-#ifndef NLOHMANN_JSON_FWD_HPP
-#define NLOHMANN_JSON_FWD_HPP
-
-#include <cstdint> // int64_t, uint64_t
-#include <map> // map
-#include <memory> // allocator
-#include <string> // string
-#include <vector> // vector
-
-/*!
-@brief namespace for Niels Lohmann
-@see https://github.com/nlohmann
-@since version 1.0.0
-*/
-namespace nlohmann
-{
-/*!
-@brief default JSONSerializer template argument
-
-This serializer ignores the template arguments and uses ADL
-([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
-for serialization.
-*/
-template<typename T = void, typename SFINAE = void>
-struct adl_serializer;
-
-template<template<typename U, typename V, typename... Args> class ObjectType =
-             std::map,
-         template<typename U, typename... Args> class ArrayType = std::vector,
-         class StringType = std::string, class BooleanType = bool,
-         class NumberIntegerType = std::int64_t,
-         class NumberUnsignedType = std::uint64_t,
-         class NumberFloatType = double,
-         template<typename U> class AllocatorType = std::allocator,
-         template<typename T, typename SFINAE = void> class JSONSerializer =
-             adl_serializer>
-class basic_json;
-
-/*!
-@brief JSON Pointer
-
-A JSON pointer defines a string syntax for identifying a specific value
-within a JSON document. It can be used with functions `at` and
-`operator[]`. Furthermore, JSON pointers are the base for JSON patches.
-
-@sa [RFC 6901](https://tools.ietf.org/html/rfc6901)
-
-@since version 2.0.0
-*/
-template<typename BasicJsonType>
-class json_pointer;
-
-/*!
-@brief default JSON class
-
-This type is the default specialization of the @ref basic_json class which
-uses the standard template types.
-
-@since version 1.0.0
-*/
-using json = basic_json<>;
-}
-
-#endif
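Grid's own use of the bundled header is unchanged by the 3.2.0 to 3.10.5 bump; for reference, a typical nlohmann::json round trip (the include path is assumed from this tree):

  #include <Grid/json/json.hpp>
  using json = nlohmann::json;

  json j;
  j["beta"] = 5.6;              // operator[] inserts or overwrites
  j["grid"] = {8, 8, 8, 8};     // initializer list becomes a JSON array
  std::string s = j.dump();     // {"beta":5.6,"grid":[8,8,8,8]}
  json k = json::parse(s);      // parse back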
-
-// #include <nlohmann/detail/macro_scope.hpp>
-
-
-// This file contains all internal macro definitions
-// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them
-
-// exclude unsupported compilers
-#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK)
-    #if defined(__clang__)
-        #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400
-            #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers"
-        #endif
-    #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER))
-        #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800
-            #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
-        #endif
-    #endif
-#endif
-
-// disable float-equal warnings on GCC/clang
-#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-
-// disable documentation warnings on clang
-#if defined(__clang__)
-    #pragma GCC diagnostic push
-    #pragma GCC diagnostic ignored "-Wdocumentation"
-#endif
-
-// allow for portable deprecation warnings
-#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
-    #define JSON_DEPRECATED __attribute__((deprecated))
-#elif defined(_MSC_VER)
-    #define JSON_DEPRECATED __declspec(deprecated)
-#else
-    #define JSON_DEPRECATED
-#endif
-
-// allow to disable exceptions
-#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION)
-    #define JSON_THROW(exception) throw exception
-    #define JSON_TRY try
-    #define JSON_CATCH(exception) catch(exception)
-    #define JSON_INTERNAL_CATCH(exception) catch(exception)
-#else
-    #define JSON_THROW(exception) std::abort()
-    #define JSON_TRY if(true)
-    #define JSON_CATCH(exception) if(false)
-    #define JSON_INTERNAL_CATCH(exception) if(false)
-#endif
-
-// override exception macros
-#if defined(JSON_THROW_USER)
-    #undef JSON_THROW
-    #define JSON_THROW JSON_THROW_USER
-#endif
-#if defined(JSON_TRY_USER)
-    #undef JSON_TRY
-    #define JSON_TRY JSON_TRY_USER
-#endif
-#if defined(JSON_CATCH_USER)
-    #undef JSON_CATCH
-    #define JSON_CATCH JSON_CATCH_USER
-    #undef JSON_INTERNAL_CATCH
-    #define JSON_INTERNAL_CATCH JSON_CATCH_USER
-#endif
-#if defined(JSON_INTERNAL_CATCH_USER)
-    #undef JSON_INTERNAL_CATCH
-    #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER
-#endif
-
-// manual branch prediction
-#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
-    #define JSON_LIKELY(x) __builtin_expect(!!(x), 1)
-    #define JSON_UNLIKELY(x) __builtin_expect(!!(x), 0)
-#else
-    #define JSON_LIKELY(x) x
-    #define JSON_UNLIKELY(x) x
-#endif
-
-// C++ language standard detection
-#if (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464
-    #define JSON_HAS_CPP_17
-    #define JSON_HAS_CPP_14
-#elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1)
-    #define JSON_HAS_CPP_14
-#endif
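The deleted JSON_LIKELY/JSON_UNLIKELY wrappers are the usual __builtin_expect idiom, worth noting since it recurs across the codebase; a standalone equivalent (macro and function names here are illustrative):

  #if defined(__GNUC__) || defined(__clang__)
    #define MY_LIKELY(x)   __builtin_expect(!!(x), 1)
    #define MY_UNLIKELY(x) __builtin_expect(!!(x), 0)
  #else
    #define MY_LIKELY(x)   (x)
    #define MY_UNLIKELY(x) (x)
  #endif

  int parse_digit(char c) {
    if (MY_UNLIKELY(c < '0' || c > '9')) return -1; // error path hinted as cold
    return c - '0';
  }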
-// Ugly macros to avoid uglier copy-paste when specializing basic_json. They
-// may be removed in the future once the class is split.
-
-#define NLOHMANN_BASIC_JSON_TPL_DECLARATION                                \
-    template<template<typename, typename, typename...> class ObjectType,  \
-             template<typename, typename...> class ArrayType,             \
-             class StringType, class BooleanType, class NumberIntegerType, \
-             class NumberUnsignedType, class NumberFloatType,              \
-             template<typename> class AllocatorType,                       \
-             template<typename, typename = void> class JSONSerializer>
-
-#define NLOHMANN_BASIC_JSON_TPL                                            \
-    basic_json<ObjectType, ArrayType, StringType, BooleanType,             \
-    NumberIntegerType, NumberUnsignedType, NumberFloatType,                \
-    AllocatorType, JSONSerializer>
-
-// #include <nlohmann/detail/meta/cpp_future.hpp>
-
-
-#include <ciso646> // not
-#include <cstddef> // size_t
-#include <type_traits> // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type
-
-namespace nlohmann
-{
-namespace detail
-{
-// alias templates to reduce boilerplate
-template<bool B, typename T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
-
-template<typename T>
-using uncvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
-
-// implementation of C++14 index_sequence and affiliates
-// source: https://stackoverflow.com/a/32223343
-template<std::size_t... Ints>
-struct index_sequence
-{
-    using type = index_sequence;
-    using value_type = std::size_t;
-    static constexpr std::size_t size() noexcept
-    {
-        return sizeof...(Ints);
-    }
-};
-
-template<class Sequence1, class Sequence2>
-struct merge_and_renumber;
-
-template<std::size_t... I1, std::size_t... I2>
-struct merge_and_renumber<index_sequence<I1...>, index_sequence<I2...>>
-    : index_sequence < I1..., (sizeof...(I1) + I2)... > {};
-
-template<std::size_t N>
-struct make_index_sequence
-    : merge_and_renumber < typename make_index_sequence < N / 2 >::type,
-      typename make_index_sequence < N - N / 2 >::type > {};
-
-template<> struct make_index_sequence<0> : index_sequence<> {};
-template<> struct make_index_sequence<1> : index_sequence<0> {};
-
-template<typename... Ts>
-using index_sequence_for = make_index_sequence<sizeof...(Ts)>;
-
-// dispatch utility (taken from ranges-v3)
-template<unsigned N> struct priority_tag : priority_tag < N - 1 > {};
-template<> struct priority_tag<0> {};
-
-// taken from ranges-v3
-template<typename T>
-struct static_const
-{
-    static constexpr T value{};
-};
-
-template<typename T>
-constexpr T static_const<T>::value;
-}
-}
-
-// #include <nlohmann/detail/meta/type_traits.hpp>
-
-
-#include <ciso646> // not
-#include <limits> // numeric_limits
-#include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type
-#include <utility> // declval
-
-// #include <nlohmann/json_fwd.hpp>
-
-// #include <nlohmann/detail/meta/cpp_future.hpp>
-
-// #include <nlohmann/detail/meta/detected.hpp>
+// #include

 #include <type_traits>
-
-// #include <nlohmann/detail/meta/void_t.hpp>
-
-
-namespace nlohmann
-{
-namespace detail
-{
-template<typename ...Ts> struct make_void
-{
-    using type = void;
-};
-template<typename ...Ts> using void_t = typename make_void<Ts...>::type;
-}
-}
-
-
-// http://en.cppreference.com/w/cpp/experimental/is_detected
-namespace nlohmann
-{
-namespace detail
-{
-struct nonesuch
-{
-    nonesuch() = delete;
-    ~nonesuch() = delete;
-    nonesuch(nonesuch const&) = delete;
-    void operator=(nonesuch const&) = delete;
-};
-
-template<class Default,
-         class AlwaysVoid,
-         template<class...> class Op,
-         class... Args>
-struct detector
-{
-    using value_t = std::false_type;
-    using type = Default;
-};
-
-template<class Default, template<class...> class Op, class... Args>
-struct detector<Default, void_t<Op<Args...>>, Op, Args...>
-{
-    using value_t = std::true_type;
-    using type = Op<Args...>;
-};
-
-template