diff --git a/README.md b/README.md
index 13dd6996..86506f52 100644
--- a/README.md
+++ b/README.md
@@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used:
 | `<code>` | Description |
 | ----------- | -------------------------------------- |
 | `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `SKL` | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
 | `BGQ` | Blue Gene/Q |
 
 #### Notes:
-- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
+- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL targets). Support for clang will appear in a future version of Grid, once the AVX512 support in that compiler is more mature.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.
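The benchmark diffs that follow replace the hard-coded Wilson dslash flop count of 1344 per site with the expression `8*QCD::Nc*(7+16*QCD::Nc)`. As a quick arithmetic check (a standalone, purely illustrative sketch; the `main` wrapper is not part of the patch):

```cpp
#include <cstdio>

int main(void) {
  // Per-site Wilson dslash flop count used by the benchmarks below:
  // 8 direction hops, each counted as Nc*(7+16*Nc) floating point ops.
  const long unsigned int Nc = 3; // QCD::Nc in Grid
  const long unsigned int single_site_flops = 8 * Nc * (7 + 16 * Nc);

  // For Nc=3: 8*3*(7+48) = 24*55 = 1320, slightly fewer flops per site
  // than the old hard-coded 1344, so reported GFlop/s drop accordingly.
  std::printf("single_site_flops = %lu\n", single_site_flops);
  return 0;
}
```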
diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 73621bbe..1d9de772 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -48,7 +48,6 @@ int main (int argc, char ** argv)
   int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
   std::vector<int> latt4 = GridDefaultLatt();
   int Ls=16;
@@ -57,6 +56,10 @@ int main (int argc, char ** argv)
     std::stringstream ss(argv[i+1]); ss >> Ls;
   }
+  GridLogLayout();
+
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+
   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -187,7 +190,7 @@ int main (int argc, char ** argv)
   FGrid->Barrier();
   double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads,int report )
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
 
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
@@ -196,7 +198,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
   if ( ! report ) { double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads,int report ) if(!report){ double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads, int report ) #define CHECK_SDW
 void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 {
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
 
   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -321,7 +324,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
     Counter.Report();
   } else {
     double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads, int report ) CounterSdw.Report(); } else { double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;mu Author: paboyle
@@ -32,6 +32,9 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
+
+#include "Grid/util/Profiling.h"
+
 template<class d>
 struct scal {
   d internal;
@@ -45,6 +48,7 @@ struct scal {
 };
 
 bool overlapComms = false;
+bool perfProfiling = false;
 
 int main (int argc, char ** argv)
 {
@@ -53,6 +57,12 @@ int main (int argc, char ** argv)
   if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
     overlapComms = true;
   }
+  if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
+    perfProfiling = true;
+  }
+
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
+
   std::vector<int> latt_size = GridDefaultLatt();
   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@@ -61,10 +71,15 @@ int main (int argc, char ** argv)
   GridRedBlackCartesian RBGrid(&Grid);
 
   int threads = GridThread::GetThreads();
-  std::cout< seeds({1,2,3,4});
   GridParallelRNG pRNG(&Grid);
@@ -134,9 +149,25 @@ int main (int argc, char ** argv)
       Dw.Dhop(src,result,0);
     }
     double t1=usecond();
-    double flops=1344*volume*ncall;
+    double flops=single_site_flops*volume*ncall;
+
+    if (perfProfiling){
+      std::cout<()); WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
-
+
+  // Full operator
+  bench_wilson(src,result,Dw,volume,DaggerNo);
+  bench_wilson(src,result,Dw,volume,DaggerYes);
+  std::cout << "\t";
+  // EO
   bench_wilson(src,result,Dw,volume,DaggerNo);
   bench_wilson(src,result,Dw,volume,DaggerYes);
   std::cout << std::endl;
@@ -122,9 +132,26 @@ void bench_wilson (
   int const dag )
 {
   int ncall = 1000;
+  long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
   double t0 = usecond();
   for(int i=0; i TFundtoHirep::getOutput(void)
 template <typename Rep>
 void TFundtoHirep<Rep>::setup(void)
 {
-    env().template registerLattice<typename Rep::LatticeField>(getName());
+    envCreateLat(typename Rep::LatticeField, getName());
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -70,6 +70,6 @@ void TFundtoHirep<Rep>::execute(void)
     Rep TargetRepresentation(U._grid);
     TargetRepresentation.update_representation(U);
 
-    typename Rep::LatticeField &URep = *env().template createLattice<typename Rep::LatticeField>(getName());
+    auto &URep = envGet(typename Rep::LatticeField, getName());
     URep = TargetRepresentation.U;
 }
diff --git a/lib/algorithms/Algorithms.h b/lib/algorithms/Algorithms.h
index 070a1019..ef147c53 100644
--- a/lib/algorithms/Algorithms.h
+++ b/lib/algorithms/Algorithms.h
@@ -39,6 +39,7 @@ Author: Peter Boyle
 #include
 #include
+#include <Grid/algorithms/iterative/Deflation.h>
 #include
 #include
 #include
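The new Deflation.h that follows provides initial-guess functors for the red-black solvers. The core of `DeflatedGuesser` is the standard deflation formula: given eigenpairs $(\lambda_i, v_i)$ of the Hermitian operator $M$, the `axpy` accumulation in the code computes

$$
\mathrm{guess} \;=\; \sum_{i=0}^{N-1} \frac{\langle v_i,\,\mathrm{src}\rangle}{\lambda_i}\, v_i
\;=\; \Big(\sum_{i} \frac{v_i\, v_i^{\dagger}}{\lambda_i}\Big)\,\mathrm{src},
$$

i.e. the action of $M^{-1}$ restricted to the deflation subspace, so the Krylov solver that receives this guess only has to resolve the component of the solution orthogonal to that subspace. `LocalCoherenceDeflatedGuesser` applies the same formula with coarse-grid eigenvectors, sandwiched between `blockProject` and `blockPromote`.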
diff --git a/lib/algorithms/iterative/Deflation.h b/lib/algorithms/iterative/Deflation.h
new file mode 100644
index 00000000..b6aa0d3d
--- /dev/null
+++ b/lib/algorithms/iterative/Deflation.h
@@ -0,0 +1,101 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/algorithms/iterative/Deflation.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_DEFLATION_H
+#define GRID_DEFLATION_H
+
+namespace Grid {
+
+struct ZeroGuesser {
+public:
+  template<class Field>
+  void operator()(const Field &src,Field &guess) { guess = zero; };
+};
+struct SourceGuesser {
+public:
+  template<class Field>
+  void operator()(const Field &src,Field &guess) { guess = src; };
+};
+
+////////////////////////////////
+// Fine grid deflation
+////////////////////////////////
+template<class Field>
+struct DeflatedGuesser {
+private:
+  const std::vector<Field> &evec;
+  const std::vector<RealD> &eval;
+
+public:
+
+  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
+
+  void operator()(const Field &src,Field &guess) {
+    guess = zero;
+    assert(evec.size()==eval.size());
+    auto N = evec.size();
+    for (int i=0;i<N;i++) {
+      const Field& tmp = evec[i];
+      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
+    }
+    guess.checkerboard = src.checkerboard;
+  };
+};
+
+template<class FineField, class CoarseField>
+class LocalCoherenceDeflatedGuesser {
+private:
+  const std::vector<FineField>   &subspace;
+  const std::vector<CoarseField> &evec_coarse;
+  const std::vector<RealD>       &eval_coarse;
+public:
+
+  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
+                                const std::vector<CoarseField> &_evec_coarse,
+                                const std::vector<RealD>       &_eval_coarse)
+    : subspace(_subspace),
+      evec_coarse(_evec_coarse),
+      eval_coarse(_eval_coarse)
+  {
+  }
+
+  void operator()(const FineField &src,FineField &guess) {
+    int N = (int)evec_coarse.size();
+    CoarseField src_coarse(evec_coarse[0]._grid);
+    CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
+    blockProject(src,src_coarse,subspace);
+    for (int i=0;i<N;i++) {
+      const CoarseField &tmp = evec_coarse[i];
+      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
+    }
+    blockPromote(guess_coarse,guess,subspace);
+  };
+};
+
+}
+#endif
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
 & _v,std::vector& sort_vals, boo
   basisReorderInPlace(_v,sort_vals,idx);
 }
-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template -void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { - result = zero; - assert(_v.size()==eval.size()); - int N = (int)_v.size(); - for (int i=0;i class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester { public: + LinearFunction &_HermOp; ImplicitlyRestartedLanczosHermOpTester(LinearFunction &HermOp) : _HermOp(HermOp) { }; int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) @@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos { ///////////////////////// public: + ////////////////////////////////////////////////////////////////// // PAB: ////////////////////////////////////////////////////////////////// diff --git a/lib/algorithms/iterative/LocalCoherenceLanczos.h b/lib/algorithms/iterative/LocalCoherenceLanczos.h index d5d1bbc2..b8348c0c 100644 --- a/lib/algorithms/iterative/LocalCoherenceLanczos.h +++ b/lib/algorithms/iterative/LocalCoherenceLanczos.h @@ -28,7 +28,10 @@ Author: paboyle /* END LEGAL */ #ifndef GRID_LOCAL_COHERENCE_IRL_H #define GRID_LOCAL_COHERENCE_IRL_H + namespace Grid { + + struct LanczosParams : Serializable { public: GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, @@ -70,21 +73,24 @@ public: typedef Lattice FineField; LinearOperatorBase &_Linop; - Aggregation &_Aggregate; + std::vector &subspace; - ProjectedHermOp(LinearOperatorBase& linop, Aggregation &aggregate) : - _Linop(linop), - _Aggregate(aggregate) { }; + ProjectedHermOp(LinearOperatorBase& linop, std::vector & _subspace) : + _Linop(linop), subspace(_subspace) + { + assert(subspace.size() >0); + }; void operator()(const CoarseField& in, CoarseField& out) { + GridBase *FineGrid = subspace[0]._grid; + int checkerboard = subspace[0].checkerboard; + + FineField fin (FineGrid); fin.checkerboard= checkerboard; + FineField fout(FineGrid); fout.checkerboard = checkerboard; - GridBase *FineGrid = _Aggregate.FineGrid; - FineField fin(FineGrid); - FineField fout(FineGrid); - - _Aggregate.PromoteFromSubspace(in,fin); std::cout< & _poly; LinearOperatorBase &_Linop; - Aggregation &_Aggregate; + std::vector &subspace; - ProjectedFunctionHermOp(OperatorFunction & poly,LinearOperatorBase& linop, - Aggregation &aggregate) : + ProjectedFunctionHermOp(OperatorFunction & poly, + LinearOperatorBase& linop, + std::vector & _subspace) : _poly(poly), _Linop(linop), - _Aggregate(aggregate) { }; + subspace(_subspace) + { }; void operator()(const CoarseField& in, CoarseField& out) { - - GridBase *FineGrid = _Aggregate.FineGrid; - - FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard; - FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard; - _Aggregate.PromoteFromSubspace(in,fin); std::cout< & _Poly; OperatorFunction & _smoother; LinearOperatorBase &_Linop; - Aggregation &_Aggregate; - RealD _coarse_relax_tol; + RealD _coarse_relax_tol; + std::vector &_subspace; + ImplicitlyRestartedLanczosSmoothedTester(LinearFunction &Poly, OperatorFunction &smoother, LinearOperatorBase &Linop, - Aggregation &Aggregate, + std::vector &subspace, RealD coarse_relax_tol=5.0e3) - : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { }; + : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace), + _coarse_relax_tol(coarse_relax_tol) + { }; int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) { CoarseField v(B); RealD eval_poly = eval; + // Apply operator _Poly(B,v); @@ -168,14 +181,13 @@ class 
ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
   }
   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
-    GridBase *FineGrid = _Aggregate.FineGrid;
-
-    int checkerboard   = _Aggregate.checkerboard;
-
+    GridBase *FineGrid = _subspace[0]._grid;
+    int checkerboard   = _subspace[0].checkerboard;
     FineField fB(FineGrid);fB.checkerboard =checkerboard;
     FineField fv(FineGrid);fv.checkerboard =checkerboard;
-    _Aggregate.PromoteFromSubspace(B,fv);
+    blockPromote(B,fv,_subspace);
+
     _smoother(_Linop,fv,fB);
 
     RealD eval_poly = eval;
@@ -217,27 +229,65 @@ protected:
   int _checkerboard;
   LinearOperatorBase<FineField> & _FineOp;
 
-  // FIXME replace Aggregation with vector of fine; the code reuse is too small for
-  // the hassle and complexity of cross coupling.
-  Aggregation<Fobj,CComplex,nbasis> _Aggregate;
-  std::vector<RealD>       evals_fine;
-  std::vector<RealD>       evals_coarse;
-  std::vector<CoarseField> evec_coarse;
+  std::vector<RealD>       &evals_fine;
+  std::vector<RealD>       &evals_coarse;
+  std::vector<FineField>   &subspace;
+  std::vector<CoarseField> &evec_coarse;
+
+private:
+  std::vector<RealD>       _evals_fine;
+  std::vector<RealD>       _evals_coarse;
+  std::vector<FineField>   _subspace;
+  std::vector<CoarseField> _evec_coarse;
+
 public:
+
   LocalCoherenceLanczos(GridBase *FineGrid,
-                        GridBase *CoarseGrid,
-                        LinearOperatorBase<FineField> &FineOp,
-                        int checkerboard) :
+                        GridBase *CoarseGrid,
+                        LinearOperatorBase<FineField> &FineOp,
+                        int checkerboard) :
     _CoarseGrid(CoarseGrid),
     _FineGrid(FineGrid),
-    _Aggregate(CoarseGrid,FineGrid,checkerboard),
     _FineOp(FineOp),
-    _checkerboard(checkerboard)
+    _checkerboard(checkerboard),
+    evals_fine  (_evals_fine),
+    evals_coarse(_evals_coarse),
+    subspace    (_subspace),
+    evec_coarse (_evec_coarse)
   {
     evals_fine.resize(0);
     evals_coarse.resize(0);
   };
-  void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
+  //////////////////////////////////////////////////////////////////////////
+  // Alternate constructor, external storage for use by the Hadrons module
+  //////////////////////////////////////////////////////////////////////////
+  LocalCoherenceLanczos(GridBase *FineGrid,
+                        GridBase *CoarseGrid,
+                        LinearOperatorBase<FineField> &FineOp,
+                        int checkerboard,
+                        std::vector<FineField>   &ext_subspace,
+                        std::vector<CoarseField> &ext_coarse,
+                        std::vector<RealD>       &ext_eval_fine,
+                        std::vector<RealD>       &ext_eval_coarse
+                        ) :
+    _CoarseGrid(CoarseGrid),
+    _FineGrid(FineGrid),
+    _FineOp(FineOp),
+    _checkerboard(checkerboard),
+    evals_fine  (ext_eval_fine),
+    evals_coarse(ext_eval_coarse),
+    subspace    (ext_subspace),
+    evec_coarse (ext_coarse)
+  {
+    evals_fine.resize(0);
+    evals_coarse.resize(0);
+  };
+
+  void Orthogonalise(void ) {
+    CoarseScalar InnerProd(_CoarseGrid);
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gram-Schmidt pass 1"<<std::endl;
+    blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gram-Schmidt pass 2"<<std::endl;
+  };
 template<typename T> static RealD normalise(T& v)
 {
@@ -246,43 +296,44 @@ public:
     v = v * (1.0/nn);
     return nn;
   }
-
+  /*
   void fakeFine(void)
   {
     int Nk = nbasis;
-    _Aggregate.subspace.resize(Nk,_FineGrid);
-    _Aggregate.subspace[0]=1.0;
-    _Aggregate.subspace[0].checkerboard=_checkerboard;
-    normalise(_Aggregate.subspace[0]);
+    subspace.resize(Nk,_FineGrid);
+    subspace[0]=1.0;
+    subspace[0].checkerboard=_checkerboard;
+    normalise(subspace[0]);
     PlainHermOp<FineField> Op(_FineOp);
     for(int k=1;k Op(_FineOp);
     ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
     for(int k=0;k ChebySmooth(cheby_smooth);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
     for(int k=0;k Op(_FineOp);
     evals_fine.resize(Nm);
-    _Aggregate.subspace.resize(Nm,_FineGrid);
+    subspace.resize(Nm,_FineGrid);
     ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
     FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
     int Nconv;
-    IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
+    IRL.calc(evals_fine,subspace,src,Nconv,false);
 
     // Shrink down to number saved
     assert(Nstop>=nbasis);
     assert(Nconv>=nbasis);
     evals_fine.resize(nbasis);
-    _Aggregate.subspace.resize(nbasis,_FineGrid);
+    subspace.resize(nbasis,_FineGrid);
   }
   void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
                   int Nstop, int Nk, int Nm,RealD resid,
                   RealD MaxIt, RealD betastp, int MinRes)
   {
     Chebyshev<FineField> Cheby(cheby_op);
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate);
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
     Chebyshev<FineField> ChebySmooth(cheby_smooth);
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
 
     evals_coarse.resize(Nm);
     evec_coarse.resize(Nm,_CoarseGrid);
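The SchurRedBlack changes below add a second `operator()` overload that accepts a guesser for the initial odd-checkerboard solution; the old two-field interface forwards to it with a `ZeroGuesser`. A minimal usage sketch, not part of the patch, assuming a `DomainWallFermionR` action `Ddwf`, fields `src` and `psi` on its grids, and `eval`/`evec` already filled by the Lanczos above (with the eigenvectors living on the odd checkerboard of the preconditioned operator):

```cpp
// Plain red-black CG solve, as before: the two-field overload
// silently constructs a ZeroGuesser internally.
ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
SchurSolver(Ddwf, src, psi);

// Deflated solve: seed the odd-checkerboard solve with the
// eigenspace guess from Deflation.h instead of starting from zero.
DeflatedGuesser<LatticeFermion> guesser(evec, eval);
SchurSolver(Ddwf, src, psi, guesser);
```

Because the guesser is only a callable taking `(src, guess)`, any of the `ZeroGuesser`, `SourceGuesser`, `DeflatedGuesser`, or `LocalCoherenceDeflatedGuesser` classes above can be dropped in without changing the solver.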
diff --git a/lib/algorithms/iterative/SchurRedBlack.h b/lib/algorithms/iterative/SchurRedBlack.h
index 5f5a8b66..fac2030f 100644
--- a/lib/algorithms/iterative/SchurRedBlack.h
+++ b/lib/algorithms/iterative/SchurRedBlack.h
@@ -107,7 +112,12 @@ namespace Grid {
     };
     template<class Matrix>
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
 
       // FIXME CGdiagonalMee not implemented virtual function
       // FIXME use CBfactorise to control schur decomp
@@ -129,7 +134,6 @@ namespace Grid {
       pickCheckerboard(Odd ,src_o,in);
       pickCheckerboard(Even,sol_e,out);
       pickCheckerboard(Odd ,sol_o,out);
-      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
 
       // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -225,6 +235,7 @@ namespace Grid {
      // Call the red-black solver
      //////////////////////////////////////////////////////////////
      std::cout<
-    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+    void operator() (Matrix & _Matrix,const Field &in, Field &out){
+      ZeroGuesser<Field> guess;
+      (*this)(_Matrix,in,out,guess);
+    }
+    template<class Matrix, class Guesser>
+    void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
 
      // FIXME CGdiagonalMee not implemented virtual function
      // FIXME use CBfactorise to control schur decomp
@@ -305,6 +321,7 @@
      //////////////////////////////////////////////////////////////
std::cout< - void operator() (Matrix & _Matrix,const Field &in, Field &out){ + void operator() (Matrix & _Matrix,const Field &in, Field &out){ + ZeroGuesser guess; + (*this)(_Matrix,in,out,guess); + } + template + void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){ // FIXME CGdiagonalMee not implemented virtual function // FIXME use CBfactorise to control schur decomp @@ -385,6 +407,7 @@ namespace Grid { std::cout< 1 && provided != MPI_THREAD_MULTIPLE) ) + assert(0); } Grid_quiesce_nodes(); + // Never clean up as done once. MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); GlobalSharedMemory::Init(communicator_world); @@ -85,9 +89,17 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &c CartesianCommunicator::CartesianCommunicator(const std::vector &processors) { MPI_Comm optimal_comm; - GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm); // Remap using the shared memory optimising routine + //////////////////////////////////////////////////// + // Remap using the shared memory optimising routine + // The remap creates a comm which must be freed + //////////////////////////////////////////////////// + GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm); InitFromMPICommunicator(processors,optimal_comm); SetCommunicator(optimal_comm); + /////////////////////////////////////////////////// + // Free the temp communicator + /////////////////////////////////////////////////// + MPI_Comm_free(&optimal_comm); } ////////////////////////////////// @@ -183,8 +195,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors, } else { srank = 0; - comm_split = parent.communicator; - // std::cout << " Inherited communicator " < &processors, // Take the right SHM buffers ////////////////////////////////////////////////////////////////////////////////////////////////////// SetCommunicator(comm_split); + + /////////////////////////////////////////////// + // Free the temp communicator + /////////////////////////////////////////////// + MPI_Comm_free(&comm_split); if(0){ std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; @@ -210,6 +227,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector &processors, void CartesianCommunicator::InitFromMPICommunicator(const std::vector &processors, MPI_Comm communicator_base) { + //////////////////////////////////////////////////// + // Creates communicator, and the communicator_halo + //////////////////////////////////////////////////// _ndimension = processors.size(); _processor_coor.resize(_ndimension); diff --git a/lib/communicator/SharedMemory.h b/lib/communicator/SharedMemory.h index 0f647dc6..9f6b1a25 100644 --- a/lib/communicator/SharedMemory.h +++ b/lib/communicator/SharedMemory.h @@ -133,6 +133,7 @@ class SharedMemory public: SharedMemory() {}; + ~SharedMemory(); /////////////////////////////////////////////////////////////////////////////////////// // set the buffers & sizes /////////////////////////////////////////////////////////////////////////////////////// diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc index d7bd7c65..9e5d8f15 100644 --- a/lib/communicator/SharedMemoryMPI.cc +++ b/lib/communicator/SharedMemoryMPI.cc @@ -182,6 +182,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector &processors, #ifdef GRID_MPI3_SHMMMAP void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { + std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "< 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +#ifndef GRID_PERF_PROFILING_H +#define GRID_PERF_PROFILING_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct System +{ + static void profile(const std::string& name,std::function body) { + std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name; + + // Launch profiler + pid_t pid; + std::stringstream s; + s << getpid(); + pid = fork(); + if (pid == 0) { + auto fd=open("/dev/null",O_RDWR); + dup2(fd,1); + dup2(fd,2); + exit(execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",s.str().c_str(),nullptr)); + } + + // Run body + body(); + + // Kill profiler + kill(pid,SIGINT); + waitpid(pid,nullptr,0); + } + + static void profile(std::function body) { + profile("perf.data",body); + } +}; + +#endif // GRID_PERF_PROFILING_H \ No newline at end of file diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc index c6532a0d..f57823e5 100644 --- a/tests/debug/Test_cayley_coarsen_support.cc +++ b/tests/debug/Test_cayley_coarsen_support.cc @@ -111,6 +111,7 @@ int main (int argc, char ** argv) std::cout< subspace(nbasis,FGrid); @@ -119,7 +120,7 @@ int main (int argc, char ** argv) MdagMLinearOperator HermDefOp(Ddwf); typedef Aggregation Subspace; - Subspace Aggregates(Coarse5d,FGrid); + Subspace Aggregates(Coarse5d,FGrid,cb); Aggregates.CreateSubspaceRandom(RNG5); subspace=Aggregates.subspace; diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc index cbefdd46..c6005fd0 100644 --- a/tests/debug/Test_cayley_ldop_cr.cc +++ b/tests/debug/Test_cayley_ldop_cr.cc @@ -78,6 +78,7 @@ int main (int argc, char ** argv) RealD mass=0.1; RealD M5=1.5; + int cb=0; std::cout< HermDefOp(Ddwf); - Subspace Aggregates(Coarse5d,FGrid); + Subspace Aggregates(Coarse5d,FGrid,cb); Aggregates.CreateSubspace(RNG5,HermDefOp); diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc index 4c702a33..3dff4b90 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc @@ -56,12 +56,12 @@ public: void checkpointFine(std::string evecs_file,std::string evals_file) { - assert(this->_Aggregate.subspace.size()==nbasis); + assert(this->subspace.size()==nbasis); emptyUserRecord record; Grid::QCD::ScidacWriter WR; WR.open(evecs_file); for(int k=0;k_Aggregate.subspace[k],record); + WR.writeScidacFieldRecord(this->subspace[k],record); } WR.close(); @@ -72,7 +72,7 @@ public: void checkpointFineRestore(std::string evecs_file,std::string evals_file) { 
this->evals_fine.resize(nbasis); - this->_Aggregate.subspace.resize(nbasis,this->_FineGrid); + this->subspace.resize(nbasis,this->_FineGrid); std::cout << GridLogIRL<< "checkpointFineRestore: Reading evals from "<_Aggregate.subspace[k].checkerboard=this->_checkerboard; - RD.readScidacFieldRecord(this->_Aggregate.subspace[k],record); + this->subspace[k].checkerboard=this->_checkerboard; + RD.readScidacFieldRecord(this->subspace[k],record); } RD.close(); @@ -221,7 +221,9 @@ int main (int argc, char ** argv) { std::cout << GridLogIRL<<"Checkpointing Fine evecs"<
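Finally, a short sketch of how the `Grid/util/Profiling.h` helper added above is meant to be driven, mirroring the `--perf` path in Benchmark_wilson.cc. The `wilson_dhop` name and the wrapper function are illustrative only; the helper assumes a Linux host with `perf` installed at `/usr/bin/perf`, which `System::profile` forks and attaches to the current process:

```cpp
#include "Grid/util/Profiling.h"

// Profile only the Dhop loop. Samples land in wilson_dhop.data because
// System::profile appends ".data" when the name does not already end in it.
void profiled_dhop(WilsonFermionR &Dw, LatticeFermion &src,
                   LatticeFermion &result, int ncall) {
  System::profile("wilson_dhop", [&]() {
    for (int i = 0; i < ncall; i++) {
      Dw.Dhop(src, result, 0);
    }
  });
  // Inspect afterwards with: perf report -i wilson_dhop.data
}
```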