mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-17 23:37:06 +01:00
Merge branch 'develop' of https://github.com/paboyle/Grid into merge
This commit is contained in:
@ -1,28 +1,18 @@
|
||||
extra_sources=
|
||||
extra_headers=
|
||||
if BUILD_COMMS_MPI
|
||||
extra_sources+=communicator/Communicator_mpi.cc
|
||||
extra_sources+=communicator/Communicator_base.cc
|
||||
endif
|
||||
|
||||
if BUILD_COMMS_MPI3
|
||||
extra_sources+=communicator/Communicator_mpi3.cc
|
||||
extra_sources+=communicator/Communicator_base.cc
|
||||
endif
|
||||
|
||||
if BUILD_COMMS_MPIT
|
||||
extra_sources+=communicator/Communicator_mpit.cc
|
||||
extra_sources+=communicator/Communicator_base.cc
|
||||
endif
|
||||
|
||||
if BUILD_COMMS_SHMEM
|
||||
extra_sources+=communicator/Communicator_shmem.cc
|
||||
extra_sources+=communicator/Communicator_base.cc
|
||||
extra_sources+=communicator/SharedMemoryMPI.cc
|
||||
extra_sources+=communicator/SharedMemory.cc
|
||||
endif
|
||||
|
||||
if BUILD_COMMS_NONE
|
||||
extra_sources+=communicator/Communicator_none.cc
|
||||
extra_sources+=communicator/Communicator_base.cc
|
||||
extra_sources+=communicator/SharedMemoryNone.cc
|
||||
extra_sources+=communicator/SharedMemory.cc
|
||||
endif
|
||||
|
||||
if BUILD_HDF5
|
||||
|
@ -39,10 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/algorithms/approx/MultiShiftFunction.h>
|
||||
#include <Grid/algorithms/approx/Forecast.h>
|
||||
|
||||
#include <Grid/algorithms/densematrix/DenseMatrix.h>
|
||||
#include <Grid/algorithms/densematrix/Francis.h>
|
||||
#include <Grid/algorithms/densematrix/Householder.h>
|
||||
|
||||
#include <Grid/algorithms/iterative/Deflation.h>
|
||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
||||
#include <Grid/algorithms/iterative/NormalEquations.h>
|
||||
|
@ -183,11 +183,13 @@ namespace Grid {
|
||||
virtual RealD Mpc (const Field &in, Field &out) =0;
|
||||
virtual RealD MpcDag (const Field &in, Field &out) =0;
|
||||
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
|
||||
Field tmp(in._grid);
|
||||
Field tmp(in._grid);
|
||||
tmp.checkerboard = in.checkerboard;
|
||||
ni=Mpc(in,tmp);
|
||||
no=MpcDag(tmp,out);
|
||||
}
|
||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||
out.checkerboard = in.checkerboard;
|
||||
MpcDagMpc(in,out,n1,n2);
|
||||
}
|
||||
virtual void HermOp(const Field &in, Field &out){
|
||||
@ -215,13 +217,15 @@ namespace Grid {
|
||||
public:
|
||||
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
|
||||
virtual RealD Mpc (const Field &in, Field &out) {
|
||||
Field tmp(in._grid);
|
||||
// std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
|
||||
Field tmp(in._grid);
|
||||
tmp.checkerboard = !in.checkerboard;
|
||||
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
|
||||
|
||||
_Mat.Meooe(in,tmp);
|
||||
_Mat.MooeeInv(tmp,out);
|
||||
_Mat.Meooe(out,tmp);
|
||||
|
||||
//std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl;
|
||||
_Mat.Mooee(in,out);
|
||||
return axpy_norm(out,-1.0,tmp,out);
|
||||
}
|
||||
|
101
lib/algorithms/iterative/Deflation.h
Normal file
101
lib/algorithms/iterative/Deflation.h
Normal file
@ -0,0 +1,101 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_DEFLATION_H
|
||||
#define GRID_DEFLATION_H
|
||||
|
||||
namespace Grid {
|
||||
|
||||
struct ZeroGuesser {
|
||||
public:
|
||||
template<class Field>
|
||||
void operator()(const Field &src,Field &guess) { guess = Zero(); };
|
||||
};
|
||||
struct SourceGuesser {
|
||||
public:
|
||||
template<class Field>
|
||||
void operator()(const Field &src,Field &guess) { guess = src; };
|
||||
};
|
||||
|
||||
////////////////////////////////
|
||||
// Fine grid deflation
|
||||
////////////////////////////////
|
||||
template<class Field>
|
||||
struct DeflatedGuesser {
|
||||
private:
|
||||
const std::vector<Field> &evec;
|
||||
const std::vector<RealD> &eval;
|
||||
|
||||
public:
|
||||
|
||||
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
|
||||
|
||||
void operator()(const Field &src,Field &guess) {
|
||||
guess = zero;
|
||||
assert(evec.size()==eval.size());
|
||||
auto N = evec.size();
|
||||
for (int i=0;i<N;i++) {
|
||||
const Field& tmp = evec[i];
|
||||
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class FineField, class CoarseField>
|
||||
class LocalCoherenceDeflatedGuesser {
|
||||
private:
|
||||
const std::vector<FineField> &subspace;
|
||||
const std::vector<CoarseField> &evec_coarse;
|
||||
const std::vector<RealD> &eval_coarse;
|
||||
public:
|
||||
|
||||
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
|
||||
const std::vector<CoarseField> &_evec_coarse,
|
||||
const std::vector<RealD> &_eval_coarse)
|
||||
: subspace(_subspace),
|
||||
evec_coarse(_evec_coarse),
|
||||
eval_coarse(_eval_coarse)
|
||||
{
|
||||
}
|
||||
|
||||
void operator()(const FineField &src,FineField &guess) {
|
||||
int N = (int)evec_coarse.size();
|
||||
CoarseField src_coarse(evec_coarse[0]._grid);
|
||||
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
|
||||
blockProject(src_coarse,src,subspace);
|
||||
for (int i=0;i<N;i++) {
|
||||
const CoarseField & tmp = evec_coarse[i];
|
||||
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
|
||||
}
|
||||
blockPromote(guess_coarse,guess,subspace);
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
}
|
||||
#endif
|
@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
|
||||
basisReorderInPlace(_v,sort_vals,idx);
|
||||
}
|
||||
|
||||
// PAB: faster to compute the inner products first then fuse loops.
|
||||
// If performance critical can improve.
|
||||
template<class Field>
|
||||
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
|
||||
result = zero;
|
||||
assert(_v.size()==eval.size());
|
||||
int N = (int)_v.size();
|
||||
for (int i=0;i<N;i++) {
|
||||
Field& tmp = _v[i];
|
||||
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// Implicitly restarted lanczos
|
||||
/////////////////////////////////////////////////////////////
|
||||
@ -181,6 +168,7 @@ enum IRLdiagonalisation {
|
||||
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
|
||||
{
|
||||
public:
|
||||
|
||||
LinearFunction<Field> &_HermOp;
|
||||
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
|
||||
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
|
||||
@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos {
|
||||
/////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// PAB:
|
||||
//////////////////////////////////////////////////////////////////
|
||||
|
@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_LOCAL_COHERENCE_IRL_H
|
||||
#define GRID_LOCAL_COHERENCE_IRL_H
|
||||
|
||||
namespace Grid {
|
||||
|
||||
|
||||
struct LanczosParams : Serializable {
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
|
||||
@ -70,21 +73,24 @@ public:
|
||||
typedef Lattice<Fobj> FineField;
|
||||
|
||||
LinearOperatorBase<FineField> &_Linop;
|
||||
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
|
||||
std::vector<FineField> &subspace;
|
||||
|
||||
ProjectedHermOp(LinearOperatorBase<FineField>& linop, Aggregation<Fobj,CComplex,nbasis> &aggregate) :
|
||||
_Linop(linop),
|
||||
_Aggregate(aggregate) { };
|
||||
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
|
||||
_Linop(linop), subspace(_subspace)
|
||||
{
|
||||
assert(subspace.size() >0);
|
||||
};
|
||||
|
||||
void operator()(const CoarseField& in, CoarseField& out) {
|
||||
GridBase *FineGrid = subspace[0]._grid;
|
||||
int checkerboard = subspace[0].checkerboard;
|
||||
|
||||
FineField fin (FineGrid); fin.checkerboard= checkerboard;
|
||||
FineField fout(FineGrid); fout.checkerboard = checkerboard;
|
||||
|
||||
GridBase *FineGrid = _Aggregate.FineGrid;
|
||||
FineField fin(FineGrid);
|
||||
FineField fout(FineGrid);
|
||||
|
||||
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
|
||||
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
|
||||
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
|
||||
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
|
||||
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
|
||||
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
@ -99,24 +105,27 @@ public:
|
||||
|
||||
OperatorFunction<FineField> & _poly;
|
||||
LinearOperatorBase<FineField> &_Linop;
|
||||
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
|
||||
std::vector<FineField> &subspace;
|
||||
|
||||
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> &aggregate) :
|
||||
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
|
||||
LinearOperatorBase<FineField>& linop,
|
||||
std::vector<FineField> & _subspace) :
|
||||
_poly(poly),
|
||||
_Linop(linop),
|
||||
_Aggregate(aggregate) { };
|
||||
subspace(_subspace)
|
||||
{ };
|
||||
|
||||
void operator()(const CoarseField& in, CoarseField& out) {
|
||||
|
||||
GridBase *FineGrid = _Aggregate.FineGrid;
|
||||
|
||||
FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard;
|
||||
FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
|
||||
|
||||
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
|
||||
GridBase *FineGrid = subspace[0]._grid;
|
||||
int checkerboard = subspace[0].checkerboard;
|
||||
|
||||
FineField fin (FineGrid); fin.checkerboard =checkerboard;
|
||||
FineField fout(FineGrid);fout.checkerboard =checkerboard;
|
||||
|
||||
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
|
||||
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
|
||||
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
|
||||
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
@ -132,19 +141,23 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
|
||||
LinearFunction<CoarseField> & _Poly;
|
||||
OperatorFunction<FineField> & _smoother;
|
||||
LinearOperatorBase<FineField> &_Linop;
|
||||
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
|
||||
RealD _coarse_relax_tol;
|
||||
RealD _coarse_relax_tol;
|
||||
std::vector<FineField> &_subspace;
|
||||
|
||||
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
|
||||
OperatorFunction<FineField> &smoother,
|
||||
LinearOperatorBase<FineField> &Linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> &Aggregate,
|
||||
std::vector<FineField> &subspace,
|
||||
RealD coarse_relax_tol=5.0e3)
|
||||
: _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { };
|
||||
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
|
||||
_coarse_relax_tol(coarse_relax_tol)
|
||||
{ };
|
||||
|
||||
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
|
||||
{
|
||||
CoarseField v(B);
|
||||
RealD eval_poly = eval;
|
||||
|
||||
// Apply operator
|
||||
_Poly(B,v);
|
||||
|
||||
@ -168,14 +181,13 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
|
||||
}
|
||||
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
|
||||
{
|
||||
GridBase *FineGrid = _Aggregate.FineGrid;
|
||||
|
||||
int checkerboard = _Aggregate.checkerboard;
|
||||
|
||||
GridBase *FineGrid = _subspace[0]._grid;
|
||||
int checkerboard = _subspace[0].checkerboard;
|
||||
FineField fB(FineGrid);fB.checkerboard =checkerboard;
|
||||
FineField fv(FineGrid);fv.checkerboard =checkerboard;
|
||||
|
||||
_Aggregate.PromoteFromSubspace(B,fv);
|
||||
blockPromote(B,fv,_subspace);
|
||||
|
||||
_smoother(_Linop,fv,fB);
|
||||
|
||||
RealD eval_poly = eval;
|
||||
@ -217,27 +229,65 @@ protected:
|
||||
int _checkerboard;
|
||||
LinearOperatorBase<FineField> & _FineOp;
|
||||
|
||||
// FIXME replace Aggregation with vector of fine; the code reuse is too small for
|
||||
// the hassle and complexity of cross coupling.
|
||||
Aggregation<Fobj,CComplex,nbasis> _Aggregate;
|
||||
std::vector<RealD> evals_fine;
|
||||
std::vector<RealD> evals_coarse;
|
||||
std::vector<CoarseField> evec_coarse;
|
||||
std::vector<RealD> &evals_fine;
|
||||
std::vector<RealD> &evals_coarse;
|
||||
std::vector<FineField> &subspace;
|
||||
std::vector<CoarseField> &evec_coarse;
|
||||
|
||||
private:
|
||||
std::vector<RealD> _evals_fine;
|
||||
std::vector<RealD> _evals_coarse;
|
||||
std::vector<FineField> _subspace;
|
||||
std::vector<CoarseField> _evec_coarse;
|
||||
|
||||
public:
|
||||
|
||||
LocalCoherenceLanczos(GridBase *FineGrid,
|
||||
GridBase *CoarseGrid,
|
||||
LinearOperatorBase<FineField> &FineOp,
|
||||
int checkerboard) :
|
||||
GridBase *CoarseGrid,
|
||||
LinearOperatorBase<FineField> &FineOp,
|
||||
int checkerboard) :
|
||||
_CoarseGrid(CoarseGrid),
|
||||
_FineGrid(FineGrid),
|
||||
_Aggregate(CoarseGrid,FineGrid,checkerboard),
|
||||
_FineOp(FineOp),
|
||||
_checkerboard(checkerboard)
|
||||
_checkerboard(checkerboard),
|
||||
evals_fine (_evals_fine),
|
||||
evals_coarse(_evals_coarse),
|
||||
subspace (_subspace),
|
||||
evec_coarse(_evec_coarse)
|
||||
{
|
||||
evals_fine.resize(0);
|
||||
evals_coarse.resize(0);
|
||||
};
|
||||
void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Alternate constructore, external storage for use by Hadrons module
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
LocalCoherenceLanczos(GridBase *FineGrid,
|
||||
GridBase *CoarseGrid,
|
||||
LinearOperatorBase<FineField> &FineOp,
|
||||
int checkerboard,
|
||||
std::vector<FineField> &ext_subspace,
|
||||
std::vector<CoarseField> &ext_coarse,
|
||||
std::vector<RealD> &ext_eval_fine,
|
||||
std::vector<RealD> &ext_eval_coarse
|
||||
) :
|
||||
_CoarseGrid(CoarseGrid),
|
||||
_FineGrid(FineGrid),
|
||||
_FineOp(FineOp),
|
||||
_checkerboard(checkerboard),
|
||||
evals_fine (ext_eval_fine),
|
||||
evals_coarse(ext_eval_coarse),
|
||||
subspace (ext_subspace),
|
||||
evec_coarse (ext_coarse)
|
||||
{
|
||||
evals_fine.resize(0);
|
||||
evals_coarse.resize(0);
|
||||
};
|
||||
|
||||
void Orthogonalise(void ) {
|
||||
CoarseScalar InnerProd(_CoarseGrid);
|
||||
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
|
||||
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
|
||||
};
|
||||
|
||||
template<typename T> static RealD normalise(T& v)
|
||||
{
|
||||
@ -246,43 +296,44 @@ public:
|
||||
v = v * (1.0/nn);
|
||||
return nn;
|
||||
}
|
||||
|
||||
/*
|
||||
void fakeFine(void)
|
||||
{
|
||||
int Nk = nbasis;
|
||||
_Aggregate.subspace.resize(Nk,_FineGrid);
|
||||
_Aggregate.subspace[0]=1.0;
|
||||
_Aggregate.subspace[0].checkerboard=_checkerboard;
|
||||
normalise(_Aggregate.subspace[0]);
|
||||
subspace.resize(Nk,_FineGrid);
|
||||
subspace[0]=1.0;
|
||||
subspace[0].checkerboard=_checkerboard;
|
||||
normalise(subspace[0]);
|
||||
PlainHermOp<FineField> Op(_FineOp);
|
||||
for(int k=1;k<Nk;k++){
|
||||
_Aggregate.subspace[k].checkerboard=_checkerboard;
|
||||
Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
|
||||
normalise(_Aggregate.subspace[k]);
|
||||
subspace[k].checkerboard=_checkerboard;
|
||||
Op(subspace[k-1],subspace[k]);
|
||||
normalise(subspace[k]);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
void testFine(RealD resid)
|
||||
{
|
||||
assert(evals_fine.size() == nbasis);
|
||||
assert(_Aggregate.subspace.size() == nbasis);
|
||||
assert(subspace.size() == nbasis);
|
||||
PlainHermOp<FineField> Op(_FineOp);
|
||||
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
|
||||
for(int k=0;k<nbasis;k++){
|
||||
assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
|
||||
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
|
||||
}
|
||||
}
|
||||
|
||||
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
|
||||
{
|
||||
assert(evals_fine.size() == nbasis);
|
||||
assert(_Aggregate.subspace.size() == nbasis);
|
||||
assert(subspace.size() == nbasis);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
Chebyshev<FineField> ChebySmooth(cheby_smooth);
|
||||
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
|
||||
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
|
||||
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_subspace);
|
||||
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
|
||||
|
||||
for(int k=0;k<evec_coarse.size();k++){
|
||||
if ( k < nbasis ) {
|
||||
@ -302,34 +353,34 @@ public:
|
||||
PlainHermOp<FineField> Op(_FineOp);
|
||||
|
||||
evals_fine.resize(Nm);
|
||||
_Aggregate.subspace.resize(Nm,_FineGrid);
|
||||
subspace.resize(Nm,_FineGrid);
|
||||
|
||||
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
|
||||
|
||||
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
|
||||
|
||||
int Nconv;
|
||||
IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
|
||||
IRL.calc(evals_fine,subspace,src,Nconv,false);
|
||||
|
||||
// Shrink down to number saved
|
||||
assert(Nstop>=nbasis);
|
||||
assert(Nconv>=nbasis);
|
||||
evals_fine.resize(nbasis);
|
||||
_Aggregate.subspace.resize(nbasis,_FineGrid);
|
||||
subspace.resize(nbasis,_FineGrid);
|
||||
}
|
||||
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
|
||||
int Nstop, int Nk, int Nm,RealD resid,
|
||||
RealD MaxIt, RealD betastp, int MinRes)
|
||||
{
|
||||
Chebyshev<FineField> Cheby(cheby_op);
|
||||
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_Aggregate);
|
||||
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
|
||||
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_subspace);
|
||||
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_subspace);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Chebyshev<FineField> ChebySmooth(cheby_smooth);
|
||||
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
|
||||
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_subspace,relax);
|
||||
|
||||
evals_coarse.resize(Nm);
|
||||
evec_coarse.resize(Nm,_CoarseGrid);
|
||||
|
@ -107,7 +107,12 @@ namespace Grid {
|
||||
};
|
||||
|
||||
template<class Matrix>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
ZeroGuesser guess;
|
||||
(*this)(_Matrix,in,out,guess);
|
||||
}
|
||||
template<class Matrix, class Guesser>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
|
||||
|
||||
// FIXME CGdiagonalMee not implemented virtual function
|
||||
// FIXME use CBfactorise to control schur decomp
|
||||
@ -129,7 +134,6 @@ namespace Grid {
|
||||
pickCheckerboard(Odd ,src_o,in);
|
||||
pickCheckerboard(Even,sol_e,out);
|
||||
pickCheckerboard(Odd ,sol_o,out);
|
||||
|
||||
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
@ -146,6 +150,7 @@ namespace Grid {
|
||||
// Call the red-black solver
|
||||
//////////////////////////////////////////////////////////////
|
||||
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
|
||||
guess(src_o,sol_o);
|
||||
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
|
||||
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
|
||||
|
||||
@ -189,7 +194,12 @@ namespace Grid {
|
||||
CBfactorise=cb;
|
||||
};
|
||||
template<class Matrix>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
ZeroGuesser guess;
|
||||
(*this)(_Matrix,in,out,guess);
|
||||
}
|
||||
template<class Matrix, class Guesser>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
|
||||
|
||||
// FIXME CGdiagonalMee not implemented virtual function
|
||||
// FIXME use CBfactorise to control schur decomp
|
||||
@ -225,6 +235,7 @@ namespace Grid {
|
||||
// Call the red-black solver
|
||||
//////////////////////////////////////////////////////////////
|
||||
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
|
||||
guess(src_o,sol_o);
|
||||
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
@ -268,7 +279,12 @@ namespace Grid {
|
||||
};
|
||||
|
||||
template<class Matrix>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
ZeroGuesser guess;
|
||||
(*this)(_Matrix,in,out,guess);
|
||||
}
|
||||
template<class Matrix,class Guesser>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
|
||||
|
||||
// FIXME CGdiagonalMee not implemented virtual function
|
||||
// FIXME use CBfactorise to control schur decomp
|
||||
@ -305,6 +321,7 @@ namespace Grid {
|
||||
//////////////////////////////////////////////////////////////
|
||||
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
|
||||
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
|
||||
guess(src_o,tmp);
|
||||
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
|
||||
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
|
||||
|
||||
@ -347,7 +364,12 @@ namespace Grid {
|
||||
};
|
||||
|
||||
template<class Matrix>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out){
|
||||
ZeroGuesser guess;
|
||||
(*this)(_Matrix,in,out,guess);
|
||||
}
|
||||
template<class Matrix, class Guesser>
|
||||
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
|
||||
|
||||
// FIXME CGdiagonalMee not implemented virtual function
|
||||
// FIXME use CBfactorise to control schur decomp
|
||||
@ -385,6 +407,7 @@ namespace Grid {
|
||||
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
|
||||
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
|
||||
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
|
||||
guess(src_o,tmp);
|
||||
_HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd);
|
||||
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
|
||||
|
||||
|
@ -3,9 +3,12 @@
|
||||
|
||||
namespace Grid {
|
||||
|
||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||
bool MemoryProfiler::debug = false;
|
||||
|
||||
int PointerCache::victim;
|
||||
|
||||
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
|
||||
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
|
||||
|
||||
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
||||
|
||||
@ -94,4 +97,29 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string sizeString(const size_t bytes)
|
||||
{
|
||||
constexpr unsigned int bufSize = 256;
|
||||
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
|
||||
char buf[256];
|
||||
size_t s = 0;
|
||||
double count = bytes;
|
||||
|
||||
while (count >= 1024 && s < 7)
|
||||
{
|
||||
s++;
|
||||
count /= 1024;
|
||||
}
|
||||
if (count - floor(count) == 0.0)
|
||||
{
|
||||
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
|
||||
}
|
||||
else
|
||||
{
|
||||
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
|
||||
}
|
||||
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -63,6 +63,64 @@ namespace Grid {
|
||||
static void *Lookup(size_t bytes) ;
|
||||
|
||||
};
|
||||
|
||||
std::string sizeString(size_t bytes);
|
||||
|
||||
struct MemoryStats
|
||||
{
|
||||
size_t totalAllocated{0}, maxAllocated{0},
|
||||
currentlyAllocated{0}, totalFreed{0};
|
||||
};
|
||||
|
||||
class MemoryProfiler
|
||||
{
|
||||
public:
|
||||
static MemoryStats *stats;
|
||||
static bool debug;
|
||||
};
|
||||
|
||||
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
|
||||
#define profilerDebugPrint \
|
||||
if (MemoryProfiler::stats)\
|
||||
{\
|
||||
auto s = MemoryProfiler::stats;\
|
||||
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
|
||||
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
|
||||
<< std::endl;\
|
||||
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
|
||||
<< std::endl;\
|
||||
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
|
||||
<< std::endl;\
|
||||
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
|
||||
<< std::endl;\
|
||||
}
|
||||
|
||||
#define profilerAllocate(bytes)\
|
||||
if (MemoryProfiler::stats)\
|
||||
{\
|
||||
auto s = MemoryProfiler::stats;\
|
||||
s->totalAllocated += (bytes);\
|
||||
s->currentlyAllocated += (bytes);\
|
||||
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\
|
||||
}\
|
||||
if (MemoryProfiler::debug)\
|
||||
{\
|
||||
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
|
||||
profilerDebugPrint;\
|
||||
}
|
||||
|
||||
#define profilerFree(bytes)\
|
||||
if (MemoryProfiler::stats)\
|
||||
{\
|
||||
auto s = MemoryProfiler::stats;\
|
||||
s->totalFreed += (bytes);\
|
||||
s->currentlyAllocated -= (bytes);\
|
||||
}\
|
||||
if (MemoryProfiler::debug)\
|
||||
{\
|
||||
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
|
||||
profilerDebugPrint;\
|
||||
}
|
||||
|
||||
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||
|
||||
@ -92,6 +150,7 @@ public:
|
||||
pointer allocate(size_type __n, const void* _p= 0)
|
||||
{
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
profilerAllocate(bytes);
|
||||
|
||||
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
||||
// if ( ptr != NULL )
|
||||
@ -122,6 +181,8 @@ public:
|
||||
void deallocate(pointer __p, size_type __n) {
|
||||
size_type bytes = __n * sizeof(_Tp);
|
||||
|
||||
profilerFree(bytes);
|
||||
|
||||
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
||||
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
@ -172,10 +233,13 @@ public:
|
||||
#ifdef GRID_COMMS_SHMEM
|
||||
pointer allocate(size_type __n, const void* _p= 0)
|
||||
{
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
|
||||
profilerAllocate(bytes);
|
||||
#ifdef CRAY
|
||||
_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
|
||||
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
|
||||
#else
|
||||
_Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
|
||||
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
|
||||
#endif
|
||||
#ifdef PARANOID_SYMMETRIC_HEAP
|
||||
static void * bcast;
|
||||
@ -193,18 +257,23 @@ public:
|
||||
#endif
|
||||
return ptr;
|
||||
}
|
||||
void deallocate(pointer __p, size_type) {
|
||||
void deallocate(pointer __p, size_type __n) {
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
|
||||
profilerFree(bytes);
|
||||
shmem_free((void *)__p);
|
||||
}
|
||||
#else
|
||||
pointer allocate(size_type __n, const void* _p= 0)
|
||||
{
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
|
||||
#else
|
||||
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
|
||||
#endif
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
|
||||
profilerAllocate(bytes);
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
|
||||
#else
|
||||
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
|
||||
#endif
|
||||
uint8_t *cp = (uint8_t *)ptr;
|
||||
if ( ptr ) {
|
||||
// One touch per 4k page, static OMP loop to catch same loop order
|
||||
@ -215,7 +284,10 @@ public:
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void deallocate(pointer __p, size_type) {
|
||||
void deallocate(pointer __p, size_type __n) {
|
||||
size_type bytes = __n*sizeof(_Tp);
|
||||
|
||||
profilerFree(bytes);
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
_mm_free((void *)__p);
|
||||
#else
|
||||
|
@ -59,6 +59,7 @@ public:
|
||||
|
||||
virtual ~GridBase() = default;
|
||||
|
||||
|
||||
// Physics Grid information.
|
||||
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
|
||||
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
|
||||
@ -78,6 +79,8 @@ public:
|
||||
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
|
||||
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
|
||||
|
||||
bool _isCheckerBoarded;
|
||||
|
||||
public:
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
@ -97,6 +97,7 @@ public:
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
_isCheckerBoarded = false;
|
||||
_ndimension = dimensions.size();
|
||||
|
||||
_fdimensions.resize(_ndimension);
|
||||
@ -122,6 +123,7 @@ public:
|
||||
|
||||
// Use a reduced simd grid
|
||||
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
|
||||
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
|
||||
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
|
||||
|
||||
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
|
||||
@ -166,6 +168,7 @@ public:
|
||||
block = block * _rdimensions[d];
|
||||
}
|
||||
};
|
||||
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
@ -171,9 +171,8 @@ public:
|
||||
const std::vector<int> &checker_dim_mask,
|
||||
int checker_dim)
|
||||
{
|
||||
///////////////////////
|
||||
// Grid information
|
||||
///////////////////////
|
||||
|
||||
_isCheckerBoarded = true;
|
||||
_checker_dim = checker_dim;
|
||||
assert(checker_dim_mask[checker_dim] == 1);
|
||||
_ndimension = dimensions.size();
|
||||
|
@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#ifndef GRID_COMMUNICATOR_H
|
||||
#define GRID_COMMUNICATOR_H
|
||||
|
||||
#include <Grid/communicator/SharedMemory.h>
|
||||
#include <Grid/communicator/Communicator_base.h>
|
||||
|
||||
#endif
|
||||
|
@ -36,33 +36,9 @@ namespace Grid {
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////
|
||||
void * CartesianCommunicator::ShmCommBuf;
|
||||
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
|
||||
CartesianCommunicator::CommunicatorPolicy_t
|
||||
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
||||
int CartesianCommunicator::nCommThreads = -1;
|
||||
int CartesianCommunicator::Hugepages = 0;
|
||||
|
||||
/////////////////////////////////
|
||||
// Alloc, free shmem region
|
||||
/////////////////////////////////
|
||||
void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
|
||||
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
|
||||
void *ptr = (void *)heap_top;
|
||||
heap_top += bytes;
|
||||
heap_bytes+= bytes;
|
||||
if (heap_bytes >= MAX_MPI_SHM_BYTES) {
|
||||
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
|
||||
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
|
||||
std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
|
||||
assert(heap_bytes<MAX_MPI_SHM_BYTES);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void CartesianCommunicator::ShmBufferFreeAll(void) {
|
||||
heap_top =(size_t)ShmBufferSelf();
|
||||
heap_bytes=0;
|
||||
}
|
||||
|
||||
/////////////////////////////////
|
||||
// Grid information queries
|
||||
@ -95,282 +71,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
||||
{
|
||||
GlobalSumVector((double *)c,2*N);
|
||||
}
|
||||
|
||||
|
||||
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
|
||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
std::vector<int> row(_ndimension,1);
|
||||
assert(dim>=0 && dim<_ndimension);
|
||||
|
||||
// Split the communicator
|
||||
row[dim] = _processors[dim];
|
||||
|
||||
int me;
|
||||
CartesianCommunicator Comm(row,*this,me);
|
||||
Comm.AllToAll(in,out,words,bytes);
|
||||
}
|
||||
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
// MPI is a pain and uses "int" arguments
|
||||
// 64*64*64*128*16 == 500Million elements of data.
|
||||
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
|
||||
// (Turns up on 32^3 x 64 Gparity too)
|
||||
MPI_Datatype object;
|
||||
int iwords;
|
||||
int ibytes;
|
||||
iwords = words;
|
||||
ibytes = bytes;
|
||||
assert(words == iwords); // safe to cast to int ?
|
||||
assert(bytes == ibytes); // safe to cast to int ?
|
||||
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
|
||||
MPI_Type_commit(&object);
|
||||
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
|
||||
MPI_Type_free(&object);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
|
||||
{
|
||||
_ndimension = processors.size();
|
||||
|
||||
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
|
||||
std::vector<int> parent_processor_coor(_ndimension,0);
|
||||
std::vector<int> parent_processors (_ndimension,1);
|
||||
|
||||
// Can make 5d grid from 4d etc...
|
||||
int pad = _ndimension-parent_ndimension;
|
||||
for(int d=0;d<parent_ndimension;d++){
|
||||
parent_processor_coor[pad+d]=parent._processor_coor[d];
|
||||
parent_processors [pad+d]=parent._processors[d];
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// split the communicator
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int Nparent;
|
||||
MPI_Comm_size(parent.communicator,&Nparent);
|
||||
|
||||
int childsize=1;
|
||||
for(int d=0;d<processors.size();d++) {
|
||||
childsize *= processors[d];
|
||||
}
|
||||
int Nchild = Nparent/childsize;
|
||||
assert (childsize * Nchild == Nparent);
|
||||
|
||||
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
|
||||
std::vector<int> scoor(_ndimension); // coor of split within parent
|
||||
std::vector<int> ssize(_ndimension); // coor of split within parent
|
||||
|
||||
std::vector<int> pcoor(_ndimension,0);
|
||||
std::vector<int> pdims(_ndimension,1);
|
||||
|
||||
if(parent._processors.size()==4 && _ndimension==5){
|
||||
for(int i=0;i<4;i++) pcoor[i+1]=parent._processor_coor[i];
|
||||
for(int i=0;i<4;i++) pdims[i+1]=parent._processors[i];
|
||||
} else {
|
||||
assert(_ndimension == parent._ndimension);
|
||||
for(int i=0;i<_ndimension;i++) pcoor[i]=parent._processor_coor[i];
|
||||
for(int i=0;i<_ndimension;i++) pdims[i]=parent._processors[i];
|
||||
}
|
||||
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
ccoor[d] = pcoor[d] % processors[d];
|
||||
scoor[d] = pcoor[d] / processors[d];
|
||||
ssize[d] = pdims[d] / processors[d];
|
||||
}
|
||||
int crank; // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
|
||||
// Mpi uses the reverse Lexico convention to us
|
||||
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors);
|
||||
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);
|
||||
|
||||
MPI_Comm comm_split;
|
||||
if ( Nchild > 1 ) {
|
||||
|
||||
if(0){
|
||||
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
|
||||
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
|
||||
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
|
||||
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
|
||||
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
|
||||
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
|
||||
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
|
||||
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
|
||||
assert(ierr==0);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Declare victory
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
|
||||
// << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
|
||||
} else {
|
||||
comm_split=parent.communicator;
|
||||
srank = 0;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Set up from the new split communicator
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
InitFromMPICommunicator(processors,comm_split);
|
||||
|
||||
if(0){
|
||||
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
|
||||
for(int d=0;d<processors.size();d++){
|
||||
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
|
||||
}
|
||||
}
|
||||
for(int d=0;d<processors.size();d++){
|
||||
assert(_processor_coor[d] == ccoor[d] );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Take an MPI_Comm and self assemble
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
|
||||
{
|
||||
_ndimension = processors.size();
|
||||
_processor_coor.resize(_ndimension);
|
||||
|
||||
/////////////////////////////////
|
||||
// Count the requested nodes
|
||||
/////////////////////////////////
|
||||
_Nprocessors=1;
|
||||
_processors = processors;
|
||||
for(int i=0;i<_ndimension;i++){
|
||||
_Nprocessors*=_processors[i];
|
||||
}
|
||||
|
||||
std::vector<int> periodic(_ndimension,1);
|
||||
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
|
||||
MPI_Comm_rank(communicator,&_processor);
|
||||
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
||||
|
||||
if ( 0 && (communicator_base != communicator_world) ) {
|
||||
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
|
||||
|
||||
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
|
||||
for(int d=0;d<_processors.size();d++){
|
||||
std::cout << _processor_coor[d]<<" ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
int Size;
|
||||
MPI_Comm_size(communicator,&Size);
|
||||
|
||||
#if defined(GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
|
||||
communicator_halo.resize (2*_ndimension);
|
||||
for(int i=0;i<_ndimension*2;i++){
|
||||
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(Size==_Nprocessors);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
InitFromMPICommunicator(processors,communicator_world);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined( GRID_COMMS_MPI3)
|
||||
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
|
||||
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
|
||||
#endif
|
||||
|
||||
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
SendToRecvFromComplete(list);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined( GRID_COMMS_MPI3)
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void){};
|
||||
|
||||
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
|
||||
|
||||
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
|
||||
|
||||
void *CartesianCommunicator::ShmBuffer(int rank) {
|
||||
return NULL;
|
||||
}
|
||||
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
|
||||
return NULL;
|
||||
}
|
||||
void CartesianCommunicator::ShmInitGeneric(void){
|
||||
#if 1
|
||||
int mmap_flag =0;
|
||||
#ifdef MAP_ANONYMOUS
|
||||
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
|
||||
#endif
|
||||
#ifdef MAP_ANON
|
||||
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
|
||||
if (ShmCommBuf == (void *)MAP_FAILED) {
|
||||
perror("mmap failed ");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
#ifdef MADV_HUGEPAGE
|
||||
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
|
||||
#endif
|
||||
#else
|
||||
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
|
||||
ShmCommBuf=(void *)&ShmBufStorageVector[0];
|
||||
#endif
|
||||
bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
@ -32,117 +32,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
///////////////////////////////////
|
||||
// Processor layout information
|
||||
///////////////////////////////////
|
||||
#ifdef GRID_COMMS_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifdef GRID_COMMS_MPI3
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifdef GRID_COMMS_MPIT
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifdef GRID_COMMS_SHMEM
|
||||
#include <mpp/shmem.h>
|
||||
#endif
|
||||
#include <Grid/communicator/SharedMemory.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
class CartesianCommunicator {
|
||||
public:
|
||||
class CartesianCommunicator : public SharedMemory {
|
||||
|
||||
public:
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Isend/Irecv/Wait, or Sendrecv blocking
|
||||
// Policies
|
||||
////////////////////////////////////////////
|
||||
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
|
||||
static CommunicatorPolicy_t CommunicatorPolicy;
|
||||
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Up to 65536 ranks per node adequate for now
|
||||
// 128MB shared memory for comms enought for 48^4 local vol comms
|
||||
// Give external control (command line override?) of this
|
||||
///////////////////////////////////////////
|
||||
static const int MAXLOG2RANKSPERNODE = 16;
|
||||
static uint64_t MAX_MPI_SHM_BYTES;
|
||||
static int nCommThreads;
|
||||
// use explicit huge pages
|
||||
static int Hugepages;
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Communicator should know nothing of the physics grid, only processor grid.
|
||||
////////////////////////////////////////////
|
||||
int _Nprocessors; // How many in all
|
||||
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
|
||||
int _processor; // linear processor rank
|
||||
std::vector<int> _processor_coor; // linear processor coordinate
|
||||
unsigned long _ndimension;
|
||||
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
|
||||
static MPI_Comm communicator_world;
|
||||
|
||||
MPI_Comm communicator;
|
||||
std::vector<MPI_Comm> communicator_halo;
|
||||
|
||||
typedef MPI_Request CommsRequest_t;
|
||||
|
||||
#else
|
||||
typedef int CommsRequest_t;
|
||||
#endif
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Helper functionality for SHM Windows common to all other impls
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Longer term; drop this in favour of a master / slave model with
|
||||
// cartesian communicator on a subset of ranks, slave ranks controlled
|
||||
// by group leader with data xfer via shared memory
|
||||
////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_COMMS_MPI3
|
||||
|
||||
static int ShmRank;
|
||||
static int ShmSize;
|
||||
static int GroupRank;
|
||||
static int GroupSize;
|
||||
static int WorldRank;
|
||||
static int WorldSize;
|
||||
|
||||
std::vector<int> WorldDims;
|
||||
std::vector<int> GroupDims;
|
||||
std::vector<int> ShmDims;
|
||||
|
||||
std::vector<int> GroupCoor;
|
||||
std::vector<int> ShmCoor;
|
||||
std::vector<int> WorldCoor;
|
||||
|
||||
static std::vector<int> GroupRanks;
|
||||
static std::vector<int> MyGroup;
|
||||
static int ShmSetup;
|
||||
static MPI_Win ShmWindow;
|
||||
static MPI_Comm ShmComm;
|
||||
|
||||
std::vector<int> LexicographicToWorldRank;
|
||||
|
||||
static std::vector<void *> ShmCommBufs;
|
||||
|
||||
#else
|
||||
static void ShmInitGeneric(void);
|
||||
static commVector<uint8_t> ShmBufStorageVector;
|
||||
#endif
|
||||
|
||||
/////////////////////////////////
|
||||
// Grid information and queries
|
||||
// Implemented in Communicator_base.C
|
||||
/////////////////////////////////
|
||||
static void * ShmCommBuf;
|
||||
|
||||
|
||||
size_t heap_top;
|
||||
size_t heap_bytes;
|
||||
|
||||
void *ShmBufferSelf(void);
|
||||
void *ShmBuffer(int rank);
|
||||
void *ShmBufferTranslate(int rank,void * local_p);
|
||||
void *ShmBufferMalloc(size_t bytes);
|
||||
void ShmBufferFreeAll(void) ;
|
||||
unsigned long _ndimension;
|
||||
static Grid_MPI_Comm communicator_world;
|
||||
Grid_MPI_Comm communicator;
|
||||
std::vector<Grid_MPI_Comm> communicator_halo;
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Must call in Grid startup
|
||||
@ -158,14 +74,15 @@ class CartesianCommunicator {
|
||||
virtual ~CartesianCommunicator();
|
||||
|
||||
private:
|
||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Private initialise from an MPI communicator
|
||||
// Can use after an MPI_Comm_split, but hidden from user so private
|
||||
////////////////////////////////////////////////
|
||||
void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
|
||||
#endif
|
||||
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
|
||||
|
||||
public:
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Wraps MPI_Cart routines, or implements equivalent on other impls
|
||||
@ -181,8 +98,6 @@ class CartesianCommunicator {
|
||||
const std::vector<int> & ThisProcessorCoor(void) ;
|
||||
const std::vector<int> & ProcessorGrid(void) ;
|
||||
int ProcessorCount(void) ;
|
||||
int NodeCount(void) ;
|
||||
int RankCount(void) ;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||
@ -270,15 +185,10 @@ class CartesianCommunicator {
|
||||
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
|
||||
assert(dim>=0);
|
||||
assert(dim<_ndimension);
|
||||
int numnode = _processors[dim];
|
||||
// std::cerr << " AllToAll in.size() "<<in.size()<<std::endl;
|
||||
// std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
|
||||
assert(in.size()==out.size());
|
||||
int numnode = _processors[dim];
|
||||
uint64_t bytes=sizeof(T);
|
||||
uint64_t words=in.size()/numnode;
|
||||
// std:: cout << "AllToAll buffer size "<< in.size()*sizeof(T)<<std::endl;
|
||||
// std:: cout << "AllToAll datum bytes "<< bytes<<std::endl;
|
||||
// std:: cout << "AllToAll datum count "<< words<<std::endl;
|
||||
assert(numnode * words == in.size());
|
||||
assert(words < (1ULL<<31));
|
||||
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
|
||||
|
@ -1,222 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/GridCore.h>
|
||||
#include <Grid/GridQCDcore.h>
|
||||
#include <Grid/qcd/action/ActionCore.h>
|
||||
#include <mpi.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
// Should error check all MPI calls.
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
int flag;
|
||||
int provided;
|
||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||
if ( !flag ) {
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||
if ( provided != MPI_THREAD_MULTIPLE ) {
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||
}
|
||||
}
|
||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||
ShmInitGeneric();
|
||||
}
|
||||
|
||||
CartesianCommunicator::~CartesianCommunicator()
|
||||
{
|
||||
int MPI_is_finalised;
|
||||
MPI_Finalized(&MPI_is_finalised);
|
||||
if (communicator && !MPI_is_finalised)
|
||||
MPI_Comm_free(&communicator);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||
{
|
||||
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||
assert(ierr==0);
|
||||
}
|
||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||
{
|
||||
int rank;
|
||||
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||
assert(ierr==0);
|
||||
return rank;
|
||||
}
|
||||
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||
{
|
||||
coor.resize(_ndimension);
|
||||
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||
SendToRecvFromComplete(reqs);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int sender,
|
||||
int receiver,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Status stat;
|
||||
assert(sender != receiver);
|
||||
int tag = sender;
|
||||
if ( _processor == sender ) {
|
||||
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||
}
|
||||
if ( _processor == receiver ) {
|
||||
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||
}
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
list.push_back(rrq);
|
||||
} else {
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||
recv,bytes,MPI_CHAR,from, from,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
}
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
int nreq=list.size();
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr=MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
///////////////////////////////////////////////////////
|
||||
// Should only be used prior to Grid Init finished.
|
||||
// Check for this?
|
||||
///////////////////////////////////////////////////////
|
||||
int CartesianCommunicator::RankWorld(void){
|
||||
int r;
|
||||
MPI_Comm_rank(communicator_world,&r);
|
||||
return r;
|
||||
}
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr= MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator_world);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
@ -26,580 +26,246 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include <semaphore.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/ipc.h>
|
||||
#include <sys/shm.h>
|
||||
#include <sys/mman.h>
|
||||
#include <zlib.h>
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
#include <numaif.h>
|
||||
#endif
|
||||
|
||||
#include <Grid/communicator/SharedMemory.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int CartesianCommunicator::ShmSetup = 0;
|
||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
int CartesianCommunicator::ShmRank;
|
||||
int CartesianCommunicator::ShmSize;
|
||||
int CartesianCommunicator::GroupRank;
|
||||
int CartesianCommunicator::GroupSize;
|
||||
int CartesianCommunicator::WorldRank;
|
||||
int CartesianCommunicator::WorldSize;
|
||||
|
||||
MPI_Comm CartesianCommunicator::communicator_world;
|
||||
MPI_Comm CartesianCommunicator::ShmComm;
|
||||
MPI_Win CartesianCommunicator::ShmWindow;
|
||||
|
||||
std::vector<int> CartesianCommunicator::GroupRanks;
|
||||
std::vector<int> CartesianCommunicator::MyGroup;
|
||||
std::vector<void *> CartesianCommunicator::ShmCommBufs;
|
||||
|
||||
int CartesianCommunicator::NodeCount(void) { return GroupSize;};
|
||||
int CartesianCommunicator::RankCount(void) { return WorldSize;};
|
||||
|
||||
|
||||
#undef FORCE_COMMS
|
||||
void *CartesianCommunicator::ShmBufferSelf(void)
|
||||
////////////////////////////////////////////
|
||||
// First initialise of comms system
|
||||
////////////////////////////////////////////
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv)
|
||||
{
|
||||
return ShmCommBufs[ShmRank];
|
||||
}
|
||||
void *CartesianCommunicator::ShmBuffer(int rank)
|
||||
{
|
||||
int gpeer = GroupRanks[rank];
|
||||
#ifdef FORCE_COMMS
|
||||
return NULL;
|
||||
#endif
|
||||
if (gpeer == MPI_UNDEFINED){
|
||||
return NULL;
|
||||
} else {
|
||||
return ShmCommBufs[gpeer];
|
||||
}
|
||||
}
|
||||
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
|
||||
{
|
||||
static int count =0;
|
||||
int gpeer = GroupRanks[rank];
|
||||
assert(gpeer!=ShmRank); // never send to self
|
||||
assert(rank!=WorldRank);// never send to self
|
||||
#ifdef FORCE_COMMS
|
||||
return NULL;
|
||||
#endif
|
||||
if (gpeer == MPI_UNDEFINED){
|
||||
return NULL;
|
||||
} else {
|
||||
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
|
||||
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
|
||||
return (void *) remote;
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
|
||||
int flag;
|
||||
int provided;
|
||||
// mtrace();
|
||||
|
||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||
if ( !flag ) {
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||
assert (provided == MPI_THREAD_MULTIPLE);
|
||||
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
|
||||
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
|
||||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
|
||||
assert(0);
|
||||
}
|
||||
|
||||
Grid_quiesce_nodes();
|
||||
|
||||
// Never clean up as done once.
|
||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||
MPI_Comm_rank(communicator_world,&WorldRank);
|
||||
MPI_Comm_size(communicator_world,&WorldSize);
|
||||
|
||||
if ( WorldRank == 0 ) {
|
||||
std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Split into groups that can share memory
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
|
||||
MPI_Comm_rank(ShmComm ,&ShmRank);
|
||||
MPI_Comm_size(ShmComm ,&ShmSize);
|
||||
GroupSize = WorldSize/ShmSize;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// find world ranks in our SHM group (i.e. which ranks are on our node)
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
MPI_Group WorldGroup, ShmGroup;
|
||||
MPI_Comm_group (communicator_world, &WorldGroup);
|
||||
MPI_Comm_group (ShmComm, &ShmGroup);
|
||||
|
||||
std::vector<int> world_ranks(WorldSize);
|
||||
GroupRanks.resize(WorldSize);
|
||||
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
|
||||
|
||||
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Identify who is in my group and noninate the leader
|
||||
///////////////////////////////////////////////////////////////////
|
||||
int g=0;
|
||||
MyGroup.resize(ShmSize);
|
||||
for(int rank=0;rank<WorldSize;rank++){
|
||||
if(GroupRanks[rank]!=MPI_UNDEFINED){
|
||||
assert(g<ShmSize);
|
||||
MyGroup[g++] = rank;
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
|
||||
int myleader = MyGroup[0];
|
||||
|
||||
std::vector<int> leaders_1hot(WorldSize,0);
|
||||
std::vector<int> leaders_group(GroupSize,0);
|
||||
leaders_1hot [ myleader ] = 1;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// global sum leaders over comm world
|
||||
///////////////////////////////////////////////////////////////////
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
|
||||
assert(ierr==0);
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// find the group leaders world rank
|
||||
///////////////////////////////////////////////////////////////////
|
||||
int group=0;
|
||||
for(int l=0;l<WorldSize;l++){
|
||||
if(leaders_1hot[l]){
|
||||
leaders_group[group++] = l;
|
||||
}
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Identify the rank of the group in which I (and my leader) live
|
||||
///////////////////////////////////////////////////////////////////
|
||||
GroupRank=-1;
|
||||
for(int g=0;g<GroupSize;g++){
|
||||
if (myleader == leaders_group[g]){
|
||||
GroupRank=g;
|
||||
}
|
||||
}
|
||||
assert(GroupRank!=-1);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// allocate the shared window for our group
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
MPI_Barrier(ShmComm);
|
||||
|
||||
ShmCommBuf = 0;
|
||||
ShmCommBufs.resize(ShmSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbf and others map filesystems as mappable huge pages
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_MPI3_SHMMMAP
|
||||
char shm_name [NAME_MAX];
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||
//sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||
// printf("Opening file %s \n",shm_name);
|
||||
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
|
||||
if ( fd == -1) {
|
||||
printf("open %s failed\n",shm_name);
|
||||
perror("open hugetlbfs");
|
||||
exit(0);
|
||||
}
|
||||
int mmap_flag = MAP_SHARED ;
|
||||
#ifdef MAP_POPULATE
|
||||
mmap_flag|=MAP_POPULATE;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
|
||||
if ( ptr == (void *)MAP_FAILED ) {
|
||||
printf("mmap %s failed\n",shm_name);
|
||||
perror("failed mmap"); assert(0);
|
||||
}
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
ShmCommBufs[r] =ptr;
|
||||
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
|
||||
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
|
||||
// the posix shm virtual file system
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_MPI3_SHMOPEN
|
||||
char shm_name [NAME_MAX];
|
||||
if ( ShmRank == 0 ) {
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||
|
||||
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||
|
||||
shm_unlink(shm_name);
|
||||
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
|
||||
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
|
||||
ftruncate(fd, size);
|
||||
|
||||
int mmap_flag = MAP_SHARED;
|
||||
#ifdef MAP_POPULATE
|
||||
mmap_flag |= MAP_POPULATE;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if (Hugepages) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
||||
|
||||
if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
|
||||
// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
|
||||
#if 0
|
||||
//#ifdef HAVE_NUMAIF_H
|
||||
int status;
|
||||
int flags=MPOL_MF_MOVE;
|
||||
#ifdef KNL
|
||||
int nodes=1; // numa domain == MCDRAM
|
||||
// Find out if in SNC2,SNC4 mode ?
|
||||
#else
|
||||
int nodes=r; // numa domain == MPI ID
|
||||
#endif
|
||||
unsigned long count=1;
|
||||
for(uint64_t page=0;page<size;page+=4096){
|
||||
void *pages = (void *) ( page + (uint64_t)ptr );
|
||||
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
|
||||
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
|
||||
if (ierr && (page==0)) perror("numa relocate command failed");
|
||||
}
|
||||
#endif
|
||||
ShmCommBufs[r] =ptr;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(ShmComm);
|
||||
|
||||
if ( ShmRank != 0 ) {
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
|
||||
|
||||
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
|
||||
|
||||
int fd=shm_open(shm_name,O_RDWR,0666);
|
||||
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
|
||||
|
||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
ShmCommBufs[r] =ptr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// SHMGET SHMAT and SHM_HUGETLB flag
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_MPI3_SHMGET
|
||||
std::vector<int> shmids(ShmSize);
|
||||
|
||||
if ( ShmRank == 0 ) {
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||
key_t key = IPC_PRIVATE;
|
||||
int flags = IPC_CREAT | SHM_R | SHM_W;
|
||||
#ifdef SHM_HUGETLB
|
||||
if (Hugepages) flags|=SHM_HUGETLB;
|
||||
#endif
|
||||
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
|
||||
int errsv = errno;
|
||||
printf("Errno %d\n",errsv);
|
||||
printf("key %d\n",key);
|
||||
printf("size %lld\n",size);
|
||||
printf("flags %d\n",flags);
|
||||
perror("shmget");
|
||||
exit(1);
|
||||
} else {
|
||||
printf("shmid: 0x%x\n", shmids[r]);
|
||||
}
|
||||
}
|
||||
}
|
||||
MPI_Barrier(ShmComm);
|
||||
MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm);
|
||||
MPI_Barrier(ShmComm);
|
||||
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
|
||||
if (ShmCommBufs[r] == (uint64_t *)-1) {
|
||||
perror("Shared memory attach failure");
|
||||
shmctl(shmids[r], IPC_RMID, NULL);
|
||||
exit(2);
|
||||
}
|
||||
printf("shmaddr: %p\n", ShmCommBufs[r]);
|
||||
}
|
||||
MPI_Barrier(ShmComm);
|
||||
// Mark for clean up
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
|
||||
}
|
||||
MPI_Barrier(ShmComm);
|
||||
|
||||
#endif
|
||||
ShmCommBuf = ShmCommBufs[ShmRank];
|
||||
|
||||
MPI_Barrier(ShmComm);
|
||||
if ( ShmRank == 0 ) {
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
uint64_t * check = (uint64_t *) ShmCommBufs[r];
|
||||
check[0] = GroupRank;
|
||||
check[1] = r;
|
||||
check[2] = 0x5A5A5A;
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(ShmComm);
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
uint64_t * check = (uint64_t *) ShmCommBufs[r];
|
||||
|
||||
assert(check[0]==GroupRank);
|
||||
assert(check[1]==r);
|
||||
assert(check[2]==0x5A5A5A);
|
||||
|
||||
}
|
||||
MPI_Barrier(ShmComm);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Verbose for now
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
if (WorldRank == 0){
|
||||
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
|
||||
std::cout<< WorldSize << " Ranks " ;
|
||||
std::cout<< GroupSize << " Nodes " ;
|
||||
std::cout<< " with "<< ShmSize << " ranks-per-node "<<std::endl;
|
||||
|
||||
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
|
||||
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
|
||||
|
||||
for(int g=0;g<GroupSize;g++){
|
||||
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
|
||||
for(int g=0;g<ShmSize;g++){
|
||||
std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
|
||||
if(g!=ShmSize-1) std::cout<<",";
|
||||
else std::cout<<"}"<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
for(int g=0;g<GroupSize;g++){
|
||||
if ( (ShmRank == 0) && (GroupRank==g) ) std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
if ( (ShmRank == 0) && (GroupRank==g) ) {
|
||||
std::cout<<MyGroup[r];
|
||||
if(r<ShmSize-1) std::cout<<",";
|
||||
else std::cout<<"}"<<std::endl<<std::flush;
|
||||
}
|
||||
MPI_Barrier(communicator_world);
|
||||
}
|
||||
}
|
||||
|
||||
assert(ShmSetup==0); ShmSetup=1;
|
||||
GlobalSharedMemory::Init(communicator_world);
|
||||
GlobalSharedMemory::SharedMemoryAllocate(
|
||||
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
|
||||
GlobalSharedMemory::Hugepages);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Want to implement some magic ... Group sub-cubes into those on same node
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Use cartesian communicators now even in MPI3
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||
{
|
||||
std::vector<int> coor = _processor_coor; // my coord
|
||||
assert(std::abs(shift) <_processors[dim]);
|
||||
|
||||
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
|
||||
Lexicographic::IndexFromCoor(coor,source,_processors);
|
||||
source = LexicographicToWorldRank[source];
|
||||
|
||||
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
|
||||
Lexicographic::IndexFromCoor(coor,dest,_processors);
|
||||
dest = LexicographicToWorldRank[dest];
|
||||
|
||||
}// rank is world rank.
|
||||
|
||||
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||
assert(ierr==0);
|
||||
}
|
||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||
{
|
||||
int rank;
|
||||
Lexicographic::IndexFromCoor(coor,rank,_processors);
|
||||
rank = LexicographicToWorldRank[rank];
|
||||
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||
assert(ierr==0);
|
||||
return rank;
|
||||
}// rank is world rank
|
||||
|
||||
}
|
||||
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||
{
|
||||
int lr=-1;
|
||||
for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor
|
||||
if( LexicographicToWorldRank[r]==rank) lr = r;
|
||||
}
|
||||
assert(lr!=-1);
|
||||
Lexicographic::CoorFromIndex(coor,lr,_processors);
|
||||
coor.resize(_ndimension);
|
||||
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Initialises from communicator_world
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
MPI_Comm optimal_comm;
|
||||
////////////////////////////////////////////////////
|
||||
// Remap using the shared memory optimising routine
|
||||
// The remap creates a comm which must be freed
|
||||
////////////////////////////////////////////////////
|
||||
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
|
||||
InitFromMPICommunicator(processors,optimal_comm);
|
||||
SetCommunicator(optimal_comm);
|
||||
///////////////////////////////////////////////////
|
||||
// Free the temp communicator
|
||||
///////////////////////////////////////////////////
|
||||
MPI_Comm_free(&optimal_comm);
|
||||
}
|
||||
|
||||
//////////////////////////////////
|
||||
// Try to subdivide communicator
|
||||
//////////////////////////////////
|
||||
/*
|
||||
* Use default in MPI compile
|
||||
*/
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
|
||||
: CartesianCommunicator(processors)
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
|
||||
{
|
||||
std::cout << "Attempts to split MPI3 communicators will fail until implemented" <<std::endl;
|
||||
_ndimension = processors.size();
|
||||
|
||||
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
|
||||
std::vector<int> parent_processor_coor(_ndimension,0);
|
||||
std::vector<int> parent_processors (_ndimension,1);
|
||||
|
||||
// Can make 5d grid from 4d etc...
|
||||
int pad = _ndimension-parent_ndimension;
|
||||
for(int d=0;d<parent_ndimension;d++){
|
||||
parent_processor_coor[pad+d]=parent._processor_coor[d];
|
||||
parent_processors [pad+d]=parent._processors[d];
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// split the communicator
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// int Nparent = parent._processors ;
|
||||
// std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
|
||||
int Nparent;
|
||||
MPI_Comm_size(parent.communicator,&Nparent);
|
||||
// std::cout << " Parent size "<<Nparent <<std::endl;
|
||||
|
||||
int childsize=1;
|
||||
for(int d=0;d<processors.size();d++) {
|
||||
childsize *= processors[d];
|
||||
}
|
||||
int Nchild = Nparent/childsize;
|
||||
assert (childsize * Nchild == Nparent);
|
||||
|
||||
// std::cout << " child size "<<childsize <<std::endl;
|
||||
|
||||
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
|
||||
std::vector<int> scoor(_ndimension); // coor of split within parent
|
||||
std::vector<int> ssize(_ndimension); // coor of split within parent
|
||||
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
ccoor[d] = parent_processor_coor[d] % processors[d];
|
||||
scoor[d] = parent_processor_coor[d] / processors[d];
|
||||
ssize[d] = parent_processors[d] / processors[d];
|
||||
}
|
||||
|
||||
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
|
||||
int crank;
|
||||
// Mpi uses the reverse Lexico convention to us; so reversed routines called
|
||||
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
|
||||
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
|
||||
|
||||
MPI_Comm comm_split;
|
||||
if ( Nchild > 1 ) {
|
||||
|
||||
if(0){
|
||||
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
|
||||
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
|
||||
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
|
||||
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
|
||||
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
|
||||
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
|
||||
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
|
||||
std::cout<<std::endl;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Declare victory
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
|
||||
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
|
||||
std::cout << " Split communicator " <<comm_split <<std::endl;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Split the communicator
|
||||
////////////////////////////////////////////////////////////////
|
||||
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
|
||||
assert(ierr==0);
|
||||
|
||||
} else {
|
||||
srank = 0;
|
||||
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Set up from the new split communicator
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
InitFromMPICommunicator(processors,comm_split);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Take the right SHM buffers
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
SetCommunicator(comm_split);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Free the temp communicator
|
||||
///////////////////////////////////////////////
|
||||
MPI_Comm_free(&comm_split);
|
||||
|
||||
if(0){
|
||||
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
|
||||
for(int d=0;d<processors.size();d++){
|
||||
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
|
||||
}
|
||||
}
|
||||
for(int d=0;d<processors.size();d++){
|
||||
assert(_processor_coor[d] == ccoor[d] );
|
||||
}
|
||||
}
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
int ierr;
|
||||
communicator=communicator_world;
|
||||
|
||||
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
|
||||
{
|
||||
////////////////////////////////////////////////////
|
||||
// Creates communicator, and the communicator_halo
|
||||
////////////////////////////////////////////////////
|
||||
_ndimension = processors.size();
|
||||
_processor_coor.resize(_ndimension);
|
||||
|
||||
/////////////////////////////////
|
||||
// Count the requested nodes
|
||||
/////////////////////////////////
|
||||
_Nprocessors=1;
|
||||
_processors = processors;
|
||||
for(int i=0;i<_ndimension;i++){
|
||||
_Nprocessors*=_processors[i];
|
||||
}
|
||||
|
||||
std::vector<int> periodic(_ndimension,1);
|
||||
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
|
||||
MPI_Comm_rank(communicator,&_processor);
|
||||
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
||||
|
||||
if ( 0 && (communicator_base != communicator_world) ) {
|
||||
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
|
||||
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
|
||||
for(int d=0;d<_processors.size();d++){
|
||||
std::cout << _processor_coor[d]<<" ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
int Size;
|
||||
MPI_Comm_size(communicator,&Size);
|
||||
|
||||
communicator_halo.resize (2*_ndimension);
|
||||
for(int i=0;i<_ndimension*2;i++){
|
||||
MPI_Comm_dup(communicator,&communicator_halo[i]);
|
||||
}
|
||||
assert(Size==_Nprocessors);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Assert power of two shm_size.
|
||||
////////////////////////////////////////////////////////////////
|
||||
int log2size = -1;
|
||||
for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
|
||||
if ( (0x1<<i) == ShmSize ) {
|
||||
log2size = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(log2size != -1);
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Identify subblock of ranks on node spreading across dims
|
||||
// in a maximally symmetrical way
|
||||
////////////////////////////////////////////////////////////////
|
||||
std::vector<int> WorldDims = processors;
|
||||
|
||||
ShmDims.resize (_ndimension,1);
|
||||
GroupDims.resize(_ndimension);
|
||||
ShmCoor.resize (_ndimension);
|
||||
GroupCoor.resize(_ndimension);
|
||||
WorldCoor.resize(_ndimension);
|
||||
|
||||
int dim = 0;
|
||||
for(int l2=0;l2<log2size;l2++){
|
||||
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension;
|
||||
ShmDims[dim]*=2;
|
||||
dim=(dim+1)%_ndimension;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Establish torus of processes and nodes with sub-blockings
|
||||
////////////////////////////////////////////////////////////////
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
GroupDims[d] = WorldDims[d]/ShmDims[d];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Verbose
|
||||
////////////////////////////////////////////////////////////////
|
||||
#if 0
|
||||
std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl;
|
||||
std::cout<< GridLogMessage << "SHM ";
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
std::cout<< ShmDims[d] <<" ";
|
||||
}
|
||||
std::cout<< std::endl;
|
||||
|
||||
std::cout<< GridLogMessage << "Group ";
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
std::cout<< GroupDims[d] <<" ";
|
||||
}
|
||||
std::cout<< std::endl;
|
||||
|
||||
std::cout<< GridLogMessage<<"World ";
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
std::cout<< WorldDims[d] <<" ";
|
||||
}
|
||||
std::cout<< std::endl;
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Check processor counts match
|
||||
////////////////////////////////////////////////////////////////
|
||||
_Nprocessors=1;
|
||||
_processors = processors;
|
||||
_processor_coor.resize(_ndimension);
|
||||
for(int i=0;i<_ndimension;i++){
|
||||
_Nprocessors*=_processors[i];
|
||||
}
|
||||
assert(WorldSize==_Nprocessors);
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Establish mapping between lexico physics coord and WorldRank
|
||||
////////////////////////////////////////////////////////////////
|
||||
Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
|
||||
Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
|
||||
}
|
||||
_processor_coor = WorldCoor;
|
||||
_processor = WorldRank;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// global sum Lexico to World mapping
|
||||
///////////////////////////////////////////////////////////////////
|
||||
int lexico;
|
||||
LexicographicToWorldRank.resize(WorldSize,0);
|
||||
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
|
||||
LexicographicToWorldRank[lexico] = WorldRank;
|
||||
ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
|
||||
for(int i=0;i<WorldSize;i++){
|
||||
|
||||
int wr = LexicographicToWorldRank[i];
|
||||
// int wr = i;
|
||||
|
||||
std::vector<int> coor(_ndimension);
|
||||
ProcessorCoorFromRank(wr,coor); // from world rank
|
||||
int ck = RankFromProcessorCoor(coor);
|
||||
assert(ck==wr);
|
||||
|
||||
if ( wr == WorldRank ) {
|
||||
for(int j=0;j<coor.size();j++) {
|
||||
assert(coor[j] == _processor_coor[j]);
|
||||
}
|
||||
}
|
||||
/*
|
||||
std::cout << GridLogMessage<< " Lexicographic "<<i;
|
||||
std::cout << " MPI rank "<<wr;
|
||||
std::cout << " Coor ";
|
||||
for(int j=0;j<coor.size();j++) std::cout << coor[j];
|
||||
std::cout<< std::endl;
|
||||
*/
|
||||
/////////////////////////////////////////////////////
|
||||
// Check everyone agrees on everyone elses coords
|
||||
/////////////////////////////////////////////////////
|
||||
std::vector<int> mcoor = coor;
|
||||
this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int));
|
||||
for(int d = 0 ; d< _ndimension; d++) {
|
||||
assert(coor[d] == mcoor[d]);
|
||||
}
|
||||
}
|
||||
};
|
||||
CartesianCommunicator::~CartesianCommunicator()
|
||||
{
|
||||
int MPI_is_finalised;
|
||||
@ -734,19 +400,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
int gdest = GroupRanks[dest];
|
||||
int gfrom = GroupRanks[from];
|
||||
int gme = GroupRanks[_processor];
|
||||
int gdest = ShmRanks[dest];
|
||||
int gfrom = ShmRanks[from];
|
||||
int gme = ShmRanks[_processor];
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
assert(gme == ShmRank);
|
||||
double off_node_bytes=0.0;
|
||||
|
||||
#ifdef FORCE_COMMS
|
||||
gdest = MPI_UNDEFINED;
|
||||
gfrom = MPI_UNDEFINED;
|
||||
#endif
|
||||
if ( gfrom ==MPI_UNDEFINED) {
|
||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
@ -815,5 +477,38 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
std::vector<int> row(_ndimension,1);
|
||||
assert(dim>=0 && dim<_ndimension);
|
||||
|
||||
// Split the communicator
|
||||
row[dim] = _processors[dim];
|
||||
|
||||
int me;
|
||||
CartesianCommunicator Comm(row,*this,me);
|
||||
Comm.AllToAll(in,out,words,bytes);
|
||||
}
|
||||
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
// MPI is a pain and uses "int" arguments
|
||||
// 64*64*64*128*16 == 500Million elements of data.
|
||||
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
|
||||
// (Turns up on 32^3 x 64 Gparity too)
|
||||
MPI_Datatype object;
|
||||
int iwords;
|
||||
int ibytes;
|
||||
iwords = words;
|
||||
ibytes = bytes;
|
||||
assert(words == iwords); // safe to cast to int ?
|
||||
assert(bytes == ibytes); // safe to cast to int ?
|
||||
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
|
||||
MPI_Type_commit(&object);
|
||||
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
|
||||
MPI_Type_free(&object);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,988 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include "Grid.h"
|
||||
#include <mpi.h>
|
||||
//#include <numaif.h>
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Workarounds:
|
||||
/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
|
||||
/// darwin dispatch semaphores don't seem to be multiprocess.
|
||||
///
|
||||
/// ii) openmpi under --mca shmem posix works with two squadrons per node;
|
||||
/// openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
|
||||
/// memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
|
||||
///
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#include <semaphore.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
typedef sem_t *Grid_semaphore;
|
||||
|
||||
|
||||
#error /*THis is deprecated*/
|
||||
|
||||
#if 0
|
||||
#define SEM_INIT(S) S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
|
||||
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
|
||||
#define SEM_POST(S) assert ( sem_post(S) == 0 );
|
||||
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
|
||||
#else
|
||||
#define SEM_INIT(S) ;
|
||||
#define SEM_INIT_EXCL(S) ;
|
||||
#define SEM_POST(S) ;
|
||||
#define SEM_WAIT(S) ;
|
||||
#endif
|
||||
#include <sys/mman.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };
|
||||
|
||||
struct Descriptor {
|
||||
uint64_t buf;
|
||||
size_t bytes;
|
||||
int rank;
|
||||
int tag;
|
||||
int command;
|
||||
uint64_t xbuf;
|
||||
uint64_t rbuf;
|
||||
int xtag;
|
||||
int rtag;
|
||||
int src;
|
||||
int dest;
|
||||
MPI_Request request;
|
||||
};
|
||||
|
||||
const int pool = 48;
|
||||
|
||||
class SlaveState {
|
||||
public:
|
||||
volatile int head;
|
||||
volatile int start;
|
||||
volatile int tail;
|
||||
volatile Descriptor Descrs[pool];
|
||||
};
|
||||
|
||||
class Slave {
|
||||
public:
|
||||
Grid_semaphore sem_head;
|
||||
Grid_semaphore sem_tail;
|
||||
SlaveState *state;
|
||||
MPI_Comm squadron;
|
||||
uint64_t base;
|
||||
int universe_rank;
|
||||
int vertical_rank;
|
||||
char sem_name [NAME_MAX];
|
||||
////////////////////////////////////////////////////////////
|
||||
// Descriptor circular pointers
|
||||
////////////////////////////////////////////////////////////
|
||||
Slave() {};
|
||||
|
||||
void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
|
||||
|
||||
void SemInit(void) {
|
||||
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
|
||||
SEM_INIT(sem_head);
|
||||
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
|
||||
SEM_INIT(sem_tail);
|
||||
}
|
||||
void SemInitExcl(void) {
|
||||
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
|
||||
SEM_INIT_EXCL(sem_head);
|
||||
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
|
||||
SEM_INIT_EXCL(sem_tail);
|
||||
}
|
||||
void WakeUpDMA(void) {
|
||||
SEM_POST(sem_head);
|
||||
};
|
||||
void WakeUpCompute(void) {
|
||||
SEM_POST(sem_tail);
|
||||
};
|
||||
void WaitForCommand(void) {
|
||||
SEM_WAIT(sem_head);
|
||||
};
|
||||
void WaitForComplete(void) {
|
||||
SEM_WAIT(sem_tail);
|
||||
};
|
||||
void EventLoop (void) {
|
||||
// std::cout<< " Entering event loop "<<std::endl;
|
||||
while(1){
|
||||
WaitForCommand();
|
||||
// std::cout << "Getting command "<<std::endl;
|
||||
#if 0
|
||||
_mm_monitor((void *)&state->head,0,0);
|
||||
int s=state->start;
|
||||
if ( s != state->head ) {
|
||||
_mm_mwait(0,0);
|
||||
}
|
||||
#endif
|
||||
Event();
|
||||
}
|
||||
}
|
||||
|
||||
int Event (void) ;
|
||||
|
||||
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
|
||||
void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;
|
||||
|
||||
void WaitAll() {
|
||||
// std::cout << "Queueing WAIT command "<<std::endl;
|
||||
QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
|
||||
// std::cout << "Waking up DMA "<<std::endl;
|
||||
WakeUpDMA();
|
||||
// std::cout << "Waiting from semaphore "<<std::endl;
|
||||
WaitForComplete();
|
||||
// std::cout << "Checking FIFO is empty "<<std::endl;
|
||||
while ( state->tail != state->head );
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// One instance of a data mover.
|
||||
// Master and Slave must agree on location in shared memory
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class MPIoffloadEngine {
|
||||
public:
|
||||
|
||||
static std::vector<Slave> Slaves;
|
||||
|
||||
static int ShmSetup;
|
||||
|
||||
static int UniverseRank;
|
||||
static int UniverseSize;
|
||||
|
||||
static MPI_Comm communicator_universe;
|
||||
static MPI_Comm communicator_cached;
|
||||
|
||||
static MPI_Comm HorizontalComm;
|
||||
static int HorizontalRank;
|
||||
static int HorizontalSize;
|
||||
|
||||
static MPI_Comm VerticalComm;
|
||||
static MPI_Win VerticalWindow;
|
||||
static int VerticalSize;
|
||||
static int VerticalRank;
|
||||
|
||||
static std::vector<void *> VerticalShmBufs;
|
||||
static std::vector<std::vector<int> > UniverseRanks;
|
||||
static std::vector<int> UserCommunicatorToWorldRanks;
|
||||
|
||||
static MPI_Group WorldGroup, CachedGroup;
|
||||
|
||||
static void CommunicatorInit (MPI_Comm &communicator_world,
|
||||
MPI_Comm &ShmComm,
|
||||
void * &ShmCommBuf);
|
||||
|
||||
static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// routines for master proc must handle any communicator
|
||||
/////////////////////////////////////////////////////////
|
||||
|
||||
static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||
// std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
|
||||
Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
|
||||
// std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
|
||||
Slaves[slave].WakeUpDMA();
|
||||
// std::cout << "Waking up DMA "<< slave<<std::endl;
|
||||
};
|
||||
|
||||
static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
|
||||
{
|
||||
Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
|
||||
Slaves[slave].WakeUpDMA();
|
||||
}
|
||||
|
||||
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
|
||||
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
|
||||
// std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
|
||||
Slaves[slave].WakeUpDMA();
|
||||
// std::cout << "Waking up DMA "<< slave<<std::endl;
|
||||
};
|
||||
|
||||
static void WaitAll() {
|
||||
for(int s=1;s<VerticalSize;s++) {
|
||||
// std::cout << "Waiting for slave "<< s<<std::endl;
|
||||
Slaves[s].WaitAll();
|
||||
}
|
||||
// std::cout << " Wait all Complete "<<std::endl;
|
||||
};
|
||||
|
||||
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
|
||||
int basework = nwork/units;
|
||||
int backfill = units-(nwork%units);
|
||||
if ( me >= units ) {
|
||||
mywork = myoff = 0;
|
||||
} else {
|
||||
mywork = (nwork+me)/units;
|
||||
myoff = basework * me;
|
||||
if ( me > backfill )
|
||||
myoff+= (me-backfill);
|
||||
}
|
||||
return;
|
||||
};
|
||||
|
||||
static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
|
||||
uint8_t * cxbuf = (uint8_t *) xbuf;
|
||||
uint8_t * crbuf = (uint8_t *) rbuf;
|
||||
static int rrp=0;
|
||||
int procs = VerticalSize-1;
|
||||
int myoff=0;
|
||||
int mywork=bytes;
|
||||
QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
|
||||
rrp = rrp+1;
|
||||
if ( rrp == (VerticalSize-1) ) rrp = 0;
|
||||
}
|
||||
|
||||
static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
|
||||
uint8_t * cxbuf = (uint8_t *) xbuf;
|
||||
uint8_t * crbuf = (uint8_t *) rbuf;
|
||||
int mywork, myoff, procs;
|
||||
procs = VerticalSize-1;
|
||||
for(int s=0;s<procs;s++) {
|
||||
GetWork(bytes,s,mywork,myoff,procs);
|
||||
QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
|
||||
}
|
||||
};
|
||||
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||
uint8_t * cbuf = (uint8_t *) buf;
|
||||
int mywork, myoff, procs;
|
||||
procs = VerticalSize-1;
|
||||
for(int s=0;s<procs;s++) {
|
||||
GetWork(bytes,s,mywork,myoff,procs);
|
||||
QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
|
||||
}
|
||||
};
|
||||
|
||||
static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
|
||||
uint8_t * cbuf = (uint8_t *) buf;
|
||||
int mywork, myoff, procs;
|
||||
procs = VerticalSize-1;
|
||||
for(int s=0;s<procs;s++) {
|
||||
GetWork(bytes,s,mywork,myoff,procs);
|
||||
QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
|
||||
}
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::vector<Slave> MPIoffloadEngine::Slaves;
|
||||
|
||||
int MPIoffloadEngine::UniverseRank;
|
||||
int MPIoffloadEngine::UniverseSize;
|
||||
|
||||
MPI_Comm MPIoffloadEngine::communicator_universe;
|
||||
MPI_Comm MPIoffloadEngine::communicator_cached;
|
||||
MPI_Group MPIoffloadEngine::WorldGroup;
|
||||
MPI_Group MPIoffloadEngine::CachedGroup;
|
||||
|
||||
MPI_Comm MPIoffloadEngine::HorizontalComm;
|
||||
int MPIoffloadEngine::HorizontalRank;
|
||||
int MPIoffloadEngine::HorizontalSize;
|
||||
|
||||
MPI_Comm MPIoffloadEngine::VerticalComm;
|
||||
int MPIoffloadEngine::VerticalSize;
|
||||
int MPIoffloadEngine::VerticalRank;
|
||||
MPI_Win MPIoffloadEngine::VerticalWindow;
|
||||
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
|
||||
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
|
||||
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
|
||||
|
||||
int CartesianCommunicator::NodeCount(void) { return HorizontalSize;};
|
||||
int MPIoffloadEngine::ShmSetup = 0;
|
||||
|
||||
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
|
||||
MPI_Comm &ShmComm,
|
||||
void * &ShmCommBuf)
|
||||
{
|
||||
int flag;
|
||||
assert(ShmSetup==0);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Universe is all nodes prior to squadron grouping
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
|
||||
MPI_Comm_rank(communicator_universe,&UniverseRank);
|
||||
MPI_Comm_size(communicator_universe,&UniverseSize);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Split into groups that can share memory (Verticals)
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
#undef MPI_SHARED_MEM_DEBUG
|
||||
#ifdef MPI_SHARED_MEM_DEBUG
|
||||
MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
|
||||
#else
|
||||
MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
|
||||
#endif
|
||||
MPI_Comm_rank(VerticalComm ,&VerticalRank);
|
||||
MPI_Comm_size(VerticalComm ,&VerticalSize);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Split into horizontal groups by rank in squadron
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
|
||||
MPI_Comm_rank(HorizontalComm,&HorizontalRank);
|
||||
MPI_Comm_size(HorizontalComm,&HorizontalSize);
|
||||
assert(HorizontalSize*VerticalSize==UniverseSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// What is my place in the world
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int WorldRank=0;
|
||||
if(VerticalRank==0) WorldRank = HorizontalRank;
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
|
||||
assert(ierr==0);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Where is the world in the universe?
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
|
||||
UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
|
||||
for(int w=0;w<HorizontalSize;w++){
|
||||
ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// allocate the shared window for our group, pass back Shm info to CartesianCommunicator
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
VerticalShmBufs.resize(VerticalSize);
|
||||
|
||||
#undef MPI_SHARED_MEM
|
||||
#ifdef MPI_SHARED_MEM
|
||||
ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
|
||||
ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
|
||||
assert(ierr==0);
|
||||
// std::cout<<"SHM "<<ShmCommBuf<<std::endl;
|
||||
|
||||
for(int r=0;r<VerticalSize;r++){
|
||||
MPI_Aint sz;
|
||||
int dsp_unit;
|
||||
MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
|
||||
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
|
||||
}
|
||||
#else
|
||||
char shm_name [NAME_MAX];
|
||||
MPI_Barrier(VerticalComm);
|
||||
|
||||
if ( VerticalRank == 0 ) {
|
||||
for(int r=0;r<VerticalSize;r++){
|
||||
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
|
||||
if ( r>0 ) size = sizeof(SlaveState);
|
||||
|
||||
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
|
||||
|
||||
shm_unlink(shm_name);
|
||||
|
||||
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
|
||||
if ( fd < 0 ) {
|
||||
perror("failed shm_open");
|
||||
assert(0);
|
||||
}
|
||||
|
||||
ftruncate(fd, size);
|
||||
|
||||
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if ( VerticalShmBufs[r] == MAP_FAILED ) {
|
||||
perror("failed mmap");
|
||||
assert(0);
|
||||
}
|
||||
|
||||
/*
|
||||
for(uint64_t page=0;page<size;page+=4096){
|
||||
void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
|
||||
int status;
|
||||
int flags=MPOL_MF_MOVE_ALL;
|
||||
int nodes=1; // numa domain == MCDRAM
|
||||
unsigned long count=1;
|
||||
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
|
||||
if (ierr && (page==0)) perror("numa relocate command failed");
|
||||
}
|
||||
*/
|
||||
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
|
||||
check[0] = WorldRank;
|
||||
check[1] = r;
|
||||
|
||||
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(VerticalComm);
|
||||
|
||||
if ( VerticalRank != 0 ) {
|
||||
for(int r=0;r<VerticalSize;r++){
|
||||
|
||||
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
|
||||
if ( r>0 ) size = sizeof(SlaveState);
|
||||
|
||||
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
|
||||
|
||||
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
|
||||
if ( fd<0 ) {
|
||||
perror("failed shm_open");
|
||||
assert(0);
|
||||
}
|
||||
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
|
||||
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
|
||||
assert(check[0]== WorldRank);
|
||||
assert(check[1]== r);
|
||||
// std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
MPI_Barrier(VerticalComm);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Map rank of leader on node in their in new world, to the
|
||||
// rank in this vertical plane's horizontal communicator
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
communicator_world = HorizontalComm;
|
||||
ShmComm = VerticalComm;
|
||||
ShmCommBuf = VerticalShmBufs[0];
|
||||
MPI_Comm_group (communicator_world, &WorldGroup);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Start the slave data movers
|
||||
///////////////////////////////////////////////////////////
|
||||
if ( VerticalRank != 0 ) {
|
||||
Slave indentured;
|
||||
indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
|
||||
indentured.SemInitExcl();// init semaphore in shared memory
|
||||
MPI_Barrier(VerticalComm);
|
||||
MPI_Barrier(VerticalComm);
|
||||
indentured.EventLoop();
|
||||
assert(0);
|
||||
} else {
|
||||
Slaves.resize(VerticalSize);
|
||||
for(int i=1;i<VerticalSize;i++){
|
||||
Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
|
||||
}
|
||||
MPI_Barrier(VerticalComm);
|
||||
for(int i=1;i<VerticalSize;i++){
|
||||
Slaves[i].SemInit();// init semaphore in shared memory
|
||||
}
|
||||
MPI_Barrier(VerticalComm);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Verbose for now
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
ShmSetup=1;
|
||||
|
||||
if (UniverseRank == 0){
|
||||
|
||||
std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
|
||||
std::cout<<UniverseSize << " Ranks " ;
|
||||
std::cout<<HorizontalSize << " Nodes " ;
|
||||
std::cout<<VerticalSize << " with ranks-per-node "<<std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
|
||||
std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
|
||||
|
||||
for(int g=0;g<HorizontalSize;g++){
|
||||
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
|
||||
}
|
||||
|
||||
for(int g=0;g<HorizontalSize;g++){
|
||||
std::cout<<GridLogMessage<<" { ";
|
||||
for(int s=0;s<VerticalSize;s++){
|
||||
std::cout<< UniverseRanks[g][s];
|
||||
if ( s<VerticalSize-1 ) {
|
||||
std::cout<<",";
|
||||
}
|
||||
}
|
||||
std::cout<<" } "<<std::endl;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Map the communicator into communicator_world, and find the neighbour.
|
||||
// Cache the mappings; cache size is 1.
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
|
||||
|
||||
if ( comm == HorizontalComm ) {
|
||||
comm_world_peer = rank;
|
||||
// std::cout << " MapCommRankToWorldRank horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||
} else if ( comm == communicator_cached ) {
|
||||
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
||||
// std::cout << " MapCommRankToWorldRank cached " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||
} else {
|
||||
|
||||
int size;
|
||||
|
||||
MPI_Comm_size(comm,&size);
|
||||
|
||||
UserCommunicatorToWorldRanks.resize(size);
|
||||
|
||||
std::vector<int> cached_ranks(size);
|
||||
|
||||
for(int r=0;r<size;r++) {
|
||||
cached_ranks[r]=r;
|
||||
}
|
||||
|
||||
communicator_cached=comm;
|
||||
|
||||
MPI_Comm_group(communicator_cached, &CachedGroup);
|
||||
|
||||
MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
|
||||
|
||||
comm_world_peer = UserCommunicatorToWorldRanks[rank];
|
||||
// std::cout << " MapCommRankToWorldRank cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
|
||||
|
||||
assert(comm_world_peer != MPI_UNDEFINED);
|
||||
}
|
||||
|
||||
assert( (tag & (~0xFFFFL)) ==0);
|
||||
|
||||
uint64_t icomm = (uint64_t)comm;
|
||||
int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
|
||||
^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
|
||||
|
||||
// hashtag = (comm_hash<<15) | tag;
|
||||
hashtag = tag;
|
||||
|
||||
};
|
||||
|
||||
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
|
||||
{
|
||||
squadron=_squadron;
|
||||
universe_rank=_universe_rank;
|
||||
vertical_rank=_vertical_rank;
|
||||
state =_state;
|
||||
// std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
|
||||
state->head = state->tail = state->start = 0;
|
||||
base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
|
||||
int rank; MPI_Comm_rank(_squadron,&rank);
|
||||
}
|
||||
#define PERI_PLUS(A) ( (A+1)%pool )
|
||||
int Slave::Event (void) {
|
||||
|
||||
static int tail_last;
|
||||
static int head_last;
|
||||
static int start_last;
|
||||
int ierr;
|
||||
MPI_Status stat;
|
||||
static int i=0;
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// Try to advance the start pointers
|
||||
////////////////////////////////////////////////////
|
||||
int s=state->start;
|
||||
if ( s != state->head ) {
|
||||
switch ( state->Descrs[s].command ) {
|
||||
case COMMAND_ISEND:
|
||||
ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
|
||||
state->Descrs[s].bytes,
|
||||
MPI_CHAR,
|
||||
state->Descrs[s].rank,
|
||||
state->Descrs[s].tag,
|
||||
MPIoffloadEngine::communicator_universe,
|
||||
(MPI_Request *)&state->Descrs[s].request);
|
||||
assert(ierr==0);
|
||||
state->start = PERI_PLUS(s);
|
||||
return 1;
|
||||
break;
|
||||
|
||||
case COMMAND_IRECV:
|
||||
ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
|
||||
state->Descrs[s].bytes,
|
||||
MPI_CHAR,
|
||||
state->Descrs[s].rank,
|
||||
state->Descrs[s].tag,
|
||||
MPIoffloadEngine::communicator_universe,
|
||||
(MPI_Request *)&state->Descrs[s].request);
|
||||
|
||||
// std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
|
||||
// std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
|
||||
assert(ierr==0);
|
||||
state->start = PERI_PLUS(s);
|
||||
return 1;
|
||||
break;
|
||||
|
||||
case COMMAND_SENDRECV:
|
||||
|
||||
// fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10);
|
||||
|
||||
ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
|
||||
(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
|
||||
MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
|
||||
|
||||
assert(ierr==0);
|
||||
|
||||
// fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
|
||||
// MPI_Barrier(MPIoffloadEngine::HorizontalComm);
|
||||
// fprintf(stderr,"Barrier\n");
|
||||
i++;
|
||||
|
||||
state->start = PERI_PLUS(s);
|
||||
|
||||
return 1;
|
||||
break;
|
||||
|
||||
case COMMAND_WAITALL:
|
||||
|
||||
for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
|
||||
if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
|
||||
MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
|
||||
}
|
||||
};
|
||||
s=PERI_PLUS(s);
|
||||
state->start = s;
|
||||
state->tail = s;
|
||||
|
||||
WakeUpCompute();
|
||||
|
||||
return 1;
|
||||
break;
|
||||
|
||||
default:
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// External interaction with the queue
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
|
||||
{
|
||||
int head =state->head;
|
||||
int next = PERI_PLUS(head);
|
||||
|
||||
// Set up descriptor
|
||||
int worldrank;
|
||||
int hashtag;
|
||||
MPI_Comm communicator;
|
||||
MPI_Request request;
|
||||
uint64_t relative;
|
||||
|
||||
relative = (uint64_t)xbuf - base;
|
||||
state->Descrs[head].xbuf = relative;
|
||||
|
||||
relative= (uint64_t)rbuf - base;
|
||||
state->Descrs[head].rbuf = relative;
|
||||
|
||||
state->Descrs[head].bytes = bytes;
|
||||
|
||||
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
|
||||
state->Descrs[head].dest = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
|
||||
state->Descrs[head].xtag = hashtag;
|
||||
|
||||
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
|
||||
state->Descrs[head].src = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
|
||||
state->Descrs[head].rtag = hashtag;
|
||||
|
||||
state->Descrs[head].command= COMMAND_SENDRECV;
|
||||
|
||||
// Block until FIFO has space
|
||||
while( state->tail==next );
|
||||
|
||||
// Msync on weak order architectures
|
||||
|
||||
// Advance pointer
|
||||
state->head = next;
|
||||
|
||||
};
|
||||
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
|
||||
{
|
||||
/////////////////////////////////////////
|
||||
// Spin; if FIFO is full until not full
|
||||
/////////////////////////////////////////
|
||||
int head =state->head;
|
||||
int next = PERI_PLUS(head);
|
||||
|
||||
// Set up descriptor
|
||||
int worldrank;
|
||||
int hashtag;
|
||||
MPI_Comm communicator;
|
||||
MPI_Request request;
|
||||
|
||||
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
|
||||
|
||||
uint64_t relative= (uint64_t)buf - base;
|
||||
state->Descrs[head].buf = relative;
|
||||
state->Descrs[head].bytes = bytes;
|
||||
state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
|
||||
state->Descrs[head].tag = hashtag;
|
||||
state->Descrs[head].command= command;
|
||||
|
||||
/*
|
||||
if ( command == COMMAND_ISEND ) {
|
||||
std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank
|
||||
<< " to worldrank " << worldrank <<std::endl;
|
||||
std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
|
||||
std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
|
||||
}
|
||||
if ( command == COMMAND_IRECV ) {
|
||||
std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank
|
||||
<< " from worldrank " << worldrank <<std::endl;
|
||||
std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
|
||||
std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
|
||||
}
|
||||
*/
|
||||
// Block until FIFO has space
|
||||
while( state->tail==next );
|
||||
|
||||
// Msync on weak order architectures
|
||||
// Advance pointer
|
||||
state->head = next;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv)
|
||||
{
|
||||
int flag;
|
||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||
if ( !flag ) {
|
||||
MPI_Init(argc,argv);
|
||||
}
|
||||
communicator_world = MPI_COMM_WORLD;
|
||||
MPI_Comm ShmComm;
|
||||
MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
|
||||
}
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||
{
|
||||
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||
assert(ierr==0);
|
||||
}
|
||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||
{
|
||||
int rank;
|
||||
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||
assert(ierr==0);
|
||||
return rank;
|
||||
}
|
||||
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||
{
|
||||
coor.resize(_ndimension);
|
||||
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
_ndimension = processors.size();
|
||||
std::vector<int> periodic(_ndimension,1);
|
||||
|
||||
_Nprocessors=1;
|
||||
_processors = processors;
|
||||
|
||||
for(int i=0;i<_ndimension;i++){
|
||||
_Nprocessors*=_processors[i];
|
||||
}
|
||||
|
||||
int Size;
|
||||
MPI_Comm_size(communicator_world,&Size);
|
||||
assert(Size==_Nprocessors);
|
||||
|
||||
_processor_coor.resize(_ndimension);
|
||||
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
|
||||
MPI_Comm_rank (communicator,&_processor);
|
||||
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
|
||||
};
|
||||
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||
SendToRecvFromComplete(reqs);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int sender,
|
||||
int receiver,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Status stat;
|
||||
assert(sender != receiver);
|
||||
int tag = sender;
|
||||
if ( _processor == sender ) {
|
||||
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||
}
|
||||
if ( _processor == receiver ) {
|
||||
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||
}
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
int rank = _processor;
|
||||
int ierr;
|
||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
|
||||
assert(ierr==0);
|
||||
|
||||
list.push_back(xrq);
|
||||
list.push_back(rrq);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
uint64_t xmit_i = (uint64_t) xmit;
|
||||
uint64_t recv_i = (uint64_t) recv;
|
||||
uint64_t shm = (uint64_t) ShmCommBuf;
|
||||
// assert xmit and recv lie in shared memory region
|
||||
assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
|
||||
assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
|
||||
assert(from!=_processor);
|
||||
assert(dest!=_processor);
|
||||
|
||||
MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
|
||||
|
||||
//MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
|
||||
|
||||
//MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
|
||||
//MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
MPIoffloadEngine::WaitAll();
|
||||
//this->Barrier();
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void) { }
|
||||
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int nreq=list.size();
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr=MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr= MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator_world);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
|
||||
|
||||
void *CartesianCommunicator::ShmBuffer(int rank) {
|
||||
return NULL;
|
||||
}
|
||||
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
@ -1,273 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/Communicator_mpi.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/GridCore.h>
|
||||
#include <Grid/GridQCDcore.h>
|
||||
#include <Grid/qcd/action/ActionCore.h>
|
||||
#include <mpi.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
// Should error check all MPI calls.
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
int flag;
|
||||
int provided;
|
||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||
if ( !flag ) {
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||
if ( provided != MPI_THREAD_MULTIPLE ) {
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||
}
|
||||
}
|
||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||
ShmInitGeneric();
|
||||
}
|
||||
|
||||
CartesianCommunicator::~CartesianCommunicator()
|
||||
{
|
||||
int MPI_is_finalised;
|
||||
MPI_Finalized(&MPI_is_finalised);
|
||||
if (communicator && !MPI_is_finalised){
|
||||
MPI_Comm_free(&communicator);
|
||||
for(int i=0;i< communicator_halo.size();i++){
|
||||
MPI_Comm_free(&communicator_halo[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||
{
|
||||
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
|
||||
assert(ierr==0);
|
||||
}
|
||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||
{
|
||||
int rank;
|
||||
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
|
||||
assert(ierr==0);
|
||||
return rank;
|
||||
}
|
||||
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||
{
|
||||
coor.resize(_ndimension);
|
||||
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||
SendToRecvFromComplete(reqs);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int sender,
|
||||
int receiver,
|
||||
int bytes)
|
||||
{
|
||||
MPI_Status stat;
|
||||
assert(sender != receiver);
|
||||
int tag = sender;
|
||||
if ( _processor == sender ) {
|
||||
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||
}
|
||||
if ( _processor == receiver ) {
|
||||
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||
}
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
||||
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
||||
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
list.push_back(rrq);
|
||||
} else {
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||
recv,bytes,MPI_CHAR,from, from,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
}
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
||||
int nreq=list.size();
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
}
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr=MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
///////////////////////////////////////////////////////
|
||||
// Should only be used prior to Grid Init finished.
|
||||
// Check for this?
|
||||
///////////////////////////////////////////////////////
|
||||
int CartesianCommunicator::RankWorld(void){
|
||||
int r;
|
||||
MPI_Comm_rank(communicator_world,&r);
|
||||
return r;
|
||||
}
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
int ierr= MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
root,
|
||||
communicator_world);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes,int dir)
|
||||
{
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
|
||||
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
MPI_Request req[2];
|
||||
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
|
||||
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
|
||||
|
||||
list.push_back(req[0]);
|
||||
list.push_back(req[1]);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
int nreq=waitall.size();
|
||||
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes,int dir)
|
||||
{
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo.size()<< <std::endl;
|
||||
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||
MPI_Request req[2];
|
||||
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
|
||||
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
|
||||
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
@ -32,14 +32,22 @@ namespace Grid {
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
void CartesianCommunicator::Init(int *argc, char *** arv)
|
||||
{
|
||||
ShmInitGeneric();
|
||||
GlobalSharedMemory::Init(communicator_world);
|
||||
GlobalSharedMemory::SharedMemoryAllocate(
|
||||
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
|
||||
GlobalSharedMemory::Hugepages);
|
||||
}
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
|
||||
: CartesianCommunicator(processors) { srank=0;}
|
||||
: CartesianCommunicator(processors)
|
||||
{
|
||||
srank=0;
|
||||
SetCommunicator(communicator_world);
|
||||
}
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
@ -54,6 +62,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
assert(_processors[d]==1);
|
||||
_processor_coor[d] = 0;
|
||||
}
|
||||
SetCommunicator(communicator_world);
|
||||
}
|
||||
|
||||
CartesianCommunicator::~CartesianCommunicator(){}
|
||||
@ -121,6 +130,36 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
|
||||
dest=0;
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
SendToRecvFromComplete(list);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,
|
||||
void *recv,
|
||||
int recv_from_rank,
|
||||
int bytes, int dir)
|
||||
{
|
||||
// Discard the "dir"
|
||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
SendToRecvFromComplete(waitall);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void){};
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,357 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/Communicator_shmem.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <mpp/shmem.h>
|
||||
#include <array>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
// Should error check all MPI calls.
|
||||
#define SHMEM_VET(addr)
|
||||
|
||||
#define SHMEM_VET_DEBUG(addr) { \
|
||||
if ( ! shmem_addr_accessible(addr,_processor) ) {\
|
||||
std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
|
||||
BACKTRACEFILE(); \
|
||||
}\
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Info that is setup once and indept of cartesian layout
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef struct HandShake_t {
|
||||
uint64_t seq_local;
|
||||
uint64_t seq_remote;
|
||||
} HandShake;
|
||||
|
||||
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
|
||||
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
|
||||
ret.fill(SHMEM_SYNC_VALUE);
|
||||
return ret;
|
||||
}
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
|
||||
|
||||
static Vector< HandShake > XConnections;
|
||||
static Vector< HandShake > RConnections;
|
||||
|
||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
shmem_init();
|
||||
XConnections.resize(shmem_n_pes());
|
||||
RConnections.resize(shmem_n_pes());
|
||||
for(int pe =0 ; pe<shmem_n_pes();pe++){
|
||||
XConnections[pe].seq_local = 0;
|
||||
XConnections[pe].seq_remote= 0;
|
||||
RConnections[pe].seq_local = 0;
|
||||
RConnections[pe].seq_remote= 0;
|
||||
}
|
||||
shmem_barrier_all();
|
||||
ShmInitGeneric();
|
||||
}
|
||||
|
||||
CartesianCommunicator::~CartesianCommunicator(){}
|
||||
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
|
||||
: CartesianCommunicator(processors)
|
||||
{
|
||||
std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
|
||||
}
|
||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||
{
|
||||
_ndimension = processors.size();
|
||||
std::vector<int> periodic(_ndimension,1);
|
||||
|
||||
_Nprocessors=1;
|
||||
_processors = processors;
|
||||
_processor_coor.resize(_ndimension);
|
||||
|
||||
_processor = shmem_my_pe();
|
||||
|
||||
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
|
||||
|
||||
for(int i=0;i<_ndimension;i++){
|
||||
_Nprocessors*=_processors[i];
|
||||
}
|
||||
|
||||
int Size = shmem_n_pes();
|
||||
|
||||
|
||||
assert(Size==_Nprocessors);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
static long long source ;
|
||||
static long long dest ;
|
||||
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
|
||||
// int nreduce=1;
|
||||
// int pestart=0;
|
||||
// int logStride=0;
|
||||
|
||||
source = u;
|
||||
dest = 0;
|
||||
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all(); // necessary?
|
||||
u = dest;
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
static long long source ;
|
||||
static long long dest ;
|
||||
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
|
||||
// int nreduce=1;
|
||||
// int pestart=0;
|
||||
// int logStride=0;
|
||||
|
||||
source = u;
|
||||
dest = 0;
|
||||
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all(); // necessary?
|
||||
u = dest;
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
static float source ;
|
||||
static float dest ;
|
||||
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
|
||||
source = f;
|
||||
dest =0.0;
|
||||
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all();
|
||||
f = dest;
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
static float source ;
|
||||
static float dest = 0 ;
|
||||
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
|
||||
if ( shmem_addr_accessible(f,_processor) ){
|
||||
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all();
|
||||
return;
|
||||
}
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
dest =0.0;
|
||||
source = f[i];
|
||||
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all();
|
||||
f[i] = dest;
|
||||
}
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
static double source;
|
||||
static double dest ;
|
||||
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
|
||||
source = d;
|
||||
dest = 0;
|
||||
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all();
|
||||
d = dest;
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
static double source ;
|
||||
static double dest ;
|
||||
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
|
||||
|
||||
if ( shmem_addr_accessible(d,_processor) ){
|
||||
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all();
|
||||
return;
|
||||
}
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
source = d[i];
|
||||
dest =0.0;
|
||||
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
|
||||
shmem_barrier_all();
|
||||
d[i] = dest;
|
||||
}
|
||||
}
|
||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||
{
|
||||
std::vector<int> coor = _processor_coor;
|
||||
|
||||
assert(std::abs(shift) <_processors[dim]);
|
||||
|
||||
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
|
||||
Lexicographic::IndexFromCoor(coor,source,_processors);
|
||||
|
||||
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
|
||||
Lexicographic::IndexFromCoor(coor,dest,_processors);
|
||||
|
||||
}
|
||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||
{
|
||||
int rank;
|
||||
Lexicographic::IndexFromCoor(coor,rank,_processors);
|
||||
return rank;
|
||||
}
|
||||
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||
{
|
||||
Lexicographic::CoorFromIndex(coor,rank,_processors);
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
SHMEM_VET(xmit);
|
||||
SHMEM_VET(recv);
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||
SendToRecvFromComplete(reqs);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
void *recv,
|
||||
int sender,
|
||||
int receiver,
|
||||
int bytes)
|
||||
{
|
||||
static uint64_t seq;
|
||||
|
||||
assert(recv!=xmit);
|
||||
volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
|
||||
volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
|
||||
|
||||
if ( _processor == sender ) {
|
||||
|
||||
// Check he has posted a receive
|
||||
while(SendSeq->seq_remote == SendSeq->seq_local);
|
||||
|
||||
// Advance our send count
|
||||
seq = ++(SendSeq->seq_local);
|
||||
|
||||
// Send this packet
|
||||
SHMEM_VET(recv);
|
||||
shmem_putmem(recv,xmit,bytes,receiver);
|
||||
shmem_fence();
|
||||
|
||||
//Notify him we're done
|
||||
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
|
||||
shmem_fence();
|
||||
}
|
||||
if ( _processor == receiver ) {
|
||||
|
||||
// Post a receive
|
||||
seq = ++(RecvSeq->seq_local);
|
||||
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
|
||||
|
||||
// Now wait until he has advanced our reception counter
|
||||
while(RecvSeq->seq_remote != RecvSeq->seq_local);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Basic Halo comms primitive
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
SHMEM_VET(xmit);
|
||||
SHMEM_VET(recv);
|
||||
// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
|
||||
shmem_putmem(recv,xmit,bytes,dest);
|
||||
|
||||
if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();
|
||||
}
|
||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
// shmem_quiet(); // I'm done
|
||||
if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// He's done too
|
||||
}
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
shmem_barrier_all();
|
||||
}
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
static uint32_t word;
|
||||
uint32_t *array = (uint32_t *) data;
|
||||
assert( (bytes % 4)==0);
|
||||
int words = bytes/4;
|
||||
|
||||
if ( shmem_addr_accessible(data,_processor) ){
|
||||
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data());
|
||||
return;
|
||||
}
|
||||
|
||||
for(int w=0;w<words;w++){
|
||||
word = array[w];
|
||||
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
|
||||
if ( shmem_my_pe() != root ) {
|
||||
array[w] = word;
|
||||
}
|
||||
shmem_barrier_all();
|
||||
}
|
||||
|
||||
}
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
|
||||
static uint32_t word;
|
||||
uint32_t *array = (uint32_t *) data;
|
||||
assert( (bytes % 4)==0);
|
||||
int words = bytes/4;
|
||||
|
||||
for(int w=0;w<words;w++){
|
||||
word = array[w];
|
||||
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
|
||||
if ( shmem_my_pe() != root ) {
|
||||
array[w]= word;
|
||||
}
|
||||
shmem_barrier_all();
|
||||
}
|
||||
}
|
||||
|
||||
int CartesianCommunicator::RankWorld(void){
|
||||
return shmem_my_pe();
|
||||
}
|
||||
|
||||
}
|
||||
|
92
lib/communicator/SharedMemory.cc
Normal file
92
lib/communicator/SharedMemory.cc
Normal file
@ -0,0 +1,92 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/SharedMemory.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
// static data
|
||||
|
||||
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
|
||||
int GlobalSharedMemory::Hugepages = 0;
|
||||
int GlobalSharedMemory::_ShmSetup;
|
||||
int GlobalSharedMemory::_ShmAlloc;
|
||||
uint64_t GlobalSharedMemory::_ShmAllocBytes;
|
||||
|
||||
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
|
||||
|
||||
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
|
||||
int GlobalSharedMemory::WorldShmRank;
|
||||
int GlobalSharedMemory::WorldShmSize;
|
||||
std::vector<int> GlobalSharedMemory::WorldShmRanks;
|
||||
|
||||
Grid_MPI_Comm GlobalSharedMemory::WorldComm;
|
||||
int GlobalSharedMemory::WorldSize;
|
||||
int GlobalSharedMemory::WorldRank;
|
||||
|
||||
int GlobalSharedMemory::WorldNodes;
|
||||
int GlobalSharedMemory::WorldNode;
|
||||
|
||||
void GlobalSharedMemory::SharedMemoryFree(void)
|
||||
{
|
||||
assert(_ShmAlloc);
|
||||
assert(_ShmAllocBytes>0);
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
munmap(WorldShmCommBufs[r],_ShmAllocBytes);
|
||||
}
|
||||
_ShmAlloc = 0;
|
||||
_ShmAllocBytes = 0;
|
||||
}
|
||||
/////////////////////////////////
|
||||
// Alloc, free shmem region
|
||||
/////////////////////////////////
|
||||
void *SharedMemory::ShmBufferMalloc(size_t bytes){
|
||||
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
|
||||
void *ptr = (void *)heap_top;
|
||||
heap_top += bytes;
|
||||
heap_bytes+= bytes;
|
||||
if (heap_bytes >= heap_size) {
|
||||
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
|
||||
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
|
||||
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
|
||||
assert(heap_bytes<heap_size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void SharedMemory::ShmBufferFreeAll(void) {
|
||||
heap_top =(size_t)ShmBufferSelf();
|
||||
heap_bytes=0;
|
||||
}
|
||||
void *SharedMemory::ShmBufferSelf(void)
|
||||
{
|
||||
return ShmCommBufs[ShmRank];
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
165
lib/communicator/SharedMemory.h
Normal file
165
lib/communicator/SharedMemory.h
Normal file
@ -0,0 +1,165 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/SharedMemory.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
|
||||
// TODO
|
||||
// 1) move includes into SharedMemory.cc
|
||||
//
|
||||
// 2) split shared memory into a) optimal communicator creation from comm world
|
||||
//
|
||||
// b) shared memory buffers container
|
||||
// -- static globally shared; init once
|
||||
// -- per instance set of buffers.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
#if defined (GRID_COMMS_MPI3)
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#include <semaphore.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/ipc.h>
|
||||
#include <sys/shm.h>
|
||||
#include <sys/mman.h>
|
||||
#include <zlib.h>
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
#include <numaif.h>
|
||||
#endif
|
||||
|
||||
namespace Grid {
|
||||
|
||||
#if defined (GRID_COMMS_MPI3)
|
||||
typedef MPI_Comm Grid_MPI_Comm;
|
||||
typedef MPI_Request CommsRequest_t;
|
||||
#else
|
||||
typedef int CommsRequest_t;
|
||||
typedef int Grid_MPI_Comm;
|
||||
#endif
|
||||
|
||||
class GlobalSharedMemory {
|
||||
private:
|
||||
static const int MAXLOG2RANKSPERNODE = 16;
|
||||
|
||||
// Init once lock on the buffer allocation
|
||||
static int _ShmSetup;
|
||||
static int _ShmAlloc;
|
||||
static uint64_t _ShmAllocBytes;
|
||||
|
||||
public:
|
||||
static int ShmSetup(void) { return _ShmSetup; }
|
||||
static int ShmAlloc(void) { return _ShmAlloc; }
|
||||
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
|
||||
static uint64_t MAX_MPI_SHM_BYTES;
|
||||
static int Hugepages;
|
||||
|
||||
static std::vector<void *> WorldShmCommBufs;
|
||||
|
||||
static Grid_MPI_Comm WorldComm;
|
||||
static int WorldRank;
|
||||
static int WorldSize;
|
||||
|
||||
static Grid_MPI_Comm WorldShmComm;
|
||||
static int WorldShmRank;
|
||||
static int WorldShmSize;
|
||||
|
||||
static int WorldNodes;
|
||||
static int WorldNode;
|
||||
|
||||
static std::vector<int> WorldShmRanks;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
|
||||
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
|
||||
///////////////////////////////////////////////////
|
||||
// Provide shared memory facilities off comm world
|
||||
///////////////////////////////////////////////////
|
||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||
static void SharedMemoryFree(void);
|
||||
|
||||
};
|
||||
|
||||
//////////////////////////////
|
||||
// one per communicator
|
||||
//////////////////////////////
|
||||
class SharedMemory
|
||||
{
|
||||
private:
|
||||
static const int MAXLOG2RANKSPERNODE = 16;
|
||||
|
||||
size_t heap_top;
|
||||
size_t heap_bytes;
|
||||
size_t heap_size;
|
||||
|
||||
protected:
|
||||
|
||||
Grid_MPI_Comm ShmComm; // for barriers
|
||||
int ShmRank;
|
||||
int ShmSize;
|
||||
std::vector<void *> ShmCommBufs;
|
||||
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
|
||||
|
||||
public:
|
||||
SharedMemory() {};
|
||||
~SharedMemory();
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// set the buffers & sizes
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
void SetCommunicator(Grid_MPI_Comm comm);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// For this instance ; disjoint buffer sets between splits if split grid
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void ShmBarrier(void);
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Call on any instance
|
||||
///////////////////////////////////////////////////
|
||||
void SharedMemoryTest(void);
|
||||
void *ShmBufferSelf(void);
|
||||
void *ShmBuffer (int rank);
|
||||
void *ShmBufferTranslate(int rank,void * local_p);
|
||||
void *ShmBufferMalloc(size_t bytes);
|
||||
void ShmBufferFreeAll(void) ;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Make info on Nodes & ranks and Shared memory available
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
|
||||
int RankCount(void) { return GlobalSharedMemory::WorldSize;};
|
||||
|
||||
};
|
||||
|
||||
}
|
410
lib/communicator/SharedMemoryMPI.cc
Normal file
410
lib/communicator/SharedMemoryMPI.cc
Normal file
@ -0,0 +1,410 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/SharedMemory.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
/*Construct from an MPI communicator*/
|
||||
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
{
|
||||
assert(_ShmSetup==0);
|
||||
WorldComm = comm;
|
||||
MPI_Comm_rank(WorldComm,&WorldRank);
|
||||
MPI_Comm_size(WorldComm,&WorldSize);
|
||||
// WorldComm, WorldSize, WorldRank
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Split into groups that can share memory
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
|
||||
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
|
||||
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
||||
// WorldShmComm, WorldShmSize, WorldShmRank
|
||||
|
||||
// WorldNodes
|
||||
WorldNodes = WorldSize/WorldShmSize;
|
||||
assert( (WorldNodes * WorldShmSize) == WorldSize );
|
||||
|
||||
// FIXME: Check all WorldShmSize are the same ?
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// find world ranks in our SHM group (i.e. which ranks are on our node)
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
MPI_Group WorldGroup, ShmGroup;
|
||||
MPI_Comm_group (WorldComm, &WorldGroup);
|
||||
MPI_Comm_group (WorldShmComm, &ShmGroup);
|
||||
|
||||
std::vector<int> world_ranks(WorldSize); for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
|
||||
|
||||
WorldShmRanks.resize(WorldSize);
|
||||
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Identify who is in my group and nominate the leader
|
||||
///////////////////////////////////////////////////////////////////
|
||||
int g=0;
|
||||
std::vector<int> MyGroup;
|
||||
MyGroup.resize(WorldShmSize);
|
||||
for(int rank=0;rank<WorldSize;rank++){
|
||||
if(WorldShmRanks[rank]!=MPI_UNDEFINED){
|
||||
assert(g<WorldShmSize);
|
||||
MyGroup[g++] = rank;
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
|
||||
int myleader = MyGroup[0];
|
||||
|
||||
std::vector<int> leaders_1hot(WorldSize,0);
|
||||
std::vector<int> leaders_group(WorldNodes,0);
|
||||
leaders_1hot [ myleader ] = 1;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// global sum leaders over comm world
|
||||
///////////////////////////////////////////////////////////////////
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
|
||||
assert(ierr==0);
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// find the group leaders world rank
|
||||
///////////////////////////////////////////////////////////////////
|
||||
int group=0;
|
||||
for(int l=0;l<WorldSize;l++){
|
||||
if(leaders_1hot[l]){
|
||||
leaders_group[group++] = l;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Identify the node of the group in which I (and my leader) live
|
||||
///////////////////////////////////////////////////////////////////
|
||||
WorldNode=-1;
|
||||
for(int g=0;g<WorldNodes;g++){
|
||||
if (myleader == leaders_group[g]){
|
||||
WorldNode=g;
|
||||
}
|
||||
}
|
||||
assert(WorldNode!=-1);
|
||||
_ShmSetup=1;
|
||||
}
|
||||
|
||||
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Assert power of two shm_size.
|
||||
////////////////////////////////////////////////////////////////
|
||||
int log2size = -1;
|
||||
for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
|
||||
if ( (0x1<<i) == WorldShmSize ) {
|
||||
log2size = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(log2size != -1);
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Identify subblock of ranks on node spreading across dims
|
||||
// in a maximally symmetrical way
|
||||
////////////////////////////////////////////////////////////////
|
||||
int ndimension = processors.size();
|
||||
std::vector<int> processor_coor(ndimension);
|
||||
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
|
||||
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
|
||||
int dim = 0;
|
||||
for(int l2=0;l2<log2size;l2++){
|
||||
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
|
||||
ShmDims[dim]*=2;
|
||||
dim=(dim+1)%ndimension;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Establish torus of processes and nodes with sub-blockings
|
||||
////////////////////////////////////////////////////////////////
|
||||
for(int d=0;d<ndimension;d++){
|
||||
NodeDims[d] = WorldDims[d]/ShmDims[d];
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Check processor counts match
|
||||
////////////////////////////////////////////////////////////////
|
||||
int Nprocessors=1;
|
||||
for(int i=0;i<ndimension;i++){
|
||||
Nprocessors*=processors[i];
|
||||
}
|
||||
assert(WorldSize==Nprocessors);
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Establish mapping between lexico physics coord and WorldRank
|
||||
////////////////////////////////////////////////////////////////
|
||||
int rank;
|
||||
|
||||
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
|
||||
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
|
||||
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
|
||||
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Build the new communicator
|
||||
/////////////////////////////////////////////////////////////////
|
||||
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbfs mapping intended
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef GRID_MPI3_SHMMMAP
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// allocate the shared windows for our group
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
MPI_Barrier(WorldShmComm);
|
||||
WorldShmCommBufs.resize(WorldShmSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbf and others map filesystems as mappable huge pages
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
char shm_name [NAME_MAX];
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
|
||||
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
|
||||
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
|
||||
if ( fd == -1) {
|
||||
printf("open %s failed\n",shm_name);
|
||||
perror("open hugetlbfs");
|
||||
exit(0);
|
||||
}
|
||||
int mmap_flag = MAP_SHARED ;
|
||||
#ifdef MAP_POPULATE
|
||||
mmap_flag|=MAP_POPULATE;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if ( flags ) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
|
||||
if ( ptr == (void *)MAP_FAILED ) {
|
||||
printf("mmap %s failed\n",shm_name);
|
||||
perror("failed mmap"); assert(0);
|
||||
}
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
close(fd);
|
||||
WorldShmCommBufs[r] =ptr;
|
||||
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
|
||||
}
|
||||
_ShmAlloc=1;
|
||||
_ShmAllocBytes = bytes;
|
||||
};
|
||||
#endif // MMAP
|
||||
|
||||
#ifdef GRID_MPI3_SHMOPEN
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
|
||||
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
|
||||
// the posix shm virtual file system
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
MPI_Barrier(WorldShmComm);
|
||||
WorldShmCommBufs.resize(WorldShmSize);
|
||||
|
||||
char shm_name [NAME_MAX];
|
||||
if ( WorldShmRank == 0 ) {
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
|
||||
size_t size = bytes;
|
||||
|
||||
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
|
||||
|
||||
shm_unlink(shm_name);
|
||||
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
|
||||
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
|
||||
ftruncate(fd, size);
|
||||
|
||||
int mmap_flag = MAP_SHARED;
|
||||
#ifdef MAP_POPULATE
|
||||
mmap_flag |= MAP_POPULATE;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if (flags) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
||||
|
||||
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
|
||||
if ( ptr == (void * )MAP_FAILED ) {
|
||||
perror("failed mmap");
|
||||
assert(0);
|
||||
}
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
|
||||
WorldShmCommBufs[r] =ptr;
|
||||
close(fd);
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(WorldShmComm);
|
||||
|
||||
if ( WorldShmRank != 0 ) {
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
|
||||
size_t size = bytes ;
|
||||
|
||||
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
|
||||
|
||||
int fd=shm_open(shm_name,O_RDWR,0666);
|
||||
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
|
||||
|
||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
WorldShmCommBufs[r] =ptr;
|
||||
|
||||
close(fd);
|
||||
}
|
||||
}
|
||||
_ShmAlloc=1;
|
||||
_ShmAllocBytes = bytes;
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
////////////////////////////////////////////////////////
|
||||
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
{
|
||||
int rank, size;
|
||||
MPI_Comm_rank(comm,&rank);
|
||||
MPI_Comm_size(comm,&size);
|
||||
ShmRanks.resize(size);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Split into groups that can share memory
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
|
||||
MPI_Comm_rank(ShmComm ,&ShmRank);
|
||||
MPI_Comm_size(ShmComm ,&ShmSize);
|
||||
ShmCommBufs.resize(ShmSize);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Map ShmRank to WorldShmRank and use the right buffer
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
assert (GlobalSharedMemory::ShmAlloc()==1);
|
||||
heap_size = GlobalSharedMemory::ShmAllocBytes();
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
|
||||
uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
|
||||
|
||||
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
||||
|
||||
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
||||
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
|
||||
}
|
||||
ShmBufferFreeAll();
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// find comm ranks in our SHM group (i.e. which ranks are on our node)
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
MPI_Group FullGroup, ShmGroup;
|
||||
MPI_Comm_group (comm , &FullGroup);
|
||||
MPI_Comm_group (ShmComm, &ShmGroup);
|
||||
|
||||
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
|
||||
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// On node barrier
|
||||
//////////////////////////////////////////////////////////////////
|
||||
void SharedMemory::ShmBarrier(void)
|
||||
{
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Test the shared memory is working
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void SharedMemory::SharedMemoryTest(void)
|
||||
{
|
||||
ShmBarrier();
|
||||
if ( ShmRank == 0 ) {
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
uint64_t * check = (uint64_t *) ShmCommBufs[r];
|
||||
check[0] = GlobalSharedMemory::WorldNode;
|
||||
check[1] = r;
|
||||
check[2] = 0x5A5A5A;
|
||||
}
|
||||
}
|
||||
ShmBarrier();
|
||||
for(int r=0;r<ShmSize;r++){
|
||||
uint64_t * check = (uint64_t *) ShmCommBufs[r];
|
||||
|
||||
assert(check[0]==GlobalSharedMemory::WorldNode);
|
||||
assert(check[1]==r);
|
||||
assert(check[2]==0x5A5A5A);
|
||||
|
||||
}
|
||||
ShmBarrier();
|
||||
}
|
||||
|
||||
void *SharedMemory::ShmBuffer(int rank)
|
||||
{
|
||||
int gpeer = ShmRanks[rank];
|
||||
if (gpeer == MPI_UNDEFINED){
|
||||
return NULL;
|
||||
} else {
|
||||
return ShmCommBufs[gpeer];
|
||||
}
|
||||
}
|
||||
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
|
||||
{
|
||||
static int count =0;
|
||||
int gpeer = ShmRanks[rank];
|
||||
assert(gpeer!=ShmRank); // never send to self
|
||||
if (gpeer == MPI_UNDEFINED){
|
||||
return NULL;
|
||||
} else {
|
||||
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
|
||||
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
|
||||
return (void *) remote;
|
||||
}
|
||||
}
|
||||
SharedMemory::~SharedMemory()
|
||||
{
|
||||
int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
|
||||
if ( !MPI_is_finalised ) {
|
||||
MPI_Comm_free(&ShmComm);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
128
lib/communicator/SharedMemoryNone.cc
Normal file
128
lib/communicator/SharedMemoryNone.cc
Normal file
@ -0,0 +1,128 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/communicator/SharedMemory.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/GridCore.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
/*Construct from an MPI communicator*/
|
||||
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
{
|
||||
assert(_ShmSetup==0);
|
||||
WorldComm = 0;
|
||||
WorldRank = 0;
|
||||
WorldSize = 1;
|
||||
WorldShmComm = 0 ;
|
||||
WorldShmRank = 0 ;
|
||||
WorldShmSize = 1 ;
|
||||
WorldNodes = 1 ;
|
||||
WorldNode = 0 ;
|
||||
WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
|
||||
WorldShmCommBufs.resize(1);
|
||||
_ShmSetup=1;
|
||||
}
|
||||
|
||||
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
|
||||
{
|
||||
optimal_comm = WorldComm;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Hugetlbfs mapping intended, use anonymous mmap
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
void * ShmCommBuf ;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
int mmap_flag =0;
|
||||
#ifdef MAP_ANONYMOUS
|
||||
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
|
||||
#endif
|
||||
#ifdef MAP_ANON
|
||||
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
|
||||
#endif
|
||||
#ifdef MAP_HUGETLB
|
||||
if ( flags ) mmap_flag |= MAP_HUGETLB;
|
||||
#endif
|
||||
ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
|
||||
if (ShmCommBuf == (void *)MAP_FAILED) {
|
||||
perror("mmap failed ");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
#ifdef MADV_HUGEPAGE
|
||||
if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
|
||||
#endif
|
||||
bzero(ShmCommBuf,bytes);
|
||||
WorldShmCommBufs[0] = ShmCommBuf;
|
||||
_ShmAllocBytes=bytes;
|
||||
_ShmAlloc=1;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
////////////////////////////////////////////////////////
|
||||
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
{
|
||||
assert(GlobalSharedMemory::ShmAlloc()==1);
|
||||
ShmRanks.resize(1);
|
||||
ShmCommBufs.resize(1);
|
||||
ShmRanks[0] = 0;
|
||||
ShmRank = 0;
|
||||
ShmSize = 1;
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Map ShmRank to WorldShmRank and use the right buffer
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
|
||||
heap_size = GlobalSharedMemory::ShmAllocBytes();
|
||||
ShmBufferFreeAll();
|
||||
return;
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// On node barrier
|
||||
//////////////////////////////////////////////////////////////////
|
||||
void SharedMemory::ShmBarrier(void){ return ; }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Test the shared memory is working
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void SharedMemory::SharedMemoryTest(void) { return; }
|
||||
|
||||
void *SharedMemory::ShmBuffer(int rank)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
SharedMemory::~SharedMemory()
|
||||
{};
|
||||
|
||||
}
|
@ -86,6 +86,7 @@ protected:
|
||||
Colours &Painter;
|
||||
int active;
|
||||
int timing_mode;
|
||||
int topWidth{-1};
|
||||
static int timestamp;
|
||||
std::string name, topName;
|
||||
std::string COLOUR;
|
||||
@ -124,11 +125,17 @@ public:
|
||||
Reset();
|
||||
}
|
||||
}
|
||||
void setTopWidth(const int w) {topWidth = w;}
|
||||
|
||||
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
||||
|
||||
if ( log.active ) {
|
||||
stream << log.background()<< std::left << log.topName << log.background()<< " : ";
|
||||
stream << log.background()<< std::left;
|
||||
if (log.topWidth > 0)
|
||||
{
|
||||
stream << std::setw(log.topWidth);
|
||||
}
|
||||
stream << log.topName << log.background()<< " : ";
|
||||
stream << log.colour() << std::left << log.name << log.background() << " : ";
|
||||
if ( log.timestamp ) {
|
||||
log.StopWatch->Stop();
|
||||
|
@ -39,6 +39,7 @@ namespace QCD {
|
||||
static const int Zdir = 2;
|
||||
static const int Tdir = 3;
|
||||
|
||||
|
||||
static const int Xp = 0;
|
||||
static const int Yp = 1;
|
||||
static const int Zp = 2;
|
||||
@ -420,15 +421,16 @@ namespace QCD {
|
||||
//////////////////////////////////////////////
|
||||
// Fermion <-> propagator assignements
|
||||
//////////////////////////////////////////////
|
||||
template <class Prop, class Ferm>
|
||||
void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
|
||||
//template <class Prop, class Ferm>
|
||||
template <class Fimpl>
|
||||
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
|
||||
{
|
||||
for(int j = 0; j < Ns; ++j)
|
||||
for(int j = 0; j < Ns; ++j)
|
||||
{
|
||||
auto pjs = peekSpin(p, j, s);
|
||||
auto fj = peekSpin(f, j);
|
||||
|
||||
for(int i = 0; i < Nc; ++i)
|
||||
for(int i = 0; i < Fimpl::Dimension; ++i)
|
||||
{
|
||||
pokeColour(pjs, peekColour(fj, i), i, c);
|
||||
}
|
||||
@ -436,15 +438,16 @@ namespace QCD {
|
||||
}
|
||||
}
|
||||
|
||||
template <class Prop, class Ferm>
|
||||
void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
|
||||
//template <class Prop, class Ferm>
|
||||
template <class Fimpl>
|
||||
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
|
||||
{
|
||||
for(int j = 0; j < Ns; ++j)
|
||||
{
|
||||
auto pjs = peekSpin(p, j, s);
|
||||
auto fj = peekSpin(f, j);
|
||||
|
||||
for(int i = 0; i < Nc; ++i)
|
||||
for(int i = 0; i < Fimpl::Dimension; ++i)
|
||||
{
|
||||
pokeColour(fj, peekColour(pjs, i, c), i);
|
||||
}
|
||||
@ -492,41 +495,17 @@ namespace QCD {
|
||||
return traceIndex<ColourIndex>(lhs);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// Current types
|
||||
//////////////////////////////////////////
|
||||
GRID_SERIALIZABLE_ENUM(Current, undef,
|
||||
Vector, 0,
|
||||
Axial, 1,
|
||||
Tadpole, 2);
|
||||
|
||||
} //namespace QCD
|
||||
} // Grid
|
||||
|
||||
/*
|
||||
<<<<<<< HEAD
|
||||
#include <Grid/qcd/utils/SpaceTimeGrid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/spin/TwoSpinor.h>
|
||||
#include <Grid/qcd/utils/LinalgUtils.h>
|
||||
#include <Grid/qcd/utils/CovariantCshift.h>
|
||||
|
||||
// Include representations
|
||||
#include <Grid/qcd/utils/SUn.h>
|
||||
#include <Grid/qcd/utils/SUnAdjoint.h>
|
||||
#include <Grid/qcd/utils/SUnTwoIndex.h>
|
||||
#include <Grid/qcd/representations/hmc_types.h>
|
||||
|
||||
// Scalar field
|
||||
#include <Grid/qcd/utils/ScalarObjs.h>
|
||||
|
||||
#include <Grid/qcd/action/Actions.h>
|
||||
|
||||
#include <Grid/qcd/smearing/Smearing.h>
|
||||
|
||||
#include <Grid/qcd/hmc/integrators/Integrator.h>
|
||||
#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
|
||||
#include <Grid/qcd/observables/hmc_observable.h>
|
||||
#include <Grid/qcd/hmc/HMC.h>
|
||||
|
||||
|
||||
//#include <Grid/qcd/modules/mods.h>
|
||||
=======
|
||||
|
||||
>>>>>>> develop
|
||||
*/
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -73,7 +73,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
|
||||
this->DW(psi,tmp_f,DaggerYes);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
|
||||
axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -50,11 +50,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
////////////////////////////////////////////
|
||||
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
|
||||
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
|
||||
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||
//#include <Grid/qcd/action/fermion/CloverFermion.h>
|
||||
|
||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
|
||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
||||
|
||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
|
||||
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
@ -104,10 +106,33 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermi
|
||||
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
|
||||
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
|
||||
|
||||
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
|
||||
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
|
||||
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
|
||||
|
||||
// Twisted mass fermion
|
||||
typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
|
||||
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
|
||||
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
|
||||
|
||||
// Clover fermions
|
||||
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
|
||||
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
|
||||
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
|
||||
|
||||
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
|
||||
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
|
||||
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
|
||||
|
||||
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
|
||||
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
|
||||
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
|
||||
|
||||
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
|
||||
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
|
||||
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
|
||||
|
||||
// Domain Wall fermions
|
||||
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
|
||||
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
|
||||
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
|
||||
|
@ -70,7 +70,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
|
||||
#define TwoIndexFermOpTemplateInstantiate(A) \
|
||||
template class A<WilsonTwoIndexSymmetricImplF>; \
|
||||
template class A<WilsonTwoIndexSymmetricImplD>;
|
||||
template class A<WilsonTwoIndexSymmetricImplD>; \
|
||||
template class A<WilsonTwoIndexAntiSymmetricImplF>; \
|
||||
template class A<WilsonTwoIndexAntiSymmetricImplD>;
|
||||
|
||||
#define FermOp5dVecTemplateInstantiate(A) \
|
||||
template class A<DomainWallVec5dImplF>; \
|
||||
|
@ -113,6 +113,21 @@ namespace Grid {
|
||||
///////////////////////////////////////////////
|
||||
virtual void ImportGauge(const GaugeField & _U)=0;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Conserved currents, either contract at sink or insert sequentially.
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
virtual void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)=0;
|
||||
virtual void SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax)=0;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -164,6 +164,7 @@ namespace QCD {
|
||||
public:
|
||||
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
static const bool LsVectorised=false;
|
||||
static const int Nhcs = Options::Nhcs;
|
||||
|
||||
@ -212,6 +213,13 @@ namespace QCD {
|
||||
StencilImpl &St) {
|
||||
mult(&phi(), &U(mu), &chi());
|
||||
}
|
||||
|
||||
inline void multLinkProp(SitePropagator &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SitePropagator &chi,
|
||||
int mu) {
|
||||
mult(&phi(), &U(mu), &chi());
|
||||
}
|
||||
|
||||
template <class ref>
|
||||
inline void loadLinkElement(Simd ®, ref &memory) {
|
||||
@ -254,8 +262,22 @@ namespace QCD {
|
||||
GaugeLinkField link(mat._grid);
|
||||
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
|
||||
PokeIndex<LorentzIndex>(mat,link,mu);
|
||||
}
|
||||
}
|
||||
|
||||
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
|
||||
mat = outerProduct(B,A);
|
||||
}
|
||||
|
||||
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
|
||||
mat = TraceIndex<SpinIndex>(P);
|
||||
}
|
||||
|
||||
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
|
||||
}
|
||||
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
|
||||
|
||||
int Ls=Btilde._grid->_fdimensions[0];
|
||||
@ -277,27 +299,28 @@ namespace QCD {
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour four spinors with colour index, 5d redblack
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
|
||||
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
|
||||
template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
|
||||
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > {
|
||||
public:
|
||||
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
static const int Dimension = Nrepresentation;
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
static const bool LsVectorised=true;
|
||||
static const int Nhcs = Options::Nhcs;
|
||||
|
||||
typedef typename Options::_Coeff_t Coeff_t;
|
||||
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
|
||||
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
|
||||
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
|
||||
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
|
||||
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
@ -333,14 +356,27 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
|
||||
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
|
||||
StencilImpl &St) {
|
||||
SiteGaugeLink UU;
|
||||
for (int i = 0; i < Nrepresentation; i++) {
|
||||
for (int j = 0; j < Nrepresentation; j++) {
|
||||
for (int i = 0; i < Dimension; i++) {
|
||||
for (int j = 0; j < Dimension; j++) {
|
||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
||||
}
|
||||
}
|
||||
mult(&phi(), &UU(), &chi());
|
||||
}
|
||||
|
||||
|
||||
inline void multLinkProp(SitePropagator &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SitePropagator &chi,
|
||||
int mu) {
|
||||
SiteGaugeLink UU;
|
||||
for (int i = 0; i < Dimension; i++) {
|
||||
for (int j = 0; j < Dimension; j++) {
|
||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
||||
}
|
||||
}
|
||||
mult(&phi(), &UU(), &chi());
|
||||
}
|
||||
|
||||
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||
{
|
||||
SiteScalarGaugeField ScalarUmu;
|
||||
@ -373,6 +409,19 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) {
|
||||
|
||||
assert(0);
|
||||
@ -425,25 +474,26 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Flavour doubled spinors; is Gparity the only? what about C*?
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
template <class S, int Nrepresentation, class Options=CoeffReal>
|
||||
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
|
||||
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
|
||||
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
|
||||
public:
|
||||
|
||||
static const int Dimension = Nrepresentation;
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
static const int Nhcs = Options::Nhcs;
|
||||
static const bool LsVectorised=false;
|
||||
|
||||
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
|
||||
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
typedef typename Options::_Coeff_t Coeff_t;
|
||||
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
||||
|
||||
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
|
||||
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp>;
|
||||
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
|
||||
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
|
||||
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>;
|
||||
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>;
|
||||
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>;
|
||||
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
@ -537,7 +587,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Fixme: Gparity prop * link
|
||||
inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U,
|
||||
const SitePropagator &chi, int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template <class ref>
|
||||
inline void loadLinkElement(Simd ®, ref &memory) {
|
||||
@ -611,6 +666,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
|
||||
return;
|
||||
}
|
||||
|
||||
inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
|
||||
//mat = outerProduct(Btilde, A);
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
|
||||
assert(0);
|
||||
/*
|
||||
auto tmp = TraceIndex<SpinIndex>(P);
|
||||
parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
|
||||
mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) {
|
||||
|
||||
int Ls = Btilde._grid->_fdimensions[0];
|
||||
@ -640,6 +714,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
|
||||
typedef RealD _Coeff_t ;
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
static const bool LsVectorised=false;
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||
|
||||
@ -751,8 +826,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
GaugeLinkField link(mat._grid);
|
||||
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
|
||||
PokeIndex<LorentzIndex>(mat,link,mu);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
|
||||
assert (0);
|
||||
// Must never hit
|
||||
@ -768,6 +843,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
public:
|
||||
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
static const bool LsVectorised=true;
|
||||
typedef RealD Coeff_t ;
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||
@ -958,29 +1034,33 @@ typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation, CoeffReal > Wilso
|
||||
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF; // Float
|
||||
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
|
||||
typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR; // Real.. whichever prec
|
||||
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
|
||||
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
|
||||
|
||||
typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
|
||||
typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF; // Float
|
||||
typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD; // Double
|
||||
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
|
||||
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float
|
||||
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double
|
||||
|
||||
typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
|
||||
typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH; // Float
|
||||
typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF; // Double
|
||||
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
|
||||
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
|
||||
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
|
||||
|
||||
typedef StaggeredImpl<vComplex, FundamentalRepresentation > StaggeredImplR; // Real.. whichever prec
|
||||
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float
|
||||
|
@ -393,6 +393,31 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Conserved current - not yet implemented.
|
||||
////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
|
||||
|
||||
//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
|
||||
|
@ -157,6 +157,22 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu);
|
||||
void SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax);
|
||||
};
|
||||
|
||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
||||
|
@ -405,6 +405,30 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
|
||||
MooeeInv(in, out);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Conserved current - not yet implemented.
|
||||
////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
|
||||
FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
|
||||
|
@ -170,6 +170,21 @@ namespace QCD {
|
||||
// Comms buffer
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu);
|
||||
void SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax);
|
||||
};
|
||||
|
||||
}}
|
||||
|
243
lib/qcd/action/fermion/WilsonCloverFermion.cc
Normal file
243
lib/qcd/action/fermion/WilsonCloverFermion.cc
Normal file
@ -0,0 +1,243 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/Eigen/Dense>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
|
||||
namespace Grid
|
||||
{
|
||||
namespace QCD
|
||||
{
|
||||
|
||||
// *NOT* EO
|
||||
template <class Impl>
|
||||
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
|
||||
{
|
||||
FermionField temp(out._grid);
|
||||
|
||||
// Wilson term
|
||||
out.checkerboard = in.checkerboard;
|
||||
this->Dhop(in, out, DaggerNo);
|
||||
|
||||
// Clover term
|
||||
Mooee(in, temp);
|
||||
|
||||
out += temp;
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
FermionField temp(out._grid);
|
||||
|
||||
// Wilson term
|
||||
out.checkerboard = in.checkerboard;
|
||||
this->Dhop(in, out, DaggerYes);
|
||||
|
||||
// Clover term
|
||||
MooeeDag(in, temp);
|
||||
|
||||
out += temp;
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
{
|
||||
WilsonFermion<Impl>::ImportGauge(_Umu);
|
||||
GridBase *grid = _Umu._grid;
|
||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||
|
||||
// Compute the field strength terms mu>nu
|
||||
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
|
||||
|
||||
// Compute the Clover Operator acting on Colour and Spin
|
||||
// multiply here by the clover coefficients for the anisotropy
|
||||
CloverTerm = fillCloverYZ(Bx) * csw_r;
|
||||
CloverTerm += fillCloverXZ(By) * csw_r;
|
||||
CloverTerm += fillCloverXY(Bz) * csw_r;
|
||||
CloverTerm += fillCloverXT(Ex) * csw_t;
|
||||
CloverTerm += fillCloverYT(Ey) * csw_t;
|
||||
CloverTerm += fillCloverZT(Ez) * csw_t;
|
||||
CloverTerm += diag_mass;
|
||||
|
||||
int lvol = _Umu._grid->lSites();
|
||||
int DimRep = Impl::Dimension;
|
||||
|
||||
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
|
||||
std::vector<int> lcoor;
|
||||
typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
|
||||
|
||||
for (int site = 0; site < lvol; site++)
|
||||
{
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
peekLocalSite(Qx, CloverTerm, lcoor);
|
||||
Qxinv = zero;
|
||||
//if (csw!=0){
|
||||
for (int j = 0; j < Ns; j++)
|
||||
for (int k = 0; k < Ns; k++)
|
||||
for (int a = 0; a < DimRep; a++)
|
||||
for (int b = 0; b < DimRep; b++)
|
||||
EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
|
||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
|
||||
|
||||
EigenInvCloverOp = EigenCloverOp.inverse();
|
||||
//std::cout << EigenInvCloverOp << std::endl;
|
||||
for (int j = 0; j < Ns; j++)
|
||||
for (int k = 0; k < Ns; k++)
|
||||
for (int a = 0; a < DimRep; a++)
|
||||
for (int b = 0; b < DimRep; b++)
|
||||
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||
// }
|
||||
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
|
||||
}
|
||||
|
||||
// Separate the even and odd parts
|
||||
pickCheckerboard(Even, CloverTermEven, CloverTerm);
|
||||
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
|
||||
|
||||
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
|
||||
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
|
||||
|
||||
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
|
||||
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
|
||||
|
||||
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
|
||||
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerNo, InverseNo);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerYes, InverseNo);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
|
||||
{
|
||||
out.checkerboard = in.checkerboard;
|
||||
CloverFieldType *Clover;
|
||||
assert(in.checkerboard == Odd || in.checkerboard == Even);
|
||||
|
||||
if (dag)
|
||||
{
|
||||
if (in._grid->_isCheckerBoarded)
|
||||
{
|
||||
if (in.checkerboard == Odd)
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
|
||||
}
|
||||
out = *Clover * in;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInv : &CloverTerm;
|
||||
out = adj(*Clover) * in;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (in._grid->_isCheckerBoarded)
|
||||
{
|
||||
|
||||
if (in.checkerboard == Odd)
|
||||
{
|
||||
// std::cout << "Calling clover term Odd" << std::endl;
|
||||
Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
|
||||
}
|
||||
else
|
||||
{
|
||||
// std::cout << "Calling clover term Even" << std::endl;
|
||||
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
|
||||
}
|
||||
out = *Clover * in;
|
||||
// std::cout << GridLogMessage << "*Clover.checkerboard " << (*Clover).checkerboard << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInv : &CloverTerm;
|
||||
out = *Clover * in;
|
||||
}
|
||||
}
|
||||
|
||||
} // MooeeInternal
|
||||
|
||||
|
||||
// Derivative parts
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
// Derivative parts
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||
{
|
||||
assert(0); // not implemented yet
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
}
|
||||
}
|
366
lib/qcd/action/fermion/WilsonCloverFermion.h
Normal file
366
lib/qcd/action/fermion/WilsonCloverFermion.h
Normal file
@ -0,0 +1,366 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: David Preti <>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
|
||||
#define GRID_QCD_WILSON_CLOVER_FERMION_H
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
namespace Grid
|
||||
{
|
||||
namespace QCD
|
||||
{
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Wilson Clover
|
||||
//
|
||||
// Operator ( with anisotropy coefficients):
|
||||
//
|
||||
// Q = 1 + (Nd-1)/xi_0 + m
|
||||
// + W_t + (nu/xi_0) * W_s
|
||||
// - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_s/xi_0) * sum_ss (sigma_ss F_ss) ]
|
||||
//
|
||||
// s spatial, t temporal directions.
|
||||
// where W_t and W_s are the temporal and spatial components of the
|
||||
// Wilson Dirac operator
|
||||
//
|
||||
// csw_r = csw_t to recover the isotropic version
|
||||
//////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
class WilsonCloverFermion : public WilsonFermion<Impl>
|
||||
{
|
||||
public:
|
||||
// Types definitions
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
template <typename vtype>
|
||||
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
|
||||
typedef iImplClover<Simd> SiteCloverType;
|
||||
typedef Lattice<SiteCloverType> CloverFieldType;
|
||||
|
||||
public:
|
||||
typedef WilsonFermion<Impl> WilsonBase;
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
// Constructors
|
||||
WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r = 0.0,
|
||||
const RealD _csw_t = 0.0,
|
||||
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
|
||||
const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
|
||||
Fgrid,
|
||||
Hgrid,
|
||||
_mass, impl_p, clover_anisotropy),
|
||||
CloverTerm(&Fgrid),
|
||||
CloverTermInv(&Fgrid),
|
||||
CloverTermEven(&Hgrid),
|
||||
CloverTermOdd(&Hgrid),
|
||||
CloverTermInvEven(&Hgrid),
|
||||
CloverTermInvOdd(&Hgrid),
|
||||
CloverTermDagEven(&Hgrid),
|
||||
CloverTermDagOdd(&Hgrid),
|
||||
CloverTermInvDagEven(&Hgrid),
|
||||
CloverTermInvDagOdd(&Hgrid)
|
||||
{
|
||||
assert(Nd == 4); // require 4 dimensions
|
||||
|
||||
if (clover_anisotropy.isAnisotropic)
|
||||
{
|
||||
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
|
||||
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
|
||||
}
|
||||
else
|
||||
{
|
||||
csw_r = _csw_r * 0.5;
|
||||
diag_mass = 4.0 + _mass;
|
||||
}
|
||||
csw_t = _csw_t * 0.5;
|
||||
|
||||
if (csw_r == 0)
|
||||
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
|
||||
if (csw_t == 0)
|
||||
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
|
||||
|
||||
ImportGauge(_Umu);
|
||||
}
|
||||
|
||||
virtual RealD M(const FermionField &in, FermionField &out);
|
||||
virtual RealD Mdag(const FermionField &in, FermionField &out);
|
||||
|
||||
virtual void Mooee(const FermionField &in, FermionField &out);
|
||||
virtual void MooeeDag(const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInv(const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInvDag(const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
|
||||
|
||||
//virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||
virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||
virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||
|
||||
void ImportGauge(const GaugeField &_Umu);
|
||||
|
||||
// Derivative parts unpreconditioned pseudofermions
|
||||
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
|
||||
{
|
||||
conformable(X._grid, Y._grid);
|
||||
conformable(X._grid, force._grid);
|
||||
GaugeLinkField force_mu(force._grid), lambda(force._grid);
|
||||
GaugeField clover_force(force._grid);
|
||||
PropagatorField Lambda(force._grid);
|
||||
|
||||
// Guido: Here we are hitting some performance issues:
|
||||
// need to extract the components of the DoubledGaugeField
|
||||
// for each call
|
||||
// Possible solution
|
||||
// Create a vector object to store them? (cons: wasting space)
|
||||
std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
|
||||
|
||||
Impl::extractLinkField(U, this->Umu);
|
||||
|
||||
force = zero;
|
||||
// Derivative of the Wilson hopping term
|
||||
this->DhopDeriv(force, X, Y, dag);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Clover term derivative
|
||||
///////////////////////////////////////////////////////////
|
||||
Impl::outerProductImpl(Lambda, X, Y);
|
||||
//std::cout << "Lambda:" << Lambda << std::endl;
|
||||
|
||||
Gamma::Algebra sigma[] = {
|
||||
Gamma::Algebra::SigmaXY,
|
||||
Gamma::Algebra::SigmaXZ,
|
||||
Gamma::Algebra::SigmaXT,
|
||||
Gamma::Algebra::MinusSigmaXY,
|
||||
Gamma::Algebra::SigmaYZ,
|
||||
Gamma::Algebra::SigmaYT,
|
||||
Gamma::Algebra::MinusSigmaXZ,
|
||||
Gamma::Algebra::MinusSigmaYZ,
|
||||
Gamma::Algebra::SigmaZT,
|
||||
Gamma::Algebra::MinusSigmaXT,
|
||||
Gamma::Algebra::MinusSigmaYT,
|
||||
Gamma::Algebra::MinusSigmaZT};
|
||||
|
||||
/*
|
||||
sigma_{\mu \nu}=
|
||||
| 0 sigma[0] sigma[1] sigma[2] |
|
||||
| sigma[3] 0 sigma[4] sigma[5] |
|
||||
| sigma[6] sigma[7] 0 sigma[8] |
|
||||
| sigma[9] sigma[10] sigma[11] 0 |
|
||||
*/
|
||||
|
||||
int count = 0;
|
||||
clover_force = zero;
|
||||
for (int mu = 0; mu < 4; mu++)
|
||||
{
|
||||
force_mu = zero;
|
||||
for (int nu = 0; nu < 4; nu++)
|
||||
{
|
||||
if (mu == nu)
|
||||
continue;
|
||||
|
||||
RealD factor;
|
||||
if (nu == 4 || mu == 4)
|
||||
{
|
||||
factor = 2.0 * csw_t;
|
||||
}
|
||||
else
|
||||
{
|
||||
factor = 2.0 * csw_r;
|
||||
}
|
||||
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
|
||||
count++;
|
||||
}
|
||||
|
||||
pokeLorentz(clover_force, U[mu] * force_mu, mu);
|
||||
}
|
||||
//clover_force *= csw;
|
||||
force += clover_force;
|
||||
}
|
||||
|
||||
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
|
||||
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
|
||||
{
|
||||
conformable(lambda._grid, U[0]._grid);
|
||||
GaugeLinkField out(lambda._grid), tmp(lambda._grid);
|
||||
// insertion in upper staple
|
||||
// please check redundancy of shift operations
|
||||
|
||||
// C1+
|
||||
tmp = lambda * U[nu];
|
||||
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
|
||||
|
||||
// C2+
|
||||
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
|
||||
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
|
||||
|
||||
// C3+
|
||||
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
|
||||
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
|
||||
|
||||
// C4+
|
||||
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
|
||||
|
||||
// insertion in lower staple
|
||||
// C1-
|
||||
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
||||
|
||||
// C2-
|
||||
tmp = adj(lambda) * U[nu];
|
||||
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
||||
|
||||
// C3-
|
||||
tmp = lambda * U[nu];
|
||||
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
|
||||
|
||||
// C4-
|
||||
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
private:
|
||||
// here fixing the 4 dimensions, make it more general?
|
||||
|
||||
RealD csw_r; // Clover coefficient - spatial
|
||||
RealD csw_t; // Clover coefficient - temporal
|
||||
RealD diag_mass; // Mass term
|
||||
CloverFieldType CloverTerm, CloverTermInv; // Clover term
|
||||
CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
|
||||
CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
|
||||
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
||||
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
||||
|
||||
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
||||
// using the DeGrand-Rossi basis for the gamma matrices
|
||||
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
||||
{
|
||||
CloverFieldType T(F._grid);
|
||||
T = zero;
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
|
||||
{
|
||||
T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
|
||||
}
|
||||
|
||||
return T;
|
||||
}
|
||||
|
||||
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
|
||||
{
|
||||
CloverFieldType T(F._grid);
|
||||
T = zero;
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
|
||||
{
|
||||
T._odata[i]()(0, 1) = -F._odata[i]()();
|
||||
T._odata[i]()(1, 0) = F._odata[i]()();
|
||||
T._odata[i]()(2, 3) = -F._odata[i]()();
|
||||
T._odata[i]()(3, 2) = F._odata[i]()();
|
||||
}
|
||||
|
||||
return T;
|
||||
}
|
||||
|
||||
CloverFieldType fillCloverXY(const GaugeLinkField &F)
|
||||
{
|
||||
CloverFieldType T(F._grid);
|
||||
T = zero;
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
|
||||
{
|
||||
|
||||
T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(1, 1) = timesI(F._odata[i]()());
|
||||
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
|
||||
}
|
||||
|
||||
return T;
|
||||
}
|
||||
|
||||
CloverFieldType fillCloverXT(const GaugeLinkField &F)
|
||||
{
|
||||
CloverFieldType T(F._grid);
|
||||
T = zero;
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
|
||||
{
|
||||
T._odata[i]()(0, 1) = timesI(F._odata[i]()());
|
||||
T._odata[i]()(1, 0) = timesI(F._odata[i]()());
|
||||
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
|
||||
}
|
||||
|
||||
return T;
|
||||
}
|
||||
|
||||
CloverFieldType fillCloverYT(const GaugeLinkField &F)
|
||||
{
|
||||
CloverFieldType T(F._grid);
|
||||
T = zero;
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
|
||||
{
|
||||
T._odata[i]()(0, 1) = -(F._odata[i]()());
|
||||
T._odata[i]()(1, 0) = (F._odata[i]()());
|
||||
T._odata[i]()(2, 3) = (F._odata[i]()());
|
||||
T._odata[i]()(3, 2) = -(F._odata[i]()());
|
||||
}
|
||||
|
||||
return T;
|
||||
}
|
||||
|
||||
CloverFieldType fillCloverZT(const GaugeLinkField &F)
|
||||
{
|
||||
CloverFieldType T(F._grid);
|
||||
T = zero;
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
|
||||
{
|
||||
T._odata[i]()(0, 0) = timesI(F._odata[i]()());
|
||||
T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
|
||||
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
|
||||
}
|
||||
|
||||
return T;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif // GRID_QCD_WILSON_CLOVER_FERMION_H
|
@ -265,7 +265,6 @@ public:
|
||||
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
|
||||
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
||||
}
|
||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||
|
||||
std::vector<int> same_node;
|
||||
std::vector<int> surface_list;
|
||||
|
@ -47,7 +47,8 @@ int WilsonFermionStatic::HandOptDslash;
|
||||
template <class Impl>
|
||||
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
const ImplParams &p)
|
||||
const ImplParams &p,
|
||||
const WilsonAnisotropyCoefficients &anis)
|
||||
: Kernels(p),
|
||||
_grid(&Fgrid),
|
||||
_cbgrid(&Hgrid),
|
||||
@ -60,16 +61,41 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd(&Hgrid),
|
||||
_tmp(&Hgrid)
|
||||
_tmp(&Hgrid),
|
||||
anisotropyCoeff(anis)
|
||||
{
|
||||
// Allocate the required comms buffer
|
||||
ImportGauge(_Umu);
|
||||
if (anisotropyCoeff.isAnisotropic){
|
||||
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
|
||||
} else {
|
||||
diag_mass = 4.0 + mass;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
|
||||
GaugeField HUmu(_Umu._grid);
|
||||
HUmu = _Umu * (-0.5);
|
||||
|
||||
//Here multiply the anisotropy coefficients
|
||||
if (anisotropyCoeff.isAnisotropic)
|
||||
{
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
|
||||
if (mu != anisotropyCoeff.t_direction)
|
||||
U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
|
||||
|
||||
PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
HUmu = _Umu * (-0.5);
|
||||
}
|
||||
Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd, Umu);
|
||||
@ -83,14 +109,14 @@ template <class Impl>
|
||||
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
||||
out.checkerboard = in.checkerboard;
|
||||
Dhop(in, out, DaggerNo);
|
||||
return axpy_norm(out, 4 + mass, in, out);
|
||||
return axpy_norm(out, diag_mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
||||
out.checkerboard = in.checkerboard;
|
||||
Dhop(in, out, DaggerYes);
|
||||
return axpy_norm(out, 4 + mass, in, out);
|
||||
return axpy_norm(out, diag_mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -114,7 +140,7 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||
out.checkerboard = in.checkerboard;
|
||||
typename FermionField::scalar_type scal(4.0 + mass);
|
||||
typename FermionField::scalar_type scal(diag_mass);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
@ -127,7 +153,7 @@ void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||
out.checkerboard = in.checkerboard;
|
||||
out = (1.0/(4.0+mass))*in;
|
||||
out = (1.0/(diag_mass))*in;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -204,7 +230,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
|
||||
FermionField Btilde(B._grid);
|
||||
FermionField Atilde(B._grid);
|
||||
Atilde = A;
|
||||
Atilde = A;//redundant
|
||||
|
||||
st.HaloExchange(B, compressor);
|
||||
|
||||
@ -345,6 +371,112 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
}
|
||||
};
|
||||
|
||||
/*******************************************************************************
|
||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||
* to make a conserved current sink or inserting the conserved current
|
||||
* sequentially.
|
||||
******************************************************************************/
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
conformable(_grid, q_in_1._grid);
|
||||
conformable(_grid, q_in_2._grid);
|
||||
conformable(_grid, q_out._grid);
|
||||
PropagatorField tmp1(_grid), tmp2(_grid);
|
||||
q_out = zero;
|
||||
|
||||
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
|
||||
// Inefficient comms method but not performance critical.
|
||||
tmp1 = Cshift(q_in_1, mu, 1);
|
||||
tmp2 = Cshift(q_in_2, mu, 1);
|
||||
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
|
||||
{
|
||||
Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
|
||||
q_in_2._odata[sU],
|
||||
q_out._odata[sU],
|
||||
Umu, sU, mu);
|
||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
|
||||
tmp2._odata[sU],
|
||||
q_out._odata[sU],
|
||||
Umu, sU, mu);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax)
|
||||
{
|
||||
conformable(_grid, q_in._grid);
|
||||
conformable(_grid, q_out._grid);
|
||||
Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
|
||||
ComplexD i(0.0,1.0);
|
||||
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
|
||||
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
||||
|
||||
// Momentum projection
|
||||
ph = zero;
|
||||
for(unsigned int mu = 0; mu < Nd - 1; mu++)
|
||||
{
|
||||
LatticeCoordinate(coor, mu);
|
||||
ph = ph + mom[mu]*coor*((1./(_grid->_fdimensions[mu])));
|
||||
}
|
||||
ph = exp((RealD)(2*M_PI)*i*ph);
|
||||
|
||||
q_out = zero;
|
||||
LatticeInteger coords(_grid);
|
||||
LatticeCoordinate(coords, Tp);
|
||||
|
||||
// Need q(x + mu) and q(x - mu).
|
||||
tmp = Cshift(q_in, mu, 1);
|
||||
tmpFwd = tmp*ph;
|
||||
tmp = ph*q_in;
|
||||
tmpBwd = Cshift(tmp, mu, -1);
|
||||
|
||||
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
|
||||
{
|
||||
// Compute the sequential conserved current insertion only if our simd
|
||||
// object contains a timeslice we need.
|
||||
vInteger t_mask = ((coords._odata[sU] >= tmin) &&
|
||||
(coords._odata[sU] <= tmax));
|
||||
Integer timeSlices = Reduce(t_mask);
|
||||
|
||||
if (timeSlices > 0)
|
||||
{
|
||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU],
|
||||
q_out._odata[sU],
|
||||
Umu, sU, mu, t_mask);
|
||||
}
|
||||
|
||||
// Repeat for backward direction.
|
||||
t_mask = ((coords._odata[sU] >= (tmin + tshift)) &&
|
||||
(coords._odata[sU] <= (tmax + tshift)));
|
||||
|
||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
||||
unsigned int t0 = 0;
|
||||
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
|
||||
|
||||
timeSlices = Reduce(t_mask);
|
||||
|
||||
if (timeSlices > 0)
|
||||
{
|
||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU],
|
||||
q_out._odata[sU],
|
||||
Umu, sU, mu, t_mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonFermion);
|
||||
AdjointFermOpTemplateInstantiate(WilsonFermion);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonFermion);
|
||||
|
@ -44,6 +44,21 @@ class WilsonFermionStatic {
|
||||
static const int npoint = 8;
|
||||
};
|
||||
|
||||
struct WilsonAnisotropyCoefficients: Serializable
|
||||
{
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
|
||||
bool, isAnisotropic,
|
||||
int, t_direction,
|
||||
double, xi_0,
|
||||
double, nu);
|
||||
|
||||
WilsonAnisotropyCoefficients():
|
||||
isAnisotropic(false),
|
||||
t_direction(Nd-1),
|
||||
xi_0(1.0),
|
||||
nu(1.0){}
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
||||
public:
|
||||
@ -65,8 +80,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
||||
// override multiply; cut number routines if pass dagger argument
|
||||
// and also make interface more uniformly consistent
|
||||
//////////////////////////////////////////////////////////////////
|
||||
RealD M(const FermionField &in, FermionField &out);
|
||||
RealD Mdag(const FermionField &in, FermionField &out);
|
||||
virtual RealD M(const FermionField &in, FermionField &out);
|
||||
virtual RealD Mdag(const FermionField &in, FermionField &out);
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// half checkerboard operations
|
||||
@ -117,8 +132,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
||||
|
||||
// Constructor
|
||||
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
const ImplParams &p = ImplParams());
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
const ImplParams &p = ImplParams(),
|
||||
const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
|
||||
|
||||
// DoubleStore impl dependent
|
||||
void ImportGauge(const GaugeField &_Umu);
|
||||
@ -130,6 +146,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
||||
// protected:
|
||||
public:
|
||||
RealD mass;
|
||||
RealD diag_mass;
|
||||
|
||||
GridBase *_grid;
|
||||
GridBase *_cbgrid;
|
||||
@ -146,6 +163,24 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
WilsonAnisotropyCoefficients anisotropyCoeff;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu);
|
||||
void SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax);
|
||||
};
|
||||
|
||||
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
|
||||
|
@ -12,6 +12,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@ -702,6 +703,168 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
||||
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||
* to make a conserved current sink or inserting the conserved current
|
||||
* sequentially.
|
||||
******************************************************************************/
|
||||
|
||||
// Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
|
||||
#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
|
||||
{ \
|
||||
std::vector<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
|
||||
extract(qSite, qSiteVec); \
|
||||
for (int i = 0; i < Nsimd / 2; ++i) \
|
||||
{ \
|
||||
typename SitePropagator::scalar_object tmp = qSiteVec[i]; \
|
||||
qSiteVec[i] = qSiteVec[Nsimd - i - 1]; \
|
||||
qSiteVec[Nsimd - i - 1] = tmp; \
|
||||
} \
|
||||
merge(qSiteRev, qSiteVec); \
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
conformable(q_in_1._grid, FermionGrid());
|
||||
conformable(q_in_1._grid, q_in_2._grid);
|
||||
conformable(_FourDimGrid, q_out._grid);
|
||||
PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
|
||||
unsigned int LLs = q_in_1._grid->_rdimensions[0];
|
||||
q_out = zero;
|
||||
|
||||
// Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s),
|
||||
// q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
|
||||
tmp1 = Cshift(q_in_1, mu + 1, 1);
|
||||
tmp2 = Cshift(q_in_2, mu + 1, 1);
|
||||
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
|
||||
{
|
||||
unsigned int sF1 = sU * LLs;
|
||||
unsigned int sF2 = (sU + 1) * LLs - 1;
|
||||
|
||||
for (unsigned int s = 0; s < LLs; ++s)
|
||||
{
|
||||
bool axial_sign = ((curr_type == Current::Axial) && \
|
||||
(s < (LLs / 2)));
|
||||
SitePropagator qSite2, qmuSite2;
|
||||
|
||||
// If vectorised in 5th dimension, reverse q2 vector to match up
|
||||
// sites correctly.
|
||||
if (Impl::LsVectorised)
|
||||
{
|
||||
REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
|
||||
REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
|
||||
}
|
||||
else
|
||||
{
|
||||
qSite2 = q_in_2._odata[sF2];
|
||||
qmuSite2 = tmp2._odata[sF2];
|
||||
}
|
||||
Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1],
|
||||
qSite2,
|
||||
q_out._odata[sU],
|
||||
Umu, sU, mu, axial_sign);
|
||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
|
||||
qmuSite2,
|
||||
q_out._odata[sU],
|
||||
Umu, sU, mu, axial_sign);
|
||||
sF1++;
|
||||
sF2--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax)
|
||||
{
|
||||
conformable(q_in._grid, FermionGrid());
|
||||
conformable(q_in._grid, q_out._grid);
|
||||
Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
|
||||
PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
|
||||
tmp(FermionGrid());
|
||||
ComplexD i(0.0, 1.0);
|
||||
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
||||
unsigned int LLs = q_in._grid->_rdimensions[0];
|
||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
||||
|
||||
// Momentum projection.
|
||||
ph = zero;
|
||||
for(unsigned int nu = 0; nu < Nd - 1; nu++)
|
||||
{
|
||||
// Shift coordinate lattice index by 1 to account for 5th dimension.
|
||||
LatticeCoordinate(coor, nu + 1);
|
||||
ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
|
||||
}
|
||||
ph = exp((RealD)(2*M_PI)*i*ph);
|
||||
|
||||
q_out = zero;
|
||||
LatticeInteger coords(_FourDimGrid);
|
||||
LatticeCoordinate(coords, Tp);
|
||||
|
||||
// Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
|
||||
// by one.
|
||||
tmp = Cshift(q_in, mu + 1, 1);
|
||||
tmpFwd = tmp*ph;
|
||||
tmp = ph*q_in;
|
||||
tmpBwd = Cshift(tmp, mu + 1, -1);
|
||||
|
||||
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
|
||||
{
|
||||
// Compute the sequential conserved current insertion only if our simd
|
||||
// object contains a timeslice we need.
|
||||
vInteger t_mask = ((coords._odata[sU] >= tmin) &&
|
||||
(coords._odata[sU] <= tmax));
|
||||
Integer timeSlices = Reduce(t_mask);
|
||||
|
||||
if (timeSlices > 0)
|
||||
{
|
||||
unsigned int sF = sU * LLs;
|
||||
for (unsigned int s = 0; s < LLs; ++s)
|
||||
{
|
||||
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
|
||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sF],
|
||||
q_out._odata[sF], Umu, sU,
|
||||
mu, t_mask, axial_sign);
|
||||
++sF;
|
||||
}
|
||||
}
|
||||
|
||||
// Repeat for backward direction.
|
||||
t_mask = ((coords._odata[sU] >= (tmin + tshift)) &&
|
||||
(coords._odata[sU] <= (tmax + tshift)));
|
||||
|
||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
||||
unsigned int t0 = 0;
|
||||
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
|
||||
|
||||
timeSlices = Reduce(t_mask);
|
||||
|
||||
if (timeSlices > 0)
|
||||
{
|
||||
unsigned int sF = sU * LLs;
|
||||
for (unsigned int s = 0; s < LLs; ++s)
|
||||
{
|
||||
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
|
||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sF],
|
||||
q_out._odata[sF], Umu, sU,
|
||||
mu, t_mask, axial_sign);
|
||||
++sF;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonFermion5D);
|
||||
GparityFermOpTemplateInstantiate(WilsonFermion5D);
|
||||
|
||||
|
@ -214,6 +214,21 @@ namespace QCD {
|
||||
// Comms buffer
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu);
|
||||
void SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
std::vector<Real> mom,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax);
|
||||
};
|
||||
|
||||
}}
|
||||
|
@ -281,6 +281,172 @@ void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal
|
||||
vstream(out._odata[sF], result);
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||
* to make a conserved current sink or inserting the conserved current
|
||||
* sequentially. Common to both 4D and 5D.
|
||||
******************************************************************************/
|
||||
// N.B. Functions below assume a -1/2 factor within U.
|
||||
#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
|
||||
#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: ContractConservedCurrentSiteFwd
|
||||
* Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in_1 shifted in +ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
|
||||
const SitePropagator &q_in_1,
|
||||
const SitePropagator &q_in_2,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result, tmp;
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
|
||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
|
||||
if (switch_sign)
|
||||
{
|
||||
q_out -= result;
|
||||
}
|
||||
else
|
||||
{
|
||||
q_out += result;
|
||||
}
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: ContractConservedCurrentSiteBwd
|
||||
* Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in_2 shifted in +ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
|
||||
const SitePropagator &q_in_1,
|
||||
const SitePropagator &q_in_2,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result, tmp;
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
|
||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
|
||||
if (switch_sign)
|
||||
{
|
||||
q_out += result;
|
||||
}
|
||||
else
|
||||
{
|
||||
q_out -= result;
|
||||
}
|
||||
}
|
||||
|
||||
// G-parity requires more specialised implementation.
|
||||
#define NO_CURR_SITE(Impl) \
|
||||
template <> \
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
|
||||
const SitePropagator &q_in_1, \
|
||||
const SitePropagator &q_in_2, \
|
||||
SitePropagator &q_out, \
|
||||
DoubledGaugeField &U, \
|
||||
unsigned int sU, \
|
||||
unsigned int mu, \
|
||||
bool switch_sign) \
|
||||
{ \
|
||||
assert(0); \
|
||||
} \
|
||||
template <> \
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
|
||||
const SitePropagator &q_in_1, \
|
||||
const SitePropagator &q_in_2, \
|
||||
SitePropagator &q_out, \
|
||||
DoubledGaugeField &U, \
|
||||
unsigned int mu, \
|
||||
unsigned int sU, \
|
||||
bool switch_sign) \
|
||||
{ \
|
||||
assert(0); \
|
||||
}
|
||||
|
||||
NO_CURR_SITE(GparityWilsonImplF);
|
||||
NO_CURR_SITE(GparityWilsonImplD);
|
||||
NO_CURR_SITE(GparityWilsonImplFH);
|
||||
NO_CURR_SITE(GparityWilsonImplDF);
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: SeqConservedCurrentSiteFwd
|
||||
* Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in shifted in +ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
vInteger t_mask,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result;
|
||||
Impl::multLinkProp(result, U._odata[sU], q_in, mu);
|
||||
result = WilsonCurrentFwd(result, mu);
|
||||
|
||||
// Zero any unwanted timeslice entries.
|
||||
result = predicatedWhere(t_mask, result, 0.*result);
|
||||
|
||||
if (switch_sign)
|
||||
{
|
||||
q_out -= result;
|
||||
}
|
||||
else
|
||||
{
|
||||
q_out += result;
|
||||
}
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: SeqConservedCurrentSiteFwd
|
||||
* Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in shifted in -ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
vInteger t_mask,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result;
|
||||
Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
|
||||
result = WilsonCurrentBwd(result, mu);
|
||||
|
||||
// Zero any unwanted timeslice entries.
|
||||
result = predicatedWhere(t_mask, result, 0.*result);
|
||||
|
||||
if (switch_sign)
|
||||
{
|
||||
q_out += result;
|
||||
}
|
||||
else
|
||||
{
|
||||
q_out -= result;
|
||||
}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonKernels);
|
||||
AdjointFermOpTemplateInstantiate(WilsonKernels);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
|
||||
|
@ -55,7 +55,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
|
||||
public:
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
|
||||
typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
|
||||
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
|
||||
{
|
||||
@ -99,7 +99,7 @@ public:
|
||||
}
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
|
||||
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
|
||||
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
|
||||
// no kernel choice
|
||||
@ -116,7 +116,7 @@ public:
|
||||
}
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
|
||||
typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
|
||||
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
|
||||
{
|
||||
@ -161,7 +161,7 @@ public:
|
||||
}
|
||||
|
||||
template <bool EnableBool = true>
|
||||
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
|
||||
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
|
||||
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
|
||||
|
||||
@ -180,6 +180,38 @@ public:
|
||||
void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Utilities for inserting Wilson conserved current.
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
|
||||
const SitePropagator &q_in_2,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
bool switch_sign = false);
|
||||
void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
|
||||
const SitePropagator &q_in_2,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
bool switch_sign = false);
|
||||
void SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
vInteger t_mask,
|
||||
bool switch_sign = false);
|
||||
void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeField &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
vInteger t_mask,
|
||||
bool switch_sign = false);
|
||||
|
||||
private:
|
||||
// Specialised variants
|
||||
void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
|
@ -946,5 +946,6 @@ INSTANTIATE_THEM(DomainWallVec5dImplFH);
|
||||
INSTANTIATE_THEM(DomainWallVec5dImplDF);
|
||||
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
|
||||
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
|
||||
|
||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
|
||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
|
||||
}}
|
||||
|
@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
|
||||
|
||||
RealD factor = 0.5 * beta / RealD(Nc);
|
||||
|
||||
//GaugeLinkField Umu(U._grid);
|
||||
GaugeLinkField Umu(U._grid);
|
||||
GaugeLinkField dSdU_mu(U._grid);
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
//Umu = PeekIndex<LorentzIndex>(U, mu);
|
||||
Umu = PeekIndex<LorentzIndex>(U, mu);
|
||||
|
||||
// Staple in direction mu
|
||||
//WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
|
||||
//dSdU_mu = Ta(Umu * dSdU_mu) * factor;
|
||||
|
||||
|
||||
WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
|
||||
dSdU_mu = Ta(dSdU_mu) * factor;
|
||||
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
|
||||
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
|
@ -16,12 +16,12 @@ class ScalarImplTypes {
|
||||
typedef iImplField<Simd> SiteField;
|
||||
typedef SiteField SitePropagator;
|
||||
typedef SiteField SiteComplex;
|
||||
|
||||
|
||||
typedef Lattice<SiteField> Field;
|
||||
typedef Field ComplexField;
|
||||
typedef Field FermionField;
|
||||
typedef Field PropagatorField;
|
||||
|
||||
|
||||
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
||||
gaussian(pRNG, P);
|
||||
}
|
||||
@ -47,54 +47,60 @@ class ScalarImplTypes {
|
||||
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||
U = 1.0;
|
||||
}
|
||||
|
||||
|
||||
static void MomentumSpacePropagator(Field &out, RealD m)
|
||||
{
|
||||
GridBase *grid = out._grid;
|
||||
Field kmu(grid), one(grid);
|
||||
const unsigned int nd = grid->_ndimension;
|
||||
std::vector<int> &l = grid->_fdimensions;
|
||||
|
||||
|
||||
one = Complex(1.0,0.0);
|
||||
out = m*m;
|
||||
for(int mu = 0; mu < nd; mu++)
|
||||
{
|
||||
Real twoPiL = M_PI*2./l[mu];
|
||||
|
||||
|
||||
LatticeCoordinate(kmu,mu);
|
||||
kmu = 2.*sin(.5*twoPiL*kmu);
|
||||
out = out + kmu*kmu;
|
||||
}
|
||||
out = one/out;
|
||||
}
|
||||
|
||||
|
||||
static void FreePropagator(const Field &in, Field &out,
|
||||
const Field &momKernel)
|
||||
{
|
||||
FFT fft((GridCartesian *)in._grid);
|
||||
Field inFT(in._grid);
|
||||
|
||||
|
||||
fft.FFT_all_dim(inFT, in, FFT::forward);
|
||||
inFT = inFT*momKernel;
|
||||
fft.FFT_all_dim(out, inFT, FFT::backward);
|
||||
}
|
||||
|
||||
|
||||
static void FreePropagator(const Field &in, Field &out, RealD m)
|
||||
{
|
||||
Field momKernel(in._grid);
|
||||
|
||||
|
||||
MomentumSpacePropagator(momKernel, m);
|
||||
FreePropagator(in, out, momKernel);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
#ifdef USE_FFT_ACCELERATION
|
||||
#ifndef FFT_MASS
|
||||
#error "USE_FFT_ACCELERATION is defined but not FFT_MASS"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
template <class S, unsigned int N>
|
||||
class ScalarAdjMatrixImplTypes {
|
||||
public:
|
||||
typedef S Simd;
|
||||
typedef QCD::SU<N> Group;
|
||||
|
||||
|
||||
template <typename vtype>
|
||||
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
|
||||
template <typename vtype>
|
||||
@ -103,24 +109,119 @@ class ScalarImplTypes {
|
||||
typedef iImplField<Simd> SiteField;
|
||||
typedef SiteField SitePropagator;
|
||||
typedef iImplComplex<Simd> SiteComplex;
|
||||
|
||||
|
||||
typedef Lattice<SiteField> Field;
|
||||
typedef Lattice<SiteComplex> ComplexField;
|
||||
typedef Field FermionField;
|
||||
typedef Field PropagatorField;
|
||||
|
||||
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
|
||||
static void MomentaSquare(ComplexField &out)
|
||||
{
|
||||
GridBase *grid = out._grid;
|
||||
const std::vector<int> &l = grid->FullDimensions();
|
||||
ComplexField kmu(grid);
|
||||
|
||||
for (int mu = 0; mu < grid->Nd(); mu++)
|
||||
{
|
||||
Real twoPiL = M_PI * 2.0 / l[mu];
|
||||
LatticeCoordinate(kmu, mu);
|
||||
kmu = 2.0 * sin(0.5 * twoPiL * kmu);
|
||||
out += kmu * kmu;
|
||||
}
|
||||
}
|
||||
|
||||
static void MomentumSpacePropagator(ComplexField &out, RealD m)
|
||||
{
|
||||
GridBase *grid = out._grid;
|
||||
ComplexField one(grid);
|
||||
one = Complex(1.0, 0.0);
|
||||
out = m * m;
|
||||
MomentaSquare(out);
|
||||
out = one / out;
|
||||
}
|
||||
|
||||
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
|
||||
{
|
||||
#ifndef USE_FFT_ACCELERATION
|
||||
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
|
||||
#else
|
||||
|
||||
Field Pgaussian(P._grid), Pp(P._grid);
|
||||
ComplexField p2(P._grid); p2 = zero;
|
||||
RealD M = FFT_MASS;
|
||||
|
||||
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
|
||||
|
||||
FFT theFFT((GridCartesian*)P._grid);
|
||||
theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
|
||||
MomentaSquare(p2);
|
||||
p2 += M * M;
|
||||
p2 = sqrt(p2);
|
||||
Pp *= p2;
|
||||
theFFT.FFT_all_dim(P, Pp, FFT::backward);
|
||||
|
||||
#endif //USE_FFT_ACCELERATION
|
||||
}
|
||||
|
||||
static inline Field projectForce(Field& P) {return P;}
|
||||
|
||||
static inline void update_field(Field& P, Field& U, double ep) {
|
||||
U += P*ep;
|
||||
static inline void update_field(Field &P, Field &U, double ep)
|
||||
{
|
||||
#ifndef USE_FFT_ACCELERATION
|
||||
double t0=usecond();
|
||||
U += P * ep;
|
||||
double t1=usecond();
|
||||
double total_time = (t1-t0)/1e6;
|
||||
std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;
|
||||
#else
|
||||
// FFT transform P(x) -> P(p)
|
||||
// divide by (M^2+p^2) M external parameter (how to pass?)
|
||||
// P'(p) = P(p)/(M^2+p^2)
|
||||
// Transform back -> P'(x)
|
||||
// U += P'(x)*ep
|
||||
|
||||
Field Pp(U._grid), P_FFT(U._grid);
|
||||
static ComplexField p2(U._grid);
|
||||
RealD M = FFT_MASS;
|
||||
|
||||
FFT theFFT((GridCartesian*)U._grid);
|
||||
theFFT.FFT_all_dim(Pp, P, FFT::forward);
|
||||
|
||||
static bool first_call = true;
|
||||
if (first_call)
|
||||
{
|
||||
// avoid recomputing
|
||||
MomentumSpacePropagator(p2, M);
|
||||
first_call = false;
|
||||
}
|
||||
Pp *= p2;
|
||||
theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
|
||||
U += P_FFT * ep;
|
||||
|
||||
#endif //USE_FFT_ACCELERATION
|
||||
}
|
||||
|
||||
static inline RealD FieldSquareNorm(Field& U) {
|
||||
return (TensorRemove(sum(trace(U*U))).real());
|
||||
static inline RealD FieldSquareNorm(Field &U)
|
||||
{
|
||||
#ifndef USE_FFT_ACCELERATION
|
||||
return (TensorRemove(sum(trace(U * U))).real());
|
||||
#else
|
||||
// In case of Fourier acceleration we have to:
|
||||
// compute U(p)*U(p)/(M^2+p^2)) Parseval theorem
|
||||
// 1 FFT needed U(x) -> U(p)
|
||||
// M to be passed
|
||||
|
||||
FFT theFFT((GridCartesian*)U._grid);
|
||||
Field Up(U._grid);
|
||||
|
||||
theFFT.FFT_all_dim(Up, U, FFT::forward);
|
||||
RealD M = FFT_MASS;
|
||||
ComplexField p2(U._grid);
|
||||
MomentumSpacePropagator(p2, M);
|
||||
Field Up2 = Up * p2;
|
||||
// from the definition of the DFT we need to divide by the volume
|
||||
return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
|
||||
#endif //USE_FFT_ACCELERATION
|
||||
}
|
||||
|
||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||
@ -146,7 +247,7 @@ class ScalarImplTypes {
|
||||
typedef ScalarImplTypes<vComplex> ScalarImplCR;
|
||||
typedef ScalarImplTypes<vComplexF> ScalarImplCF;
|
||||
typedef ScalarImplTypes<vComplexD> ScalarImplCD;
|
||||
|
||||
|
||||
// Hardcoding here the size of the matrices
|
||||
typedef ScalarAdjMatrixImplTypes<vComplex, QCD::Nc> ScalarAdjImplR;
|
||||
typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
|
||||
@ -155,7 +256,7 @@ class ScalarImplTypes {
|
||||
template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours >;
|
||||
template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours >;
|
||||
template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours >;
|
||||
|
||||
|
||||
//}
|
||||
}
|
||||
|
||||
|
@ -30,119 +30,179 @@ directory
|
||||
#ifndef SCALAR_INT_ACTION_H
|
||||
#define SCALAR_INT_ACTION_H
|
||||
|
||||
|
||||
// Note: this action can completely absorb the ScalarAction for real float fields
|
||||
// use the scalarObjs to generalise the structure
|
||||
|
||||
namespace Grid {
|
||||
// FIXME drop the QCD namespace everywhere here
|
||||
namespace Grid
|
||||
{
|
||||
// FIXME drop the QCD namespace everywhere here
|
||||
|
||||
template <class Impl, int Ndim >
|
||||
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
|
||||
public:
|
||||
INHERIT_FIELD_TYPES(Impl);
|
||||
private:
|
||||
RealD mass_square;
|
||||
RealD lambda;
|
||||
template <class Impl, int Ndim>
|
||||
class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
|
||||
{
|
||||
public:
|
||||
INHERIT_FIELD_TYPES(Impl);
|
||||
|
||||
private:
|
||||
RealD mass_square;
|
||||
RealD lambda;
|
||||
RealD g;
|
||||
const unsigned int N = Impl::Group::Dimension;
|
||||
|
||||
typedef typename Field::vector_object vobj;
|
||||
typedef CartesianStencil<vobj,vobj> Stencil;
|
||||
typedef typename Field::vector_object vobj;
|
||||
typedef CartesianStencil<vobj, vobj> Stencil;
|
||||
|
||||
SimpleCompressor<vobj> compressor;
|
||||
int npoint = 2*Ndim;
|
||||
std::vector<int> directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions
|
||||
std::vector<int> displacements;// = {1,1,1,1, -1,-1,-1,-1};
|
||||
SimpleCompressor<vobj> compressor;
|
||||
int npoint = 2 * Ndim;
|
||||
std::vector<int> directions; //
|
||||
std::vector<int> displacements; //
|
||||
|
||||
|
||||
public:
|
||||
|
||||
ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
|
||||
for (int mu = 0 ; mu < Ndim; mu++){
|
||||
directions[mu] = mu; directions[mu+Ndim] = mu;
|
||||
displacements[mu] = 1; displacements[mu+Ndim] = -1;
|
||||
}
|
||||
public:
|
||||
ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), displacements(2 * Ndim, 0), directions(2 * Ndim, 0)
|
||||
{
|
||||
for (int mu = 0; mu < Ndim; mu++)
|
||||
{
|
||||
directions[mu] = mu;
|
||||
directions[mu + Ndim] = mu;
|
||||
displacements[mu] = 1;
|
||||
displacements[mu + Ndim] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
virtual std::string LogParameters() {
|
||||
std::stringstream sstream;
|
||||
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
||||
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
||||
return sstream.str();
|
||||
}
|
||||
virtual std::string LogParameters()
|
||||
{
|
||||
std::stringstream sstream;
|
||||
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
|
||||
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
|
||||
sstream << GridLogMessage << "[ScalarAction] g : " << g << std::endl;
|
||||
return sstream.str();
|
||||
}
|
||||
|
||||
virtual std::string action_name() {return "ScalarAction";}
|
||||
virtual std::string action_name() { return "ScalarAction"; }
|
||||
|
||||
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
|
||||
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
|
||||
|
||||
virtual RealD S(const Field &p) {
|
||||
assert(p._grid->Nd() == Ndim);
|
||||
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||
phiStencil.HaloExchange(p, compressor);
|
||||
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
|
||||
phisquared = p*p;
|
||||
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
|
||||
for (int mu = 0; mu < Ndim; mu++) {
|
||||
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
||||
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
|
||||
int permute_type;
|
||||
StencilEntry *SE;
|
||||
vobj temp2;
|
||||
const vobj *temp, *t_p;
|
||||
|
||||
SE = phiStencil.GetEntry(permute_type, mu, i);
|
||||
t_p = &p._odata[i];
|
||||
if ( SE->_is_local ) {
|
||||
temp = &p._odata[SE->_offset];
|
||||
if ( SE->_permute ) {
|
||||
permute(temp2, *temp, permute_type);
|
||||
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
|
||||
} else {
|
||||
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
|
||||
}
|
||||
} else {
|
||||
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
}
|
||||
// action -= pshift*p + p*pshift;
|
||||
}
|
||||
// NB the trace in the algebra is normalised to 1/2
|
||||
// minus sign coming from the antihermitian fields
|
||||
return -(TensorRemove(sum(trace(action)))).real();
|
||||
};
|
||||
|
||||
virtual void deriv(const Field &p, Field &force) {
|
||||
assert(p._grid->Nd() == Ndim);
|
||||
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
|
||||
// move this outside
|
||||
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||
phiStencil.HaloExchange(p, compressor);
|
||||
|
||||
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
||||
for (int point = 0; point < npoint; point++) {
|
||||
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
|
||||
const vobj *temp;
|
||||
vobj temp2;
|
||||
int permute_type;
|
||||
StencilEntry *SE;
|
||||
SE = phiStencil.GetEntry(permute_type, point, i);
|
||||
|
||||
if ( SE->_is_local ) {
|
||||
temp = &p._odata[SE->_offset];
|
||||
if ( SE->_permute ) {
|
||||
permute(temp2, *temp, permute_type);
|
||||
force._odata[i] -= temp2;
|
||||
} else {
|
||||
force._odata[i] -= *temp;
|
||||
}
|
||||
} else {
|
||||
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
}
|
||||
virtual RealD S(const Field &p)
|
||||
{
|
||||
assert(p._grid->Nd() == Ndim);
|
||||
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||
phiStencil.HaloExchange(p, compressor);
|
||||
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
|
||||
phisquared = p * p;
|
||||
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
|
||||
for (int mu = 0; mu < Ndim; mu++)
|
||||
{
|
||||
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
||||
parallel_for(int i = 0; i < p._grid->oSites(); i++)
|
||||
{
|
||||
int permute_type;
|
||||
StencilEntry *SE;
|
||||
vobj temp2;
|
||||
const vobj *temp, *t_p;
|
||||
|
||||
SE = phiStencil.GetEntry(permute_type, mu, i);
|
||||
t_p = &p._odata[i];
|
||||
if (SE->_is_local)
|
||||
{
|
||||
temp = &p._odata[SE->_offset];
|
||||
if (SE->_permute)
|
||||
{
|
||||
permute(temp2, *temp, permute_type);
|
||||
action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
|
||||
}
|
||||
else
|
||||
{
|
||||
action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
}
|
||||
// action -= pshift*p + p*pshift;
|
||||
}
|
||||
// NB the trace in the algebra is normalised to 1/2
|
||||
// minus sign coming from the antihermitian fields
|
||||
return -(TensorRemove(sum(trace(action)))).real() * N / g;
|
||||
};
|
||||
|
||||
} // namespace Grid
|
||||
|
||||
#endif // SCALAR_INT_ACTION_H
|
||||
virtual void deriv(const Field &p, Field &force)
|
||||
{
|
||||
double t0 = usecond();
|
||||
assert(p._grid->Nd() == Ndim);
|
||||
force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
|
||||
double interm_t = usecond();
|
||||
|
||||
// move this outside
|
||||
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
|
||||
|
||||
phiStencil.HaloExchange(p, compressor);
|
||||
double halo_t = usecond();
|
||||
int chunk = 128;
|
||||
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
|
||||
|
||||
// inverting the order of the loops slows down the code(! g++ 7)
|
||||
// cannot try to reduce the number of force writes by factor npoint...
|
||||
// use cache blocking
|
||||
for (int point = 0; point < npoint; point++)
|
||||
{
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
int permute_type;
|
||||
StencilEntry *SE;
|
||||
const vobj *temp;
|
||||
|
||||
#pragma omp for schedule(static, chunk)
|
||||
for (int i = 0; i < p._grid->oSites(); i++)
|
||||
{
|
||||
SE = phiStencil.GetEntry(permute_type, point, i);
|
||||
// prefetch next p?
|
||||
|
||||
if (SE->_is_local)
|
||||
{
|
||||
temp = &p._odata[SE->_offset];
|
||||
|
||||
if (SE->_permute)
|
||||
{
|
||||
vobj temp2;
|
||||
permute(temp2, *temp, permute_type);
|
||||
force._odata[i] -= temp2;
|
||||
}
|
||||
else
|
||||
{
|
||||
force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
force *= N / g;
|
||||
|
||||
double t1 = usecond();
|
||||
double total_time = (t1 - t0) / 1e6;
|
||||
double interm_time = (interm_t - t0) / 1e6;
|
||||
double halo_time = (halo_t - interm_t) / 1e6;
|
||||
double stencil_time = (t1 - halo_t) / 1e6;
|
||||
std::cout << GridLogIntegrator << "Total time for force computation (s) : " << total_time << std::endl;
|
||||
std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
|
||||
std::cout << GridLogIntegrator << "Halo time in force computation (s) : " << halo_time << std::endl;
|
||||
std::cout << GridLogIntegrator << "Stencil time in force computation (s) : " << stencil_time << std::endl;
|
||||
double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
|
||||
double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
|
||||
double Gflops = flops / (total_time * 1e9);
|
||||
double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
|
||||
std::cout << GridLogIntegrator << "Flops: " << flops << " - Gflop/s : " << Gflops << std::endl;
|
||||
std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << " - Gflop/s NS: " << Gflops_no_stencil << std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Grid
|
||||
|
||||
#endif // SCALAR_INT_ACTION_H
|
||||
|
@ -211,7 +211,7 @@ typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
|
||||
ScalarAdjGenericHMCRunner;
|
||||
|
||||
template <int Colours>
|
||||
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
|
||||
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;
|
||||
|
||||
} // namespace QCD
|
||||
} // namespace Grid
|
||||
|
@ -92,6 +92,19 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
|
||||
PlaquetteMod(): ObsBase(NoParameters()){}
|
||||
};
|
||||
|
||||
template < class Impl >
|
||||
class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
|
||||
typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase;
|
||||
using ObsBase::ObsBase; // for constructors
|
||||
|
||||
// acquire resource
|
||||
virtual void initialize(){
|
||||
this->ObservablePtr.reset(new PolyakovLogger<Impl>());
|
||||
}
|
||||
public:
|
||||
PolyakovMod(): ObsBase(NoParameters()){}
|
||||
};
|
||||
|
||||
|
||||
template < class Impl >
|
||||
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
|
||||
|
@ -45,5 +45,7 @@ class HmcObservable {
|
||||
|
||||
#include "plaquette.h"
|
||||
#include "topological_charge.h"
|
||||
#include "polyakov_loop.h"
|
||||
|
||||
|
||||
#endif // HMC_OBSERVABLE_H
|
||||
|
68
lib/qcd/observables/polyakov_loop.h
Normal file
68
lib/qcd/observables/polyakov_loop.h
Normal file
@ -0,0 +1,68 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/modules/polyakov_line.h
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: David Preti <david.preti@csic.es>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#ifndef HMC_POLYAKOV_H
|
||||
#define HMC_POLYAKOV_H
|
||||
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
// this is only defined for a gauge theory
|
||||
template <class Impl>
|
||||
class PolyakovLogger : public HmcObservable<typename Impl::Field> {
|
||||
public:
|
||||
// here forces the Impl to be of gauge fields
|
||||
// if not the compiler will complain
|
||||
INHERIT_GIMPL_TYPES(Impl);
|
||||
|
||||
// necessary for HmcObservable compatibility
|
||||
typedef typename Impl::Field Field;
|
||||
|
||||
void TrajectoryComplete(int traj,
|
||||
Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
|
||||
ComplexD polyakov = WilsonLoops<Impl>::avgPolyakovLoop(U);
|
||||
|
||||
int def_prec = std::cout.precision();
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
|
||||
<< "Polyakov Loop: [ " << traj << " ] "<< polyakov << std::endl;
|
||||
|
||||
std::cout.precision(def_prec);
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace QCD
|
||||
} // namespace Grid
|
||||
|
||||
#endif // HMC_POLYAKOV_H
|
@ -23,6 +23,7 @@ class AdjointRep {
|
||||
typedef typename SU_Adjoint<ncolour>::LatticeAdjMatrix LatticeMatrix;
|
||||
typedef typename SU_Adjoint<ncolour>::LatticeAdjField LatticeField;
|
||||
static const int Dimension = ncolour * ncolour - 1;
|
||||
static const bool isFundamental = false;
|
||||
|
||||
LatticeField U;
|
||||
|
||||
|
@ -19,6 +19,7 @@ template <int ncolour>
|
||||
class FundamentalRep {
|
||||
public:
|
||||
static const int Dimension = ncolour;
|
||||
static const bool isFundamental = true;
|
||||
|
||||
// typdef to be used by the Representations class in HMC to get the
|
||||
// types for the higher representation fields
|
||||
|
@ -29,6 +29,7 @@ class TwoIndexRep {
|
||||
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
|
||||
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
|
||||
static const int Dimension = ncolour * (ncolour + S) / 2;
|
||||
static const bool isFundamental = false;
|
||||
|
||||
LatticeField U;
|
||||
|
||||
|
@ -123,6 +123,28 @@ public:
|
||||
return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// average over all x,y,z the temporal loop
|
||||
//////////////////////////////////////////////////
|
||||
static ComplexD avgPolyakovLoop(const GaugeField &Umu) { //assume Nd=4
|
||||
GaugeMat Ut(Umu._grid), P(Umu._grid);
|
||||
ComplexD out;
|
||||
int T = Umu._grid->GlobalDimensions()[3];
|
||||
int X = Umu._grid->GlobalDimensions()[0];
|
||||
int Y = Umu._grid->GlobalDimensions()[1];
|
||||
int Z = Umu._grid->GlobalDimensions()[2];
|
||||
|
||||
Ut = peekLorentz(Umu,3); //Select temporal direction
|
||||
P = Ut;
|
||||
for (int t=1;t<T;t++){
|
||||
P = Gimpl::CovShiftForward(Ut,3,P);
|
||||
}
|
||||
RealD norm = 1.0/(Nc*X*Y*Z*T);
|
||||
out = sum(trace(P))*norm;
|
||||
return out;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// average over traced single links
|
||||
//////////////////////////////////////////////////
|
||||
@ -190,6 +212,7 @@ public:
|
||||
|
||||
|
||||
// For the force term
|
||||
/*
|
||||
static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
GridBase *grid = Umu._grid;
|
||||
std::vector<GaugeMat> U(Nd, grid);
|
||||
@ -203,7 +226,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
|
||||
for (int nu = 0; nu < Nd; nu++) {
|
||||
if (nu != mu) {
|
||||
// this is ~10% faster than the Staple
|
||||
// this is ~10% faster than the Staple -- PAB: so what it gives the WRONG answers for other BC's!
|
||||
tmp1 = Cshift(U[nu], mu, 1);
|
||||
tmp2 = Cshift(U[mu], nu, 1);
|
||||
staple += tmp1* adj(U[nu]*tmp2);
|
||||
@ -213,7 +236,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
}
|
||||
staple = U[mu]*staple;
|
||||
}
|
||||
|
||||
*/
|
||||
//////////////////////////////////////////////////
|
||||
// the sum over all staples on each site
|
||||
//////////////////////////////////////////////////
|
||||
@ -291,9 +314,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// the sum over all staples on each site in direction mu,nu, lower part
|
||||
//////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
static void StapleLower(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
|
||||
int nu) {
|
||||
if (nu != mu) {
|
||||
@ -315,7 +338,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
//
|
||||
staple = Gimpl::ShiftStaple(
|
||||
Gimpl::CovShiftBackward(U[nu], nu,
|
||||
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
||||
Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
|
||||
mu);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -325,7 +350,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){
|
||||
// Fmn +--<--+ Ut +--<--+
|
||||
// | | | |
|
||||
// (x)+-->--+ +-->--+(x)
|
||||
// (x)+-->--+ +-->--+(x) - h.c.
|
||||
// | | | |
|
||||
// +--<--+ +--<--+
|
||||
|
||||
@ -335,7 +360,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
GaugeMat v = Vup - Vdn;
|
||||
GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu); // some redundant copies
|
||||
GaugeMat vu = v*u;
|
||||
FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
|
||||
//FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
|
||||
FS = (u*v + Cshift(vu, mu, -1));
|
||||
FS = 0.125*(FS - adj(FS));
|
||||
}
|
||||
|
||||
static Real TopologicalCharge(GaugeLorentz &U){
|
||||
@ -360,6 +387,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
return TensorRemove(Tq).real();
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Similar to above for rectangle is required
|
||||
//////////////////////////////////////////////////////
|
||||
|
@ -31,44 +31,78 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
#define GRID_SERIALISATION_ABSTRACT_READER_H
|
||||
|
||||
#include <type_traits>
|
||||
#include <Grid/tensors/Tensors.h>
|
||||
|
||||
namespace Grid {
|
||||
// Vector IO utilities ///////////////////////////////////////////////////////
|
||||
// helper function to read space-separated values
|
||||
// Grid scalar tensors to nested std::vectors //////////////////////////////////
|
||||
template <typename T>
|
||||
std::vector<T> strToVec(const std::string s)
|
||||
struct TensorToVec
|
||||
{
|
||||
std::istringstream sstr(s);
|
||||
T buf;
|
||||
std::vector<T> v;
|
||||
|
||||
while(!sstr.eof())
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct TensorToVec<iScalar<T>>
|
||||
{
|
||||
typedef typename TensorToVec<T>::type type;
|
||||
};
|
||||
|
||||
template <typename T, int N>
|
||||
struct TensorToVec<iVector<T, N>>
|
||||
{
|
||||
typedef typename std::vector<typename TensorToVec<T>::type> type;
|
||||
};
|
||||
|
||||
template <typename T, int N>
|
||||
struct TensorToVec<iMatrix<T, N>>
|
||||
{
|
||||
typedef typename std::vector<std::vector<typename TensorToVec<T>::type>> type;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
typename TensorToVec<T>::type tensorToVec(const T &t)
|
||||
{
|
||||
return t;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename TensorToVec<iScalar<T>>::type tensorToVec(const iScalar<T>& t)
|
||||
{
|
||||
return tensorToVec(t._internal);
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
typename TensorToVec<iVector<T, N>>::type tensorToVec(const iVector<T, N>& t)
|
||||
{
|
||||
typename TensorToVec<iVector<T, N>>::type v;
|
||||
|
||||
v.resize(N);
|
||||
for (unsigned int i = 0; i < N; i++)
|
||||
{
|
||||
sstr >> buf;
|
||||
v.push_back(buf);
|
||||
v[i] = tensorToVec(t._internal[i]);
|
||||
}
|
||||
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// output to streams for vectors
|
||||
template < class T >
|
||||
inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
|
||||
|
||||
template <typename T, int N>
|
||||
typename TensorToVec<iMatrix<T, N>>::type tensorToVec(const iMatrix<T, N>& t)
|
||||
{
|
||||
os << "[";
|
||||
for (auto &x: v)
|
||||
typename TensorToVec<iMatrix<T, N>>::type v;
|
||||
|
||||
v.resize(N);
|
||||
for (unsigned int i = 0; i < N; i++)
|
||||
{
|
||||
os << x << " ";
|
||||
v[i].resize(N);
|
||||
for (unsigned int j = 0; j < N; j++)
|
||||
{
|
||||
v[i][j] = tensorToVec(t._internal[i][j]);
|
||||
}
|
||||
}
|
||||
if (v.size() > 0)
|
||||
{
|
||||
os << "\b";
|
||||
}
|
||||
os << "]";
|
||||
|
||||
return os;
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
// Vector element trait //////////////////////////////////////////////////////
|
||||
template <typename T>
|
||||
struct element
|
||||
@ -151,15 +185,15 @@ namespace Grid {
|
||||
do
|
||||
{
|
||||
is.get(c);
|
||||
} while (c != '<' && !is.eof());
|
||||
if (c == '<')
|
||||
} while (c != '(' && !is.eof());
|
||||
if (c == '(')
|
||||
{
|
||||
int start = is.tellg();
|
||||
do
|
||||
{
|
||||
is.get(c);
|
||||
} while (c != '>' && !is.eof());
|
||||
if (c == '>')
|
||||
} while (c != ')' && !is.eof());
|
||||
if (c == ')')
|
||||
{
|
||||
int end = is.tellg();
|
||||
int psize = end - start - 1;
|
||||
@ -182,7 +216,43 @@ namespace Grid {
|
||||
template <class T1, class T2>
|
||||
inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
|
||||
{
|
||||
os << "<" << p.first << " " << p.second << ">";
|
||||
os << "(" << p.first << " " << p.second << ")";
|
||||
return os;
|
||||
}
|
||||
|
||||
// Vector IO utilities ///////////////////////////////////////////////////////
|
||||
// helper function to read space-separated values
|
||||
template <typename T>
|
||||
std::vector<T> strToVec(const std::string s)
|
||||
{
|
||||
std::istringstream sstr(s);
|
||||
T buf;
|
||||
std::vector<T> v;
|
||||
|
||||
while(!sstr.eof())
|
||||
{
|
||||
sstr >> buf;
|
||||
v.push_back(buf);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// output to streams for vectors
|
||||
template < class T >
|
||||
inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
|
||||
{
|
||||
os << "[";
|
||||
for (auto &x: v)
|
||||
{
|
||||
os << x << " ";
|
||||
}
|
||||
if (v.size() > 0)
|
||||
{
|
||||
os << "\b";
|
||||
}
|
||||
os << "]";
|
||||
|
||||
return os;
|
||||
}
|
||||
|
||||
|
@ -79,7 +79,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
|
||||
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
|
||||
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
|
||||
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
|
||||
|
||||
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
|
||||
|
@ -105,7 +105,6 @@ template<class vobj,class cobj>
|
||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||
public:
|
||||
|
||||
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
|
||||
typedef typename cobj::vector_type vector_type;
|
||||
typedef typename cobj::scalar_type scalar_type;
|
||||
typedef typename cobj::scalar_object scalar_object;
|
||||
|
@ -204,7 +204,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
|
||||
// Reinit guard
|
||||
/////////////////////////////////////////////////////////
|
||||
static int Grid_is_initialised = 0;
|
||||
|
||||
static MemoryStats dbgMemStats;
|
||||
|
||||
void Grid_init(int *argc,char ***argv)
|
||||
{
|
||||
@ -220,11 +220,11 @@ void Grid_init(int *argc,char ***argv)
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
|
||||
GridCmdOptionInt(arg,MB);
|
||||
uint64_t MB64 = MB;
|
||||
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
|
||||
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
|
||||
CartesianCommunicator::Hugepages = 1;
|
||||
GlobalSharedMemory::Hugepages = 1;
|
||||
}
|
||||
|
||||
|
||||
@ -251,6 +251,11 @@ void Grid_init(int *argc,char ***argv)
|
||||
assert(fp!=(FILE *)NULL);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
|
||||
MemoryProfiler::debug = true;
|
||||
MemoryProfiler::stats = &dbgMemStats;
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
// Banner
|
||||
////////////////////////////////////
|
||||
@ -324,6 +329,7 @@ void Grid_init(int *argc,char ***argv)
|
||||
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
||||
@ -392,8 +398,8 @@ void Grid_init(int *argc,char ***argv)
|
||||
Grid_default_latt,
|
||||
Grid_default_mpi);
|
||||
|
||||
std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
||||
if ( CartesianCommunicator::Hugepages) {
|
||||
std::cout << GridLogMessage << "Requesting "<< GlobalSharedMemory::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
|
||||
if ( GlobalSharedMemory::Hugepages) {
|
||||
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
|
||||
}
|
||||
|
||||
|
72
lib/util/Profiling.h
Normal file
72
lib/util/Profiling.h
Normal file
@ -0,0 +1,72 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/util/Profiling.h
|
||||
|
||||
Copyright (C) 2018
|
||||
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#ifndef GRID_PERF_PROFILING_H
|
||||
#define GRID_PERF_PROFILING_H
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <fcntl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
|
||||
struct System
|
||||
{
|
||||
static void profile(const std::string& name,std::function<void()> body) {
|
||||
std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name;
|
||||
|
||||
// Launch profiler
|
||||
pid_t pid;
|
||||
std::stringstream s;
|
||||
s << getpid();
|
||||
pid = fork();
|
||||
if (pid == 0) {
|
||||
auto fd=open("/dev/null",O_RDWR);
|
||||
dup2(fd,1);
|
||||
dup2(fd,2);
|
||||
exit(execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",s.str().c_str(),nullptr));
|
||||
}
|
||||
|
||||
// Run body
|
||||
body();
|
||||
|
||||
// Kill profiler
|
||||
kill(pid,SIGINT);
|
||||
waitpid(pid,nullptr,0);
|
||||
}
|
||||
|
||||
static void profile(std::function<void()> body) {
|
||||
profile("perf.data",body);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // GRID_PERF_PROFILING_H
|
Reference in New Issue
Block a user