1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-07-28 02:07:07 +01:00

Merge branch 'develop' into feature/staggered-comms-compute

Conflicts:
	lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
This commit is contained in:
Azusa Yamaguchi
2018-05-21 13:07:29 +01:00
238 changed files with 18155 additions and 7613 deletions

View File

@@ -1,28 +1,18 @@
extra_sources=
extra_headers=
if BUILD_COMMS_MPI
extra_sources+=communicator/Communicator_mpi.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_mpi3.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_MPIT
extra_sources+=communicator/Communicator_mpit.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_SHMEM
extra_sources+=communicator/Communicator_shmem.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryMPI.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryNone.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_HDF5

View File

@@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>

View File

@@ -183,11 +183,13 @@ namespace Grid {
virtual RealD Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in._grid);
Field tmp(in._grid);
tmp.checkerboard = in.checkerboard;
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.checkerboard = in.checkerboard;
MpcDagMpc(in,out,n1,n2);
}
virtual void HermOp(const Field &in, Field &out){
@@ -215,13 +217,15 @@ namespace Grid {
public:
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
// std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
Field tmp(in._grid);
tmp.checkerboard = !in.checkerboard;
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl;
_Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out);
}

View File

@@ -0,0 +1,101 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DEFLATION_H
#define GRID_DEFLATION_H
namespace Grid {
struct ZeroGuesser {
public:
template<class Field>
void operator()(const Field &src,Field &guess) { guess = Zero(); };
};
struct SourceGuesser {
public:
template<class Field>
void operator()(const Field &src,Field &guess) { guess = src; };
};
////////////////////////////////
// Fine grid deflation
////////////////////////////////
template<class Field>
struct DeflatedGuesser {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
void operator()(const Field &src,Field &guess) {
guess = zero;
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
}
};
template<class FineField, class CoarseField>
class LocalCoherenceDeflatedGuesser {
private:
const std::vector<FineField> &subspace;
const std::vector<CoarseField> &evec_coarse;
const std::vector<RealD> &eval_coarse;
public:
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
: subspace(_subspace),
evec_coarse(_evec_coarse),
eval_coarse(_eval_coarse)
{
}
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
};
};
}
#endif

View File

@@ -149,19 +149,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = zero;
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
@@ -181,6 +168,7 @@ enum IRLdiagonalisation {
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{
public:
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@@ -243,6 +231,7 @@ class ImplicitlyRestartedLanczos {
/////////////////////////
public:
//////////////////////////////////////////////////////////////////
// PAB:
//////////////////////////////////////////////////////////////////
@@ -490,15 +479,13 @@ until convergence
Field B(grid); B.checkerboard = evec[0].checkerboard;
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
for(int jj = 1; jj<=Nstop; jj*=2){
int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
if ( j > Nconv ) {
Nconv=j+1;
jj=Nstop; // Terminate the scan
}
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
allconv=0;
}
}
// Do evec[0] for good measure
@@ -506,8 +493,10 @@ until convergence
int j=0;
RealD e = eval2_copy[0];
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
}
if ( allconv ) Nconv = Nstop;
// test if we converged, if so, terminate
std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
// if( Nconv>=Nstop || beta_k < betastp){

View File

@@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {
struct LanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@@ -45,6 +48,7 @@ struct LanczosParams : Serializable {
struct LocalCoherenceLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs,
bool, doFine,
bool, doFineRead,
bool, doCoarse,
@@ -70,21 +74,24 @@ public:
typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
std::vector<FineField> &subspace;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, Aggregation<Fobj,CComplex,nbasis> &aggregate) :
_Linop(linop),
_Aggregate(aggregate) { };
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace)
{
assert(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard;
GridBase *FineGrid = _Aggregate.FineGrid;
FineField fin(FineGrid);
FineField fout(FineGrid);
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
@@ -99,24 +106,27 @@ public:
OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
std::vector<FineField> &subspace;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop,
Aggregation<Fobj,CComplex,nbasis> &aggregate) :
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
LinearOperatorBase<FineField>& linop,
std::vector<FineField> & _subspace) :
_poly(poly),
_Linop(linop),
_Aggregate(aggregate) { };
subspace(_subspace)
{ };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = _Aggregate.FineGrid;
FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard;
FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
}
};
@@ -132,19 +142,23 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
RealD _coarse_relax_tol;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
Aggregation<Fobj,CComplex,nbasis> &Aggregate,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { };
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
{ };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
RealD eval_poly = eval;
// Apply operator
_Poly(B,v);
@@ -168,14 +182,13 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _Aggregate.FineGrid;
int checkerboard = _Aggregate.checkerboard;
GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard;
_Aggregate.PromoteFromSubspace(B,fv);
blockPromote(B,fv,_subspace);
_smoother(_Linop,fv,fB);
RealD eval_poly = eval;
@@ -217,27 +230,65 @@ protected:
int _checkerboard;
LinearOperatorBase<FineField> & _FineOp;
// FIXME replace Aggregation with vector of fine; the code reuse is too small for
// the hassle and complexity of cross coupling.
Aggregation<Fobj,CComplex,nbasis> _Aggregate;
std::vector<RealD> evals_fine;
std::vector<RealD> evals_coarse;
std::vector<CoarseField> evec_coarse;
std::vector<RealD> &evals_fine;
std::vector<RealD> &evals_coarse;
std::vector<FineField> &subspace;
std::vector<CoarseField> &evec_coarse;
private:
std::vector<RealD> _evals_fine;
std::vector<RealD> _evals_coarse;
std::vector<FineField> _subspace;
std::vector<CoarseField> _evec_coarse;
public:
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_Aggregate(CoarseGrid,FineGrid,checkerboard),
_FineOp(FineOp),
_checkerboard(checkerboard)
_checkerboard(checkerboard),
evals_fine (_evals_fine),
evals_coarse(_evals_coarse),
subspace (_subspace),
evec_coarse(_evec_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
//////////////////////////////////////////////////////////////////////////
// Alternate constructore, external storage for use by Hadrons module
//////////////////////////////////////////////////////////////////////////
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard,
std::vector<FineField> &ext_subspace,
std::vector<CoarseField> &ext_coarse,
std::vector<RealD> &ext_eval_fine,
std::vector<RealD> &ext_eval_coarse
) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (ext_eval_fine),
evals_coarse(ext_eval_coarse),
subspace (ext_subspace),
evec_coarse (ext_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
};
template<typename T> static RealD normalise(T& v)
{
@@ -246,43 +297,44 @@ public:
v = v * (1.0/nn);
return nn;
}
/*
void fakeFine(void)
{
int Nk = nbasis;
_Aggregate.subspace.resize(Nk,_FineGrid);
_Aggregate.subspace[0]=1.0;
_Aggregate.subspace[0].checkerboard=_checkerboard;
normalise(_Aggregate.subspace[0]);
subspace.resize(Nk,_FineGrid);
subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard;
normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
_Aggregate.subspace[k].checkerboard=_checkerboard;
Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
normalise(_Aggregate.subspace[k]);
subspace[k].checkerboard=_checkerboard;
Op(subspace[k-1],subspace[k]);
normalise(subspace[k]);
}
}
*/
void testFine(RealD resid)
{
assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis);
assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
}
}
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis);
assert(subspace.size() == nbasis);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_subspace);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
for(int k=0;k<evec_coarse.size();k++){
if ( k < nbasis ) {
@@ -302,34 +354,34 @@ public:
PlainHermOp<FineField> Op(_FineOp);
evals_fine.resize(Nm);
_Aggregate.subspace.resize(Nm,_FineGrid);
subspace.resize(Nm,_FineGrid);
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
int Nconv;
IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved
assert(Nstop>=nbasis);
assert(Nconv>=nbasis);
evals_fine.resize(nbasis);
_Aggregate.subspace.resize(nbasis,_FineGrid);
subspace.resize(nbasis,_FineGrid);
}
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_Aggregate);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_subspace);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_subspace,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);

View File

@@ -107,7 +107,12 @@ namespace Grid {
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@@ -129,7 +134,6 @@ namespace Grid {
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
/////////////////////////////////////////////////////
@@ -146,6 +150,7 @@ namespace Grid {
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
@@ -189,7 +194,12 @@ namespace Grid {
CBfactorise=cb;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@@ -225,6 +235,7 @@ namespace Grid {
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
///////////////////////////////////////////////////
@@ -268,7 +279,12 @@ namespace Grid {
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@@ -305,6 +321,7 @@ namespace Grid {
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
@@ -347,7 +364,12 @@ namespace Grid {
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@@ -385,6 +407,7 @@ namespace Grid {
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);

View File

@@ -3,9 +3,12 @@
namespace Grid {
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
@@ -94,4 +97,29 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
}

View File

@@ -63,6 +63,64 @@ namespace Grid {
static void *Lookup(size_t bytes) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl;\
}
#define profilerAllocate(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalAllocated += (bytes);\
s->currentlyAllocated += (bytes);\
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
#define profilerFree(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalFreed += (bytes);\
s->currentlyAllocated -= (bytes);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
void check_huge_pages(void *Buf,uint64_t BYTES);
@@ -92,6 +150,7 @@ public:
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
@@ -122,6 +181,8 @@ public:
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H
@@ -172,10 +233,13 @@ public:
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
@@ -193,29 +257,39 @@ public:
#endif
return ptr;
}
void deallocate(pointer __p, size_type) {
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
#endif
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type) {
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else

View File

@@ -59,6 +59,7 @@ public:
virtual ~GridBase() = default;
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -78,6 +79,8 @@ public:
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded;
public:
////////////////////////////////////////////////////////////////

View File

@@ -97,6 +97,7 @@ public:
///////////////////////
// Grid information
///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
@@ -122,6 +123,7 @@ public:
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
@@ -166,6 +168,7 @@ public:
block = block * _rdimensions[d];
}
};
};
}
#endif

View File

@@ -171,9 +171,8 @@ public:
const std::vector<int> &checker_dim_mask,
int checker_dim)
{
///////////////////////
// Grid information
///////////////////////
_isCheckerBoarded = true;
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();

View File

@@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
#include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h>
#endif

View File

@@ -36,33 +36,9 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////
void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
int CartesianCommunicator::Hugepages = 0;
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
if (heap_bytes >= MAX_MPI_SHM_BYTES) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
assert(heap_bytes<MAX_MPI_SHM_BYTES);
}
return ptr;
}
void CartesianCommunicator::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
/////////////////////////////////
// Grid information queries
@@ -95,251 +71,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
#endif
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size();
assert(_ndimension = parent._ndimension);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent._processor_coor[d] % processors[d];
scoor[d] = parent._processor_coor[d] / processors[d];
ssize[d] = parent._processors[d] / processors[d];
}
int crank; // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
// Mpi uses the reverse Lexico convention to us
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors);
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);
MPI_Comm comm_split;
if ( Nchild > 1 ) {
/*
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._processors.size();d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
*/
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
/*
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
*/
} else {
comm_split=parent.communicator;
srank = 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take an MPI_Comm and self assemble
//////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( communicator_base != communicator_world ) {
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
#if defined(GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
#endif
assert(Size==_Nprocessors);
}
#endif
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
InitFromMPICommunicator(processors,communicator_world);
}
#endif
#if !defined( GRID_COMMS_MPI3)
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
#endif
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
#endif
#if !defined( GRID_COMMS_MPI3)
void CartesianCommunicator::StencilBarrier(void){};
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) {
return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void){
#if 1
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
#endif
#else
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];
#endif
bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
}
#endif
}

View File

@@ -32,117 +32,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPIT
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
class CartesianCommunicator {
public:
class CartesianCommunicator : public SharedMemory {
public:
////////////////////////////////////////////
// Isend/Irecv/Wait, or Sendrecv blocking
// Policies
////////////////////////////////////////////
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
///////////////////////////////////////////
// Up to 65536 ranks per node adequate for now
// 128MB shared memory for comms enought for 48^4 local vol comms
// Give external control (command line override?) of this
///////////////////////////////////////////
static const int MAXLOG2RANKSPERNODE = 16;
static uint64_t MAX_MPI_SHM_BYTES;
static int nCommThreads;
// use explicit huge pages
static int Hugepages;
////////////////////////////////////////////
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
static MPI_Comm communicator_world;
MPI_Comm communicator;
std::vector<MPI_Comm> communicator_halo;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
#endif
////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
// Longer term; drop this in favour of a master / slave model with
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
static int ShmRank;
static int ShmSize;
static int GroupRank;
static int GroupSize;
static int WorldRank;
static int WorldSize;
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
/////////////////////////////////
// Grid information and queries
// Implemented in Communicator_base.C
/////////////////////////////////
static void * ShmCommBuf;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
////////////////////////////////////////////////
// Must call in Grid startup
@@ -158,14 +74,15 @@ class CartesianCommunicator {
virtual ~CartesianCommunicator();
private:
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
#endif
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
@@ -181,8 +98,6 @@ class CartesianCommunicator {
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
int NodeCount(void) ;
int RankCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
@@ -270,16 +185,12 @@ class CartesianCommunicator {
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0);
assert(dim<_ndimension);
int numnode = _processors[dim];
// std::cerr << " AllToAll in.size() "<<in.size()<<std::endl;
// std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
assert(in.size()==out.size());
int numnode = _processors[dim];
uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode;
assert(numnode * words == in.size());
assert(words < (1ULL<<32));
assert(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
}
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);

View File

@@ -1,222 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
}
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised)
MPI_Comm_free(&communicator);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
}

View File

@@ -26,587 +26,253 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <mpi.h>
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;
Grid_MPI_Comm CartesianCommunicator::communicator_world;
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
MPI_Comm CartesianCommunicator::communicator_world;
MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow;
std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
std::vector<void *> CartesianCommunicator::ShmCommBufs;
int CartesianCommunicator::NodeCount(void) { return GroupSize;};
int CartesianCommunicator::RankCount(void) { return WorldSize;};
#undef FORCE_COMMS
void *CartesianCommunicator::ShmBufferSelf(void)
////////////////////////////////////////////
// First initialise of comms system
////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char ***argv)
{
return ShmCommBufs[ShmRank];
}
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = GroupRanks[rank];
assert(gpeer!=ShmRank); // never send to self
assert(rank!=WorldRank);// never send to self
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
// mtrace();
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
assert (provided == MPI_THREAD_MULTIPLE);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0);
}
Grid_quiesce_nodes();
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
if ( WorldRank == 0 ) {
std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl;
}
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator_world, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and noninate the leader
///////////////////////////////////////////////////////////////////
int g=0;
MyGroup.resize(ShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(ShmComm);
ShmCommBuf = 0;
ShmCommBufs.resize(ShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
char shm_name [NAME_MAX];
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
//sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
// printf("Opening file %s \n",shm_name);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
// the posix shm virtual file system
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMOPEN
char shm_name [NAME_MAX];
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (Hugepages) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
#if 0
//#ifdef HAVE_NUMAIF_H
int status;
int flags=MPOL_MF_MOVE;
#ifdef KNL
int nodes=1; // numa domain == MCDRAM
// Find out if in SNC2,SNC4 mode ?
#else
int nodes=r; // numa domain == MPI ID
#endif
unsigned long count=1;
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)ptr );
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
ShmCommBufs[r] =ptr;
}
}
MPI_Barrier(ShmComm);
if ( ShmRank != 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET SHMAT and SHM_HUGETLB flag
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
std::vector<int> shmids(ShmSize);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
key_t key = IPC_PRIVATE;
int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
} else {
printf("shmid: 0x%x\n", shmids[r]);
}
}
}
MPI_Barrier(ShmComm);
MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm);
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (ShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
printf("shmaddr: %p\n", ShmCommBufs[r]);
}
MPI_Barrier(ShmComm);
// Mark for clean up
for(int r=0;r<ShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(ShmComm);
#endif
ShmCommBuf = ShmCommBufs[ShmRank];
MPI_Barrier(ShmComm);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GroupRank;
check[1] = r;
check[2] = 0x5A5A5A;
}
}
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
assert(check[0]==GroupRank);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
}
MPI_Barrier(ShmComm);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
//////////////////////////////////////////////////////////////////////////////////////////////////////////
if (WorldRank == 0){
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< " with "<< ShmSize << " ranks-per-node "<<std::endl;
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
for(int g=0;g<GroupSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
}
std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
for(int g=0;g<ShmSize;g++){
std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
}
for(int g=0;g<GroupSize;g++){
if ( (ShmRank == 0) && (GroupRank==g) ) std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
for(int r=0;r<ShmSize;r++){
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl<<std::flush;
}
MPI_Barrier(communicator_world);
}
}
assert(ShmSetup==0); ShmSetup=1;
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source)
///////////////////////////////////////////////////////////////////////////
// Use cartesian communicators now even in MPI3
///////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor; // my coord
assert(std::abs(shift) <_processors[dim]);
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
source = LexicographicToWorldRank[source];
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
dest = LexicographicToWorldRank[dest];
}// rank is world rank.
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors);
rank = LexicographicToWorldRank[rank];
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}// rank is world rank
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
int lr=-1;
for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor
if( LexicographicToWorldRank[r]==rank) lr = r;
}
assert(lr!=-1);
Lexicographic::CoorFromIndex(coor,lr,_processors);
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////////
MPI_Comm_free(&optimal_comm);
}
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
/*
* Use default in MPI compile
*/
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{
std::cout << "Attempts to split MPI3 communicators will fail until implemented" <<std::endl;
_ndimension = processors.size();
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ;
// std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
// std::cout << " Parent size "<<Nparent <<std::endl;
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
// std::cout << " child size "<<childsize <<std::endl;
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
int crank;
// Mpi uses the reverse Lexico convention to us; so reversed routines called
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
MPI_Comm comm_split;
if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
////////////////////////////////////////////////////////////////
// Split the communicator
////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
} else {
srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take the right SHM buffers
//////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
}
}
for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] );
}
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
int ierr;
communicator=communicator_world;
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
////////////////////////////////////////////////////
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( 0 && (communicator_base != communicator_world) ) {
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
assert(Size==_Nprocessors);
}
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = -1;
for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
if ( (0x1<<i) == ShmSize ) {
log2size = i;
break;
}
}
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
std::vector<int> WorldDims = processors;
ShmDims.resize (_ndimension,1);
GroupDims.resize(_ndimension);
ShmCoor.resize (_ndimension);
GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%_ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<_ndimension;d++){
GroupDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Verbose
////////////////////////////////////////////////////////////////
#if 0
std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl;
std::cout<< GridLogMessage << "SHM ";
for(int d=0;d<_ndimension;d++){
std::cout<< ShmDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage << "Group ";
for(int d=0;d<_ndimension;d++){
std::cout<< GroupDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage<<"World ";
for(int d=0;d<_ndimension;d++){
std::cout<< WorldDims[d] <<" ";
}
std::cout<< std::endl;
#endif
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
assert(WorldSize==_Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
for(int d=0;d<_ndimension;d++){
WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
}
_processor_coor = WorldCoor;
_processor = WorldRank;
///////////////////////////////////////////////////////////////////
// global sum Lexico to World mapping
///////////////////////////////////////////////////////////////////
int lexico;
LexicographicToWorldRank.resize(WorldSize,0);
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
LexicographicToWorldRank[lexico] = WorldRank;
ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
for(int i=0;i<WorldSize;i++){
int wr = LexicographicToWorldRank[i];
// int wr = i;
std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(wr,coor); // from world rank
int ck = RankFromProcessorCoor(coor);
assert(ck==wr);
if ( wr == WorldRank ) {
for(int j=0;j<coor.size();j++) {
assert(coor[j] == _processor_coor[j]);
}
}
/*
std::cout << GridLogMessage<< " Lexicographic "<<i;
std::cout << " MPI rank "<<wr;
std::cout << " Coor ";
for(int j=0;j<coor.size();j++) std::cout << coor[j];
std::cout<< std::endl;
*/
/////////////////////////////////////////////////////
// Check everyone agrees on everyone elses coords
/////////////////////////////////////////////////////
std::vector<int> mcoor = coor;
this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int));
for(int d = 0 ; d< _ndimension; d++) {
assert(coor[d] == mcoor[d]);
}
}
};
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised) {
MPI_Comm_free(&communicator);
for(int i=0;i< communicator_halo.size();i++){
for(int i=0;i<communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]);
}
}
@@ -734,19 +400,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
MPI_Request rrq;
int ierr;
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
#ifdef FORCE_COMMS
gdest = MPI_UNDEFINED;
gfrom = MPI_UNDEFINED;
#endif
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0);
@@ -815,5 +477,38 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
assert(ierr==0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
}

View File

@@ -1,988 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>
//#include <numaif.h>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
/// darwin dispatch semaphores don't seem to be multiprocess.
///
/// ii) openmpi under --mca shmem posix works with two squadrons per node;
/// openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
/// memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
typedef sem_t *Grid_semaphore;
#error /*THis is deprecated*/
#if 0
#define SEM_INIT(S) S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
#define SEM_POST(S) assert ( sem_post(S) == 0 );
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
#else
#define SEM_INIT(S) ;
#define SEM_INIT_EXCL(S) ;
#define SEM_POST(S) ;
#define SEM_WAIT(S) ;
#endif
#include <sys/mman.h>
namespace Grid {
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };
struct Descriptor {
uint64_t buf;
size_t bytes;
int rank;
int tag;
int command;
uint64_t xbuf;
uint64_t rbuf;
int xtag;
int rtag;
int src;
int dest;
MPI_Request request;
};
const int pool = 48;
class SlaveState {
public:
volatile int head;
volatile int start;
volatile int tail;
volatile Descriptor Descrs[pool];
};
class Slave {
public:
Grid_semaphore sem_head;
Grid_semaphore sem_tail;
SlaveState *state;
MPI_Comm squadron;
uint64_t base;
int universe_rank;
int vertical_rank;
char sem_name [NAME_MAX];
////////////////////////////////////////////////////////////
// Descriptor circular pointers
////////////////////////////////////////////////////////////
Slave() {};
void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
void SemInit(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
SEM_INIT(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
SEM_INIT(sem_tail);
}
void SemInitExcl(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
SEM_INIT_EXCL(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
SEM_INIT_EXCL(sem_tail);
}
void WakeUpDMA(void) {
SEM_POST(sem_head);
};
void WakeUpCompute(void) {
SEM_POST(sem_tail);
};
void WaitForCommand(void) {
SEM_WAIT(sem_head);
};
void WaitForComplete(void) {
SEM_WAIT(sem_tail);
};
void EventLoop (void) {
// std::cout<< " Entering event loop "<<std::endl;
while(1){
WaitForCommand();
// std::cout << "Getting command "<<std::endl;
#if 0
_mm_monitor((void *)&state->head,0,0);
int s=state->start;
if ( s != state->head ) {
_mm_mwait(0,0);
}
#endif
Event();
}
}
int Event (void) ;
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;
void WaitAll() {
// std::cout << "Queueing WAIT command "<<std::endl;
QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
// std::cout << "Waking up DMA "<<std::endl;
WakeUpDMA();
// std::cout << "Waiting from semaphore "<<std::endl;
WaitForComplete();
// std::cout << "Checking FIFO is empty "<<std::endl;
while ( state->tail != state->head );
}
};
////////////////////////////////////////////////////////////////////////
// One instance of a data mover.
// Master and Slave must agree on location in shared memory
////////////////////////////////////////////////////////////////////////
class MPIoffloadEngine {
public:
static std::vector<Slave> Slaves;
static int ShmSetup;
static int UniverseRank;
static int UniverseSize;
static MPI_Comm communicator_universe;
static MPI_Comm communicator_cached;
static MPI_Comm HorizontalComm;
static int HorizontalRank;
static int HorizontalSize;
static MPI_Comm VerticalComm;
static MPI_Win VerticalWindow;
static int VerticalSize;
static int VerticalRank;
static std::vector<void *> VerticalShmBufs;
static std::vector<std::vector<int> > UniverseRanks;
static std::vector<int> UserCommunicatorToWorldRanks;
static MPI_Group WorldGroup, CachedGroup;
static void CommunicatorInit (MPI_Comm &communicator_world,
MPI_Comm &ShmComm,
void * &ShmCommBuf);
static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
/////////////////////////////////////////////////////////
// routines for master proc must handle any communicator
/////////////////////////////////////////////////////////
static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
// std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
{
Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
Slaves[slave].WakeUpDMA();
}
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
// std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void WaitAll() {
for(int s=1;s<VerticalSize;s++) {
// std::cout << "Waiting for slave "<< s<<std::endl;
Slaves[s].WaitAll();
}
// std::cout << " Wait all Complete "<<std::endl;
};
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);
}
return;
};
static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
uint8_t * cxbuf = (uint8_t *) xbuf;
uint8_t * crbuf = (uint8_t *) rbuf;
static int rrp=0;
int procs = VerticalSize-1;
int myoff=0;
int mywork=bytes;
QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
rrp = rrp+1;
if ( rrp == (VerticalSize-1) ) rrp = 0;
}
static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
uint8_t * cxbuf = (uint8_t *) xbuf;
uint8_t * crbuf = (uint8_t *) rbuf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src);
}
};
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
std::vector<Slave> MPIoffloadEngine::Slaves;
int MPIoffloadEngine::UniverseRank;
int MPIoffloadEngine::UniverseSize;
MPI_Comm MPIoffloadEngine::communicator_universe;
MPI_Comm MPIoffloadEngine::communicator_cached;
MPI_Group MPIoffloadEngine::WorldGroup;
MPI_Group MPIoffloadEngine::CachedGroup;
MPI_Comm MPIoffloadEngine::HorizontalComm;
int MPIoffloadEngine::HorizontalRank;
int MPIoffloadEngine::HorizontalSize;
MPI_Comm MPIoffloadEngine::VerticalComm;
int MPIoffloadEngine::VerticalSize;
int MPIoffloadEngine::VerticalRank;
MPI_Win MPIoffloadEngine::VerticalWindow;
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
int CartesianCommunicator::NodeCount(void) { return HorizontalSize;};
int MPIoffloadEngine::ShmSetup = 0;
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
MPI_Comm &ShmComm,
void * &ShmCommBuf)
{
int flag;
assert(ShmSetup==0);
//////////////////////////////////////////////////////////////////////
// Universe is all nodes prior to squadron grouping
//////////////////////////////////////////////////////////////////////
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
MPI_Comm_rank(communicator_universe,&UniverseRank);
MPI_Comm_size(communicator_universe,&UniverseSize);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory (Verticals)
/////////////////////////////////////////////////////////////////////
#undef MPI_SHARED_MEM_DEBUG
#ifdef MPI_SHARED_MEM_DEBUG
MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
#else
MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
#endif
MPI_Comm_rank(VerticalComm ,&VerticalRank);
MPI_Comm_size(VerticalComm ,&VerticalSize);
//////////////////////////////////////////////////////////////////////
// Split into horizontal groups by rank in squadron
//////////////////////////////////////////////////////////////////////
MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
MPI_Comm_rank(HorizontalComm,&HorizontalRank);
MPI_Comm_size(HorizontalComm,&HorizontalSize);
assert(HorizontalSize*VerticalSize==UniverseSize);
////////////////////////////////////////////////////////////////////////////////
// What is my place in the world
////////////////////////////////////////////////////////////////////////////////
int WorldRank=0;
if(VerticalRank==0) WorldRank = HorizontalRank;
int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
assert(ierr==0);
////////////////////////////////////////////////////////////////////////////////
// Where is the world in the universe?
////////////////////////////////////////////////////////////////////////////////
UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
for(int w=0;w<HorizontalSize;w++){
ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group, pass back Shm info to CartesianCommunicator
//////////////////////////////////////////////////////////////////////////////////////////////////////////
VerticalShmBufs.resize(VerticalSize);
#undef MPI_SHARED_MEM
#ifdef MPI_SHARED_MEM
ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
assert(ierr==0);
// std::cout<<"SHM "<<ShmCommBuf<<std::endl;
for(int r=0;r<VerticalSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
#else
char shm_name [NAME_MAX];
MPI_Barrier(VerticalComm);
if ( VerticalRank == 0 ) {
for(int r=0;r<VerticalSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
if ( r>0 ) size = sizeof(SlaveState);
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
if ( fd < 0 ) {
perror("failed shm_open");
assert(0);
}
ftruncate(fd, size);
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( VerticalShmBufs[r] == MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
/*
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
*/
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
check[0] = WorldRank;
check[1] = r;
// std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
MPI_Barrier(VerticalComm);
if ( VerticalRank != 0 ) {
for(int r=0;r<VerticalSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
if ( r>0 ) size = sizeof(SlaveState);
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
if ( fd<0 ) {
perror("failed shm_open");
assert(0);
}
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
assert(check[0]== WorldRank);
assert(check[1]== r);
// std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
#endif
MPI_Barrier(VerticalComm);
//////////////////////////////////////////////////////////////////////
// Map rank of leader on node in their in new world, to the
// rank in this vertical plane's horizontal communicator
//////////////////////////////////////////////////////////////////////
communicator_world = HorizontalComm;
ShmComm = VerticalComm;
ShmCommBuf = VerticalShmBufs[0];
MPI_Comm_group (communicator_world, &WorldGroup);
///////////////////////////////////////////////////////////
// Start the slave data movers
///////////////////////////////////////////////////////////
if ( VerticalRank != 0 ) {
Slave indentured;
indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
indentured.SemInitExcl();// init semaphore in shared memory
MPI_Barrier(VerticalComm);
MPI_Barrier(VerticalComm);
indentured.EventLoop();
assert(0);
} else {
Slaves.resize(VerticalSize);
for(int i=1;i<VerticalSize;i++){
Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
}
MPI_Barrier(VerticalComm);
for(int i=1;i<VerticalSize;i++){
Slaves[i].SemInit();// init semaphore in shared memory
}
MPI_Barrier(VerticalComm);
}
///////////////////////////////////////////////////////////
// Verbose for now
///////////////////////////////////////////////////////////
ShmSetup=1;
if (UniverseRank == 0){
std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
std::cout<<UniverseSize << " Ranks " ;
std::cout<<HorizontalSize << " Nodes " ;
std::cout<<VerticalSize << " with ranks-per-node "<<std::endl;
std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
for(int g=0;g<HorizontalSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
}
for(int g=0;g<HorizontalSize;g++){
std::cout<<GridLogMessage<<" { ";
for(int s=0;s<VerticalSize;s++){
std::cout<< UniverseRanks[g][s];
if ( s<VerticalSize-1 ) {
std::cout<<",";
}
}
std::cout<<" } "<<std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////
// Map the communicator into communicator_world, and find the neighbour.
// Cache the mappings; cache size is 1.
///////////////////////////////////////////////////////////////////////////////////////////////
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
if ( comm == HorizontalComm ) {
comm_world_peer = rank;
// std::cout << " MapCommRankToWorldRank horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
} else if ( comm == communicator_cached ) {
comm_world_peer = UserCommunicatorToWorldRanks[rank];
// std::cout << " MapCommRankToWorldRank cached " <<rank<<"->"<<comm_world_peer<<std::endl;
} else {
int size;
MPI_Comm_size(comm,&size);
UserCommunicatorToWorldRanks.resize(size);
std::vector<int> cached_ranks(size);
for(int r=0;r<size;r++) {
cached_ranks[r]=r;
}
communicator_cached=comm;
MPI_Comm_group(communicator_cached, &CachedGroup);
MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
comm_world_peer = UserCommunicatorToWorldRanks[rank];
// std::cout << " MapCommRankToWorldRank cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
assert(comm_world_peer != MPI_UNDEFINED);
}
assert( (tag & (~0xFFFFL)) ==0);
uint64_t icomm = (uint64_t)comm;
int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
// hashtag = (comm_hash<<15) | tag;
hashtag = tag;
};
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
{
squadron=_squadron;
universe_rank=_universe_rank;
vertical_rank=_vertical_rank;
state =_state;
// std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
state->head = state->tail = state->start = 0;
base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
int rank; MPI_Comm_rank(_squadron,&rank);
}
#define PERI_PLUS(A) ( (A+1)%pool )
int Slave::Event (void) {
static int tail_last;
static int head_last;
static int start_last;
int ierr;
MPI_Status stat;
static int i=0;
////////////////////////////////////////////////////
// Try to advance the start pointers
////////////////////////////////////////////////////
int s=state->start;
if ( s != state->head ) {
switch ( state->Descrs[s].command ) {
case COMMAND_ISEND:
ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
state->Descrs[s].rank,
state->Descrs[s].tag,
MPIoffloadEngine::communicator_universe,
(MPI_Request *)&state->Descrs[s].request);
assert(ierr==0);
state->start = PERI_PLUS(s);
return 1;
break;
case COMMAND_IRECV:
ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
state->Descrs[s].rank,
state->Descrs[s].tag,
MPIoffloadEngine::communicator_universe,
(MPI_Request *)&state->Descrs[s].request);
// std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
// std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
assert(ierr==0);
state->start = PERI_PLUS(s);
return 1;
break;
case COMMAND_SENDRECV:
// fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10);
ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
assert(ierr==0);
// fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
// MPI_Barrier(MPIoffloadEngine::HorizontalComm);
// fprintf(stderr,"Barrier\n");
i++;
state->start = PERI_PLUS(s);
return 1;
break;
case COMMAND_WAITALL:
for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
}
};
s=PERI_PLUS(s);
state->start = s;
state->tail = s;
WakeUpCompute();
return 1;
break;
default:
assert(0);
break;
}
}
return 0;
}
//////////////////////////////////////////////////////////////////////////////
// External interaction with the queue
//////////////////////////////////////////////////////////////////////////////
void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
{
int head =state->head;
int next = PERI_PLUS(head);
// Set up descriptor
int worldrank;
int hashtag;
MPI_Comm communicator;
MPI_Request request;
uint64_t relative;
relative = (uint64_t)xbuf - base;
state->Descrs[head].xbuf = relative;
relative= (uint64_t)rbuf - base;
state->Descrs[head].rbuf = relative;
state->Descrs[head].bytes = bytes;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
state->Descrs[head].dest = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].xtag = hashtag;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
state->Descrs[head].src = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].rtag = hashtag;
state->Descrs[head].command= COMMAND_SENDRECV;
// Block until FIFO has space
while( state->tail==next );
// Msync on weak order architectures
// Advance pointer
state->head = next;
};
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
{
/////////////////////////////////////////
// Spin; if FIFO is full until not full
/////////////////////////////////////////
int head =state->head;
int next = PERI_PLUS(head);
// Set up descriptor
int worldrank;
int hashtag;
MPI_Comm communicator;
MPI_Request request;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
uint64_t relative= (uint64_t)buf - base;
state->Descrs[head].buf = relative;
state->Descrs[head].bytes = bytes;
state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].tag = hashtag;
state->Descrs[head].command= command;
/*
if ( command == COMMAND_ISEND ) {
std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank
<< " to worldrank " << worldrank <<std::endl;
std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
}
if ( command == COMMAND_IRECV ) {
std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank
<< " from worldrank " << worldrank <<std::endl;
std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
}
*/
// Block until FIFO has space
while( state->tail==next );
// Msync on weak order architectures
// Advance pointer
state->head = next;
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
communicator_world = MPI_COMM_WORLD;
MPI_Comm ShmComm;
MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size;
MPI_Comm_size(communicator_world,&Size);
assert(Size==_Nprocessors);
_processor_coor.resize(_ndimension);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank (communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
};
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
}
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
uint64_t xmit_i = (uint64_t) xmit;
uint64_t recv_i = (uint64_t) recv;
uint64_t shm = (uint64_t) ShmCommBuf;
// assert xmit and recv lie in shared memory region
assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
assert(from!=_processor);
assert(dest!=_processor);
MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
//MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
//MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
//MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
MPIoffloadEngine::WaitAll();
//this->Barrier();
}
void CartesianCommunicator::StencilBarrier(void) { }
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
void *CartesianCommunicator::ShmBuffer(int rank) {
return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
};

View File

@@ -1,273 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
}
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised){
MPI_Comm_free(&communicator);
for(int i=0;i< communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]);
}
}
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
list.push_back(req[0]);
list.push_back(req[1]);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
int nreq=waitall.size();
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
}
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo.size()<< <std::endl;
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
return 2.0*bytes;
}
}

View File

@@ -32,14 +32,22 @@ namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char *** arv)
{
ShmInitGeneric();
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors) { srank=0;}
: CartesianCommunicator(processors)
{
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
@@ -54,6 +62,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(_processors[d]==1);
_processor_coor[d] = 0;
}
SetCommunicator(communicator_world);
}
CartesianCommunicator::~CartesianCommunicator(){}
@@ -121,6 +130,36 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0;
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};
}

View File

@@ -1,357 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_shmem.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <mpp/shmem.h>
#include <array>
namespace Grid {
// Should error check all MPI calls.
#define SHMEM_VET(addr)
#define SHMEM_VET_DEBUG(addr) { \
if ( ! shmem_addr_accessible(addr,_processor) ) {\
std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
BACKTRACEFILE(); \
}\
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
typedef struct HandShake_t {
uint64_t seq_local;
uint64_t seq_remote;
} HandShake;
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
ret.fill(SHMEM_SYNC_VALUE);
return ret;
}
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());
RConnections.resize(shmem_n_pes());
for(int pe =0 ; pe<shmem_n_pes();pe++){
XConnections[pe].seq_local = 0;
XConnections[pe].seq_remote= 0;
RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0;
}
shmem_barrier_all();
ShmInitGeneric();
}
CartesianCommunicator::~CartesianCommunicator(){}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
: CartesianCommunicator(processors)
{
std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
_processor = shmem_my_pe();
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size = shmem_n_pes();
assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1;
// int pestart=0;
// int logStride=0;
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all(); // necessary?
u = dest;
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1;
// int pestart=0;
// int logStride=0;
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all(); // necessary?
u = dest;
}
void CartesianCommunicator::GlobalSum(float &f){
static float source ;
static float dest ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = f;
dest =0.0;
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
f = dest;
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
static float source ;
static float dest = 0 ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(f,_processor) ){
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
return;
}
for(int i=0;i<N;i++){
dest =0.0;
source = f[i];
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
f[i] = dest;
}
}
void CartesianCommunicator::GlobalSum(double &d)
{
static double source;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = d;
dest = 0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
d = dest;
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
static double source ;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(d,_processor) ){
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
return;
}
for(int i=0;i<N;i++){
source = d[i];
dest =0.0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
d[i] = dest;
}
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor;
assert(std::abs(shift) <_processors[dim]);
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
Lexicographic::CoorFromIndex(coor,rank,_processors);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
SHMEM_VET(xmit);
SHMEM_VET(recv);
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
static uint64_t seq;
assert(recv!=xmit);
volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
if ( _processor == sender ) {
// Check he has posted a receive
while(SendSeq->seq_remote == SendSeq->seq_local);
// Advance our send count
seq = ++(SendSeq->seq_local);
// Send this packet
SHMEM_VET(recv);
shmem_putmem(recv,xmit,bytes,receiver);
shmem_fence();
//Notify him we're done
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
shmem_fence();
}
if ( _processor == receiver ) {
// Post a receive
seq = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
// Now wait until he has advanced our reception counter
while(RecvSeq->seq_remote != RecvSeq->seq_local);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
SHMEM_VET(xmit);
SHMEM_VET(recv);
// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
shmem_putmem(recv,xmit,bytes,dest);
if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
// shmem_quiet(); // I'm done
if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// He's done too
}
void CartesianCommunicator::Barrier(void)
{
shmem_barrier_all();
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
int words = bytes/4;
if ( shmem_addr_accessible(data,_processor) ){
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data());
return;
}
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
if ( shmem_my_pe() != root ) {
array[w] = word;
}
shmem_barrier_all();
}
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
int words = bytes/4;
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
if ( shmem_my_pe() != root ) {
array[w]= word;
}
shmem_barrier_all();
}
}
int CartesianCommunicator::RankWorld(void){
return shmem_my_pe();
}
}

View File

@@ -0,0 +1,92 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
// static data
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup;
int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
int GlobalSharedMemory::WorldShmSize;
std::vector<int> GlobalSharedMemory::WorldShmRanks;
Grid_MPI_Comm GlobalSharedMemory::WorldComm;
int GlobalSharedMemory::WorldSize;
int GlobalSharedMemory::WorldRank;
int GlobalSharedMemory::WorldNodes;
int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void)
{
assert(_ShmAlloc);
assert(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes);
}
_ShmAlloc = 0;
_ShmAllocBytes = 0;
}
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size);
}
return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
void *SharedMemory::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
}

View File

@@ -0,0 +1,165 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once
#include <Grid/GridCore.h>
#if defined (GRID_COMMS_MPI3)
#include <mpi.h>
#endif
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
namespace Grid {
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
class GlobalSharedMemory {
private:
static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation
static int _ShmSetup;
static int _ShmAlloc;
static uint64_t _ShmAllocBytes;
public:
static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
static uint64_t MAX_MPI_SHM_BYTES;
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
static Grid_MPI_Comm WorldShmComm;
static int WorldShmRank;
static int WorldShmSize;
static int WorldNodes;
static int WorldNode;
static std::vector<int> WorldShmRanks;
//////////////////////////////////////////////////////////////////////////////////////
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
};
//////////////////////////////
// one per communicator
//////////////////////////////
class SharedMemory
{
private:
static const int MAXLOG2RANKSPERNODE = 16;
size_t heap_top;
size_t heap_bytes;
size_t heap_size;
protected:
Grid_MPI_Comm ShmComm; // for barriers
int ShmRank;
int ShmSize;
std::vector<void *> ShmCommBufs;
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
public:
SharedMemory() {};
~SharedMemory();
///////////////////////////////////////////////////////////////////////////////////////
// set the buffers & sizes
///////////////////////////////////////////////////////////////////////////////////////
void SetCommunicator(Grid_MPI_Comm comm);
////////////////////////////////////////////////////////////////////////
// For this instance ; disjoint buffer sets between splits if split grid
////////////////////////////////////////////////////////////////////////
void ShmBarrier(void);
///////////////////////////////////////////////////
// Call on any instance
///////////////////////////////////////////////////
void SharedMemoryTest(void);
void *ShmBufferSelf(void);
void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////
int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
int RankCount(void) { return GlobalSharedMemory::WorldSize;};
};
}

View File

@@ -0,0 +1,652 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <pwd.h>
namespace Grid {
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize);
// WorldComm, WorldSize, WorldRank
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
// WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes
WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ?
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (WorldComm, &WorldGroup);
MPI_Comm_group (WorldShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize); for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
WorldShmRanks.resize(WorldSize);
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
std::vector<int> MyGroup;
MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){
assert(g<WorldShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(WorldNodes,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the node of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
WorldNode=-1;
for(int g=0;g<WorldNodes;g++){
if (myleader == leaders_group[g]){
WorldNode=g;
}
}
assert(WorldNode!=-1);
_ShmSetup=1;
}
// Gray encode support
int BinaryToGray (int binary) {
int gray = (binary>>1)^binary;
return gray;
}
int Log2Size(int TwoToPower,int MAXLOG2)
{
int log2size = -1;
for(int i=0;i<=MAXLOG2;i++){
if ( (0x1<<i) == TwoToPower ) {
log2size = i;
break;
}
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
#undef HYPERCUBE
#ifdef HYPERCUBE
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname
////////////////////////////////////////////////////////////////
// n runs 0...7 9...16 18...25 27...34 (8*4) 5 bits
// i runs 0..7 3 bits
// r runs 0..3 2 bits
// 2^10 = 1024 nodes
const int maxhdim = 10;
std::vector<int> HyperCubeCoords(maxhdim,0);
std::vector<int> RootHyperCubeCoords(maxhdim,0);
int R;
int I;
int N;
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
// Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
assert(nscan==3);
int nlo = N%9;
int nhi = N/9;
uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
uint32_t rootcoor = hypercoor;
//////////////////////////////////////////////////////////////////
// Print debug info
//////////////////////////////////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
std::string hname(name);
std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N<<
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
//////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition.
//////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor;
assert(hypercoor<WorldSize);
assert(hypercoor>=0);
//////////////////////////////////////
// Printing
//////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Map Hcube according to physical lattice
// must partition. Loop over dims and find out who would join.
////////////////////////////////////////////////////////////////
int hcoor = hypercoor;
for(int d=0;d<ndimension;d++){
int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
int msk = (0x1<<bits)-1;
HyperCoor[d]=hcoor & msk;
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
hcoor = hcoor >> bits;
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#else
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
std::vector<int> shmids(WorldShmSize);
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
key_t key = IPC_PRIVATE;
int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
}
}
}
MPI_Barrier(WorldShmComm);
MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
MPI_Barrier(WorldShmComm);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (WorldShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
}
MPI_Barrier(WorldShmComm);
///////////////////////////////////
// Mark for clean up
///////////////////////////////////
for(int r=0;r<WorldShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(WorldShmComm);
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
for(int r=0;r<WorldShmSize;r++){
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
int fd=-1;
int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHMOPEN
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
// the posix shm virtual file system
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
char shm_name [NAME_MAX];
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (flags) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
MPI_Barrier(WorldShmComm);
if ( WorldShmRank != 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes ;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
int rank, size;
MPI_Comm_rank(comm,&rank);
MPI_Comm_size(comm,&size);
ShmRanks.resize(size);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
ShmCommBufs.resize(ShmSize);
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
assert (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){
uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
}
ShmBufferFreeAll();
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group FullGroup, ShmGroup;
MPI_Comm_group (comm , &FullGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void)
{
MPI_Barrier (ShmComm);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void)
{
ShmBarrier();
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GlobalSharedMemory::WorldNode;
check[1] = r;
check[2] = 0x5A5A5A;
}
}
ShmBarrier();
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
}
ShmBarrier();
}
void *SharedMemory::ShmBuffer(int rank)
{
int gpeer = ShmRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
SharedMemory::~SharedMemory()
{
int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
if ( !MPI_is_finalised ) {
MPI_Comm_free(&ShmComm);
}
};
}

View File

@@ -0,0 +1,128 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = 0;
WorldRank = 0;
WorldSize = 1;
WorldShmComm = 0 ;
WorldShmRank = 0 ;
WorldShmSize = 1 ;
WorldNodes = 1 ;
WorldNode = 0 ;
WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
WorldShmCommBufs.resize(1);
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
optimal_comm = WorldComm;
}
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
#endif
bzero(ShmCommBuf,bytes);
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
};
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
assert(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1);
ShmCommBufs.resize(1);
ShmRanks[0] = 0;
ShmRank = 0;
ShmSize = 1;
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
heap_size = GlobalSharedMemory::ShmAllocBytes();
ShmBufferFreeAll();
return;
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void){ return ; }
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void) { return; }
void *SharedMemory::ShmBuffer(int rank)
{
return NULL;
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
return NULL;
}
SharedMemory::~SharedMemory()
{};
}

View File

@@ -45,31 +45,33 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int ent = 0;
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
buffer[off+bo+b]=rhs._odata[so+o+b];
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
}
}
} else {
int bo=0;
std::vector<std::pair<int,int> > table;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) {
table.push_back(std::pair<int,int> (bo++,o+b));
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
}
}
}
parallel_for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=rhs._odata[so+table[i].second];
}
}
parallel_for(int i=0;i<ent;i++){
buffer[table[i].first]=rhs._odata[table[i].second];
}
}
@@ -140,31 +142,35 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0;
if ( cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
rhs._odata[so+o+b]=buffer[bo+b];
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
}
}
} else {
std::vector<std::pair<int,int> > table;
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
table.push_back(std::pair<int,int> (so+o+b,bo++));
table[ent++]=std::pair<int,int> (so+o+b,bo++);
}
}
}
parallel_for(int i=0;i<table.size();i++){
// std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
rhs._odata[table[i].first]=buffer[table[i].second];
}
}
parallel_for(int i=0;i<ent;i++){
rhs._odata[table[i].first]=buffer[table[i].second];
}
}
@@ -228,29 +234,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
if(cbmask == 0x3 ){
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
} else {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
}
}
parallel_for(int i=0;i<ent;i++){
lhs._odata[table[i].first]=rhs._odata[table[i].second];
}
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
@@ -269,16 +278,28 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e2=rhs._grid->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension];
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
double t_tab,t_perm;
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
}
}}
parallel_for(int i=0;i<ent;i++){
permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
}
}
//////////////////////////////////////////////////////
@@ -291,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
double t_local;
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
} else {
@@ -299,7 +322,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
}
}
template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid = rhs._grid;
int fd = grid->_fdimensions[dimension];
@@ -325,11 +348,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
// FIXME : This must change where we have a
// Rotate slice.
// Document how this works ; why didn't I do this when I first wrote it...
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
@@ -365,10 +384,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
return ret;
}
}
#endif

View File

@@ -54,13 +54,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
if ( !comm_dim ) {
// std::cout << "Cshift_local" <<std::endl;
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
// std::cout << "Cshift_comms_simd" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
// std::cout << "Cshift_comms" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
@@ -91,9 +91,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
//std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
@@ -175,6 +178,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);

View File

@@ -256,9 +256,42 @@ public:
_odata[ss]=r._odata[ss];
}
}
Lattice(Lattice&& r){ // move constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata=std::move(r._odata);
}
inline Lattice<vobj> & operator = (Lattice<vobj> && r)
{
_grid = r._grid;
checkerboard = r.checkerboard;
_odata =std::move(r._odata);
return *this;
}
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
virtual ~Lattice(void) = default;
void reset(GridBase* grid) {
@@ -277,15 +310,6 @@ public:
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
// *=,+=,-= operators inherit behvour from correspond */+/- operation
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {

View File

@@ -179,7 +179,7 @@ namespace Grid {
return ret;
}
#define DECLARE_RELATIONAL(op,functor) \
#define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\
@@ -198,11 +198,6 @@ namespace Grid {
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
} \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \
@@ -212,14 +207,21 @@ namespace Grid {
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \
return lhs op rhs._internal; \
}
} \
#define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
}
DECLARE_RELATIONAL(<,slt);
DECLARE_RELATIONAL(<=,sle);
DECLARE_RELATIONAL(>,sgt);
DECLARE_RELATIONAL(>=,sge);
DECLARE_RELATIONAL(==,seq);
DECLARE_RELATIONAL_EQ(==,seq);
DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL

View File

@@ -52,23 +52,5 @@ namespace Grid {
}
};
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
Real *v_ptr = (Real *)&l._odata[0];
size_t o_len = l._grid->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
}
#endif

View File

@@ -77,9 +77,6 @@ namespace Grid {
// merge of April 11 2017
//<<<<<<< HEAD
// this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{
@@ -91,7 +88,6 @@ namespace Grid {
// all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same
@@ -102,27 +98,6 @@ namespace Grid {
return fine->lSites() / coarse->lSites();
}
/*
// Wrap seed_seq to give common interface with random_device
class fixedSeed {
public:
typedef std::seed_seq::result_type result_type;
std::seed_seq src;
fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
result_type operator () (void){
std::vector<result_type> list(1);
src.generate(list.begin(),list.end());
return list[0];
}
};
=======
>>>>>>> develop
*/
// real scalars are one component
template<class scalar,class distribution,class generator>
@@ -171,7 +146,7 @@ namespace Grid {
// support for parallel init
///////////////////////
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng)
static void Skip(RngEngine &eng,uint64_t site)
{
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
@@ -184,8 +159,11 @@ namespace Grid {
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
/////////////////////////////////////////////////////////////////////////////////////
uint64_t skip = 0x1; skip = skip<<40;
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
uint64_t skip = site;
skip = skip<<40;
eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
static RngEngine Reseed(RngEngine &eng)
@@ -407,15 +385,14 @@ namespace Grid {
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
std::vector<int> gcoor;
int rank,o_idx,i_idx;
// Everybody loops over global volume.
for(int gidx=0;gidx<_grid->_gsites;gidx++){
Skip(master_engine); // Skip to next RNG sequence
parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
// Where is it?
int rank,o_idx,i_idx;
std::vector<int> gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
@@ -423,6 +400,7 @@ namespace Grid {
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
}

View File

@@ -599,6 +599,51 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
extract1(in_vobj, out_ptrs, 0);
}
}
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type
unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
{
typedef typename vobj::vector_type vtype;
GridBase* in_grid = in._grid;
out.resize(in_grid->lSites());
int ndim = in_grid->Nd();
int in_nsimd = vtype::Nsimd();
std::vector<std::vector<int> > in_icoor(in_nsimd);
for(int lane=0; lane < in_nsimd; lane++){
in_icoor[lane].resize(ndim);
in_grid->iCoorFromIindex(in_icoor[lane], lane);
}
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd);
std::vector<int> in_ocoor(ndim);
in_grid->oCoorFromOindex(in_ocoor, in_oidx);
std::vector<int> lcoor(in_grid->Nd());
for(int lane=0; lane < in_nsimd; lane++){
for(int mu=0;mu<ndim;mu++)
lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
int lex;
Lexicographic::IndexFromCoorReversed(lcoor, lex, in_grid->_ldimensions);
out_ptrs[lane] = &out[lex];
}
//Unpack into those ptrs
const vobj & in_vobj = in._odata[in_oidx];
extract1(in_vobj, out_ptrs, 0);
}
}
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value
@@ -648,10 +693,59 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
}
}
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value
&& !isSIMDvectorized<sobj>::value, void>::type
vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
{
typedef typename vobj::vector_type vtype;
GridBase* grid = out._grid;
assert(in.size()==grid->lSites());
int ndim = grid->Nd();
int nsimd = vtype::Nsimd();
std::vector<std::vector<int> > icoor(nsimd);
for(int lane=0; lane < nsimd; lane++){
icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane);
}
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> ptrs(nsimd);
std::vector<int> ocoor(ndim);
grid->oCoorFromOindex(ocoor, oidx);
std::vector<int> lcoor(grid->Nd());
for(int lane=0; lane < nsimd; lane++){
for(int mu=0;mu<ndim;mu++){
lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
}
int lex;
Lexicographic::IndexFromCoorReversed(lcoor, lex, grid->_ldimensions);
ptrs[lane] = &in[lex];
}
//pack from those ptrs
vobj vecobj;
merge1(vecobj, ptrs, 0);
out._odata[oidx] = vecobj;
}
}
//Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
assert(out._grid->Nd() == in._grid->Nd());
assert(out._grid->FullDimensions() == in._grid->FullDimensions());
out.checkerboard = in.checkerboard;
GridBase *in_grid=in._grid;
GridBase *out_grid = out._grid;
@@ -694,30 +788,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////
//
// All to all plan
//
// Subvolume on fine grid is v. Vectors a,b,c,d
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLEST CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mesh of nodes (2) ; subdivide to 1 subdivisions
//
// Lex ord:
// N0 va0 vb0 N1 va1 vb1
//
// For each dimension do an all to all
//
// full AllToAll(0)
// N0 va0 va1 N1 vb0 vb1
//
// REARRANGE
// N0 va01 N1 vb01
//
// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
// NB: Easiest to programme if keep in lex order.
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLE CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -751,9 +821,17 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
//
// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
// NB: Easiest to programme if keep in lex order.
//
/////////////////////////////////////////////////////////
/*
* Let chunk = (fvol*nvec)/sP be size of a chunk. ( Divide lexico vol * nvec into fP/sP = M chunks )
*
* 2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
*
* node 0 1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1).. data chunk x M x sP = fL / sP * M * sP = fL * M growth
* node 1 1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
* node 2 1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
* node 3 1st chunk of node 3M..(3M-1); 2nd chunk of node 2M..(3M-1)..
* etc...
*/
template<class Vobj>
void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
{
@@ -812,57 +890,58 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int nvec = nvector; // Counts down to 1 as we collapse dims
std::vector<int> ldims = full_grid->_ldimensions;
std::vector<int> lcoor(ndim);
for(int d=ndim-1;d>=0;d--){
if ( ratio[d] != 1 ) {
full_grid ->AllToAll(d,alldata,tmpdata);
// std::cout << GridLogMessage << "Grid_split: dim " <<d<<" ratio "<<ratio[d]<<" nvec "<<nvec<<" procs "<<split_grid->_processors[d]<<std::endl;
// for(int v=0;v<nvec;v++){
// std::cout << "Grid_split: alldata["<<v<<"] " << alldata[v] <<std::endl;
// std::cout << "Grid_split: tmpdata["<<v<<"] " << tmpdata[v] <<std::endl;
// }
//////////////////////////////////////////
//Local volume for this dimension is expanded by ratio of processor extents
// Number of vectors is decreased by same factor
// Rearrange to lexico for bigger volume
//////////////////////////////////////////
nvec /= ratio[d];
if ( split_grid->_processors[d] > 1 ) {
alldata=tmpdata;
split_grid->AllToAll(d,alldata,tmpdata);
}
auto rdims = ldims; rdims[d] *= ratio[d];
auto rsites= lsites*ratio[d];
for(int v=0;v<nvec;v++){
auto rdims = ldims;
auto M = ratio[d];
auto rsites= lsites*M;// increases rsites by M
nvec /= M; // Reduce nvec by subdivision factor
rdims[d] *= M; // increase local dim by same factor
// For loop over each site within old subvol
for(int lsite=0;lsite<lsites;lsite++){
int sP = split_grid->_processors[d];
int fP = full_grid->_processors[d];
Lexicographic::CoorFromIndex(lcoor, lsite, ldims);
int fvol = lsites;
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
for(int r=0;r<ratio[d];r++){ // ratio*nvec terms
// Loop over reordered data post A2A
parallel_for(int c=0;c<chunk;c++){
std::vector<int> coor(ndim);
for(int m=0;m<M;m++){
for(int s=0;s<sP;s++){
// addressing; use lexico
int lex_r;
uint64_t lex_c = c+chunk*m+chunk*M*s;
uint64_t lex_fvol_vec = c+chunk*s;
uint64_t lex_fvol = lex_fvol_vec%fvol;
uint64_t lex_vec = lex_fvol_vec/fvol;
auto rcoor = lcoor; rcoor[d] += r*ldims[d];
// which node sets an adder to the coordinate
Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
coor[d] += m*ldims[d];
Lexicographic::IndexFromCoor(coor, lex_r, rdims);
lex_r += lex_vec * rsites;
int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims);
rsite += v * rsites;
// LexicoFind coordinate & vector number within split lattice
alldata[lex_r] = tmpdata[lex_c];
int rmul=nvec*lsites;
int vmul= lsites;
alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul];
// if ( lsite==0 ) {
// std::cout << "Grid_split: grow alldata["<<rsite<<"] " << alldata[rsite] << " <- tmpdata["<< lsite+r*rmul+v*vmul<<"] "<<tmpdata[lsite+r*rmul+v*vmul] <<std::endl;
// }
}
}
}
ldims[d]*= ratio[d];
lsites *= ratio[d];
if ( split_grid->_processors[d] > 1 ) {
tmpdata = alldata;
split_grid->AllToAll(d,tmpdata,alldata);
}
}
}
vectorizeFromLexOrdArray(alldata,split);
@@ -933,72 +1012,74 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
/////////////////////////////////////////////////////////////////
// Start from split grid and work towards full grid
/////////////////////////////////////////////////////////////////
std::vector<int> lcoor(ndim);
std::vector<int> rcoor(ndim);
int nvec = 1;
lsites = split_grid->lSites();
std::vector<int> ldims = split_grid->_ldimensions;
uint64_t rsites = split_grid->lSites();
std::vector<int> rdims = split_grid->_ldimensions;
// for(int d=ndim-1;d>=0;d--){
for(int d=0;d<ndim;d++){
if ( ratio[d] != 1 ) {
auto M = ratio[d];
if ( split_grid->_processors[d] > 1 ) {
tmpdata = alldata;
split_grid->AllToAll(d,tmpdata,alldata);
}
//////////////////////////////////////////
//Local volume for this dimension is expanded by ratio of processor extents
// Number of vectors is decreased by same factor
// Rearrange to lexico for bigger volume
//////////////////////////////////////////
auto rsites= lsites/ratio[d];
auto rdims = ldims; rdims[d]/=ratio[d];
for(int v=0;v<nvec;v++){
// rsite, rcoor --> smaller local volume
// lsite, lcoor --> bigger original (single node?) volume
// For loop over each site within smaller subvol
for(int rsite=0;rsite<rsites;rsite++){
Lexicographic::CoorFromIndex(rcoor, rsite, rdims);
int lsite;
for(int r=0;r<ratio[d];r++){
lcoor = rcoor; lcoor[d] += r*rdims[d];
Lexicographic::IndexFromCoor(lcoor, lsite, ldims); lsite += v * lsites;
int rmul=nvec*rsites;
int vmul= rsites;
tmpdata[rsite+r*rmul+v*vmul]=alldata[lsite];
int sP = split_grid->_processors[d];
int fP = full_grid->_processors[d];
auto ldims = rdims; ldims[d] /= M; // Decrease local dims by same factor
auto lsites= rsites/M; // Decreases rsites by M
int fvol = lsites;
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
{
// Loop over reordered data post A2A
parallel_for(int c=0;c<chunk;c++){
std::vector<int> coor(ndim);
for(int m=0;m<M;m++){
for(int s=0;s<sP;s++){
// addressing; use lexico
int lex_r;
uint64_t lex_c = c+chunk*m+chunk*M*s;
uint64_t lex_fvol_vec = c+chunk*s;
uint64_t lex_fvol = lex_fvol_vec%fvol;
uint64_t lex_vec = lex_fvol_vec/fvol;
// which node sets an adder to the coordinate
Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);
coor[d] += m*ldims[d];
Lexicographic::IndexFromCoor(coor, lex_r, rdims);
lex_r += lex_vec * rsites;
// LexicoFind coordinate & vector number within split lattice
tmpdata[lex_c] = alldata[lex_r];
}
}
}
}
nvec *= ratio[d];
ldims[d]=rdims[d];
lsites =rsites;
if ( split_grid->_processors[d] > 1 ) {
split_grid->AllToAll(d,tmpdata,alldata);
tmpdata=alldata;
}
full_grid ->AllToAll(d,tmpdata,alldata);
rdims[d]/= M;
rsites /= M;
nvec *= M; // Increase nvec by subdivision factor
}
}
lsites = full_grid->lSites();
for(int v=0;v<nvector;v++){
assert(v<full.size());
// assert(v<full.size());
parallel_for(int site=0;site<lsites;site++){
// assert(v*lsites+site < alldata.size());
scalardata[site] = alldata[v*lsites+site];
}
vectorizeFromLexOrdArray(scalardata,full[v]);
}
}
}
#endif

View File

@@ -86,6 +86,7 @@ protected:
Colours &Painter;
int active;
int timing_mode;
int topWidth{-1};
static int timestamp;
std::string name, topName;
std::string COLOUR;
@@ -124,11 +125,17 @@ public:
Reset();
}
}
void setTopWidth(const int w) {topWidth = w;}
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
stream << log.background()<< std::left << log.topName << log.background()<< " : ";
stream << log.background()<< std::left;
if (log.topWidth > 0)
{
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left << log.name << log.background() << " : ";
if ( log.timestamp ) {
log.StopWatch->Stop();

View File

@@ -91,7 +91,7 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid;
int lsites = grid->lSites();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
unvectorizeToLexOrdArray(scalardata,lat);
@@ -110,11 +110,11 @@ class BinaryIO {
lsites = 1;
}
#pragma omp parallel
PARALLEL_REGION
{
uint32_t nersc_csum_thr = 0;
#pragma omp for
PARALLEL_FOR_LOOP_INTERN
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
@@ -124,7 +124,7 @@ class BinaryIO {
}
}
#pragma omp critical
PARALLEL_CRITICAL
{
nersc_csum += nersc_csum_thr;
}
@@ -146,21 +146,23 @@ class BinaryIO {
std::vector<int> local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions();
#pragma omp parallel
PARALLEL_REGION
{
std::vector<int> coor(nd);
uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0;
#pragma omp for
PARALLEL_FOR_LOOP_INTERN
for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
/*
* Scidac csum is rather more heavyweight
* FIXME -- 128^3 x 256 x 16 will overflow.
*/
int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@@ -181,7 +183,7 @@ class BinaryIO {
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
}
#pragma omp critical
PARALLEL_CRITICAL
{
scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr;
@@ -261,7 +263,7 @@ class BinaryIO {
GridBase *grid,
std::vector<fobj> &iodata,
std::string file,
Integer offset,
uint64_t& offset,
const std::string &format, int control,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@@ -370,7 +372,7 @@ class BinaryIO {
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
@@ -429,14 +431,20 @@ class BinaryIO {
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_Offset os;
MPI_File_get_position(fh, &os);
MPI_File_get_byte_offset(fh, os, &disp);
offset = disp;
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
@@ -446,16 +454,20 @@ class BinaryIO {
} else {
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
if (offset) { // Must already exist and contain data
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} else { // Allow create
fout.open(file,std::ios::binary|std::ios::out);
}
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
// std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
@@ -489,6 +501,7 @@ class BinaryIO {
exit(1);
#endif
}
offset = fout.tellp();
fout.close();
}
timer.Stop();
@@ -523,7 +536,7 @@ class BinaryIO {
static inline void readLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
Integer offset,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@@ -533,7 +546,7 @@ class BinaryIO {
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
int lsites = grid->lSites();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@@ -544,7 +557,7 @@ class BinaryIO {
GridStopWatch timer;
timer.Start();
parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier();
@@ -560,7 +573,7 @@ class BinaryIO {
static inline void writeLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
Integer offset,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@@ -569,7 +582,7 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
int lsites = grid->lSites();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@@ -580,7 +593,7 @@ class BinaryIO {
GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
grid->Barrier();
timer.Stop();
@@ -597,7 +610,7 @@ class BinaryIO {
static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
Integer offset,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
@@ -610,8 +623,8 @@ class BinaryIO {
std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid;
int gsites = grid->gSites();
int lsites = grid->lSites();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
@@ -626,7 +639,7 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb);
timer.Start();
parallel_for(int lidx=0;lidx<lsites;lidx++){
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx);
@@ -659,7 +672,7 @@ class BinaryIO {
static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
Integer offset,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
@@ -670,8 +683,8 @@ class BinaryIO {
typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid;
int gsites = grid->gSites();
int lsites = grid->lSites();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
@@ -684,7 +697,7 @@ class BinaryIO {
timer.Start();
std::vector<RNGstate> iodata(lsites);
parallel_for(int lidx=0;lidx<lsites;lidx++){
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
@@ -693,7 +706,6 @@ class BinaryIO {
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
iodata.resize(1);
{
std::vector<RngStateType> tmp(RngStateCount);

View File

@@ -182,6 +182,11 @@ class GridLimeReader : public BinaryIO {
{
filename= _filename;
File = fopen(filename.c_str(), "r");
if (File == nullptr)
{
std::cerr << "cannot open file '" << filename << "'" << std::endl;
abort();
}
LimeR = limeCreateReader(File);
}
/////////////////////////////////////////////
@@ -248,7 +253,6 @@ class GridLimeReader : public BinaryIO {
template<class serialisable_object>
void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
{
std::string xmlstring;
// should this be a do while; can we miss a first record??
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
@@ -262,7 +266,8 @@ class GridLimeReader : public BinaryIO {
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
XmlReader RD(&xmlc[0],"");
std::string xmlstring(&xmlc[0]);
XmlReader RD(xmlstring, true, "");
read(RD,object_name,object);
return;
}
@@ -272,8 +277,10 @@ class GridLimeReader : public BinaryIO {
}
};
class GridLimeWriter : public BinaryIO {
class GridLimeWriter : public BinaryIO
{
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
// FIXME: collective calls or not ?
@@ -282,17 +289,24 @@ class GridLimeWriter : public BinaryIO {
FILE *File;
LimeWriter *LimeW;
std::string filename;
bool boss_node;
GridLimeWriter( bool isboss = true) {
boss_node = isboss;
}
void open(const std::string &_filename) {
filename= _filename;
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
if ( boss_node ) {
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
}
}
/////////////////////////////////////////////
// Close the file
/////////////////////////////////////////////
void close(void) {
fclose(File);
if ( boss_node ) {
fclose(File);
}
// limeDestroyWriter(LimeW);
}
///////////////////////////////////////////////////////
@@ -300,10 +314,12 @@ class GridLimeWriter : public BinaryIO {
///////////////////////////////////////////////////////
int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
{
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
if ( boss_node ) {
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
}
return LIME_SUCCESS;
}
////////////////////////////////////////////
@@ -312,65 +328,99 @@ class GridLimeWriter : public BinaryIO {
template<class serialisable_object>
void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
{
std::string xmlstring;
{
XmlWriter WR("","");
write(WR,object_name,object);
xmlstring = WR.XmlString();
if ( boss_node ) {
std::string xmlstring;
{
XmlWriter WR("","");
write(WR,object_name,object);
xmlstring = WR.XmlString();
}
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
}
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
// std::cout << " File offset is now"<<ftello(File) << std::endl;
}
////////////////////////////////////////////
////////////////////////////////////////////////////
// Write a generic lattice field and csum
////////////////////////////////////////////
// This routine is Collectively called by all nodes
// in communicator used by the field._grid
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
// the same file through different file handles (integer units).
//
// These are both buffered, so why I think this code is right is as follows.
//
// i) write record header to FILE *File, telegraphing the size.
// ii) ftello reads the offset from FILE *File .
// i) write record header to FILE *File, telegraphing the size; flush
// ii) ftello reads the offset from FILE *File .
// iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
// Closes iostream and flushes.
// iv) fseek on FILE * to end of this disjoint section.
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
uint64_t offset = ftello(File);
// std::cout << " Writing to offset "<<offset << std::endl;
GridBase *grid = field._grid;
assert(boss_node == field._grid->IsBoss() );
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
if ( boss_node ) {
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
fflush(File);
}
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////
// Check all nodes agree on file position
////////////////////////////////////////////////
uint64_t offset1;
if ( boss_node ) {
offset1 = ftello(File);
}
grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
///////////////////////////////////////////
// The above is collective. Write by other means into the binary record
///////////////////////////////////////////
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
// fseek(File,0,SEEK_END); offset = ftello(File);std::cout << " offset now "<<offset << std::endl;
err=limeWriterCloseRecord(LimeW); assert(err>=0);
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
///////////////////////////////////////////
// Wind forward and close the record
///////////////////////////////////////////
if ( boss_node ) {
fseek(File,0,SEEK_END);
uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
assert( (offset2-offset1) == PayloadSize);
}
/////////////////////////////////////////////////////////////
// Check MPI-2 I/O did what we expect to file
/////////////////////////////////////////////////////////////
if ( boss_node ) {
err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////
// Write checksum element, propagaing forward from the BinaryIO
// Always pair a checksum with a binary object, and close message
@@ -380,21 +430,26 @@ class GridLimeWriter : public BinaryIO {
std::stringstream streamb; streamb << std::hex << scidac_csumb;
checksum.suma= streama.str();
checksum.sumb= streamb.str();
// std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
if ( boss_node ) {
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
}
}
};
class ScidacWriter : public GridLimeWriter {
public:
template<class SerialisableUserFile>
void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss) { };
template<class SerialisableUserFile>
void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
if ( this->boss_node ) {
writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
@@ -415,9 +470,12 @@ class ScidacWriter : public GridLimeWriter {
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
if ( this->boss_node ) {
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
}
// Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
}
};
@@ -484,6 +542,8 @@ class ScidacReader : public GridLimeReader {
class IldgWriter : public ScidacWriter {
public:
IldgWriter(bool isboss) : ScidacWriter(isboss) {};
///////////////////////////////////
// A little helper
@@ -568,7 +628,6 @@ class IldgWriter : public ScidacWriter {
writeLimeIldgLFN(header.ildg_lfn); // rec
writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
// limeDestroyWriter(LimeW);
fclose(File);
}
};
@@ -644,9 +703,11 @@ class IldgReader : public GridLimeReader {
//////////////////////////////////
// ILDG format record
std::string xmlstring(&xmlc[0]);
if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"ildgFormat",ildgFormat_);
if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
@@ -661,13 +722,13 @@ class IldgReader : public GridLimeReader {
}
if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
FieldMetaData_.ildg_lfn = std::string(&xmlc[0]);
FieldMetaData_.ildg_lfn = xmlstring;
found_ildgLFN = 1;
}
if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"FieldMetaData",FieldMetaData_);
format = FieldMetaData_.floating_point;
@@ -681,18 +742,17 @@ class IldgReader : public GridLimeReader {
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {
std::string xmls(&xmlc[0]);
// is it a USQCD info field
if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {
if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) {
// std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"usqcdInfo",usqcdInfo_);
found_usqcdInfo = 1;
}
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"scidacChecksum",scidacChecksum_);
found_scidacChecksum = 1;
}

View File

@@ -136,8 +136,9 @@ struct scidacRecord : Serializable {
int, typesize,
int, datacount);
scidacRecord() { version =1.0; }
scidacRecord()
: version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
{}
};
////////////////////////

View File

@@ -81,18 +81,16 @@ namespace Grid {
std::string, creation_date,
std::string, archive_date,
std::string, floating_point);
FieldMetaData(void) {
nd=4;
dimension.resize(4);
boundary.resize(4);
scidac_checksuma=0;
scidac_checksumb=0;
checksum=0;
}
// WARNING: non-initialised values might lead to twisted parallel IO
// issues, std::string are fine because they initliase to size 0
// as per C++ standard.
FieldMetaData(void)
: nd(4), dimension(4,0), boundary(4, ""), data_start(0),
link_trace(0.), plaquette(0.), checksum(0),
scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
{}
};
namespace QCD {
using namespace Grid;

View File

@@ -57,7 +57,7 @@ namespace Grid {
// for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{
int offset=0;
uint64_t offset=0;
std::map<std::string,std::string> header;
std::string line;
@@ -139,7 +139,7 @@ namespace Grid {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu._grid;
int offset = readHeader(file,Umu._grid,header);
uint64_t offset = readHeader(file,Umu._grid,header);
FieldMetaData clone(header);
@@ -236,21 +236,25 @@ namespace Grid {
GaugeStatistics(Umu,header);
MachineCharacteristics(header);
int offset;
truncate(file);
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
offset = writeHeader(header,file);
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
writeHeader(header,file);
if ( grid->IsBoss() ) {
writeHeader(header,file);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum
@@ -278,7 +282,7 @@ namespace Grid {
header.plaquette=0.0;
MachineCharacteristics(header);
int offset;
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
@@ -293,12 +297,18 @@ namespace Grid {
header.data_type = std::string("SITMO");
#endif
truncate(file);
offset = writeHeader(header,file);
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
offset = writeHeader(header,file);
if ( grid->IsBoss() ) {
offset = writeHeader(header,file);
}
std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum "
@@ -313,7 +323,7 @@ namespace Grid {
GridBase *grid = parallel._grid;
int offset = readHeader(file,grid,header);
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);

View File

@@ -1,44 +0,0 @@
pugixml [![Build Status](https://travis-ci.org/zeux/pugixml.svg?branch=master)](https://travis-ci.org/zeux/pugixml) [![Build status](https://ci.appveyor.com/api/projects/status/9hdks1doqvq8pwe7/branch/master?svg=true)](https://ci.appveyor.com/project/zeux/pugixml)
=======
pugixml is a C++ XML processing library, which consists of a DOM-like interface with rich traversal/modification
capabilities, an extremely fast XML parser which constructs the DOM tree from an XML file/buffer, and an XPath 1.0
implementation for complex data-driven tree queries. Full Unicode support is also available, with Unicode interface
variants and conversions between different Unicode encodings (which happen automatically during parsing/saving).
pugixml is used by a lot of projects, both open-source and proprietary, for performance and easy-to-use interface.
## Documentation
Documentation for the current release of pugixml is available on-line as two separate documents:
* [Quick-start guide](http://pugixml.org/docs/quickstart.html), that aims to provide enough information to start using the library;
* [Complete reference manual](http://pugixml.org/docs/manual.html), that describes all features of the library in detail.
Youre advised to start with the quick-start guide; however, many important library features are either not described in it at all or only mentioned briefly; if you require more information you should read the complete manual.
## License
This library is available to anybody free of charge, under the terms of MIT License:
Copyright (c) 2006-2015 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -1,7 +1,7 @@
/**
* pugixml parser - version 1.6
* pugixml parser - version 1.9
* --------------------------------------------------------
* Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at http://pugixml.org/
*
* This library is distributed under the MIT License. See notice at the end
@@ -17,6 +17,9 @@
// Uncomment this to enable wchar_t mode
// #define PUGIXML_WCHAR_MODE
// Uncomment this to enable compact mode
// #define PUGIXML_COMPACT
// Uncomment this to disable XPath
// #define PUGIXML_NO_XPATH
@@ -46,7 +49,7 @@
#endif
/**
* Copyright (c) 2006-2015 Arseny Kapoulkine
* Copyright (c) 2006-2018 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -59,7 +62,7 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
/**
* pugixml parser - version 1.6
* pugixml parser - version 1.9
* --------------------------------------------------------
* Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at http://pugixml.org/
*
* This library is distributed under the MIT License. See notice at the end
@@ -13,7 +13,7 @@
#ifndef PUGIXML_VERSION
// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
# define PUGIXML_VERSION 160
# define PUGIXML_VERSION 190
#endif
// Include user configuration file (this can define various configuration macros)
@@ -72,6 +72,44 @@
# endif
#endif
// If the platform is known to have move semantics support, compile move ctor/operator implementation
#ifndef PUGIXML_HAS_MOVE
# if __cplusplus >= 201103
# define PUGIXML_HAS_MOVE
# elif defined(_MSC_VER) && _MSC_VER >= 1600
# define PUGIXML_HAS_MOVE
# endif
#endif
// If C++ is 2011 or higher, add 'noexcept' specifiers
#ifndef PUGIXML_NOEXCEPT
# if __cplusplus >= 201103
# define PUGIXML_NOEXCEPT noexcept
# elif defined(_MSC_VER) && _MSC_VER >= 1900
# define PUGIXML_NOEXCEPT noexcept
# else
# define PUGIXML_NOEXCEPT
# endif
#endif
// Some functions can not be noexcept in compact mode
#ifdef PUGIXML_COMPACT
# define PUGIXML_NOEXCEPT_IF_NOT_COMPACT
#else
# define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT
#endif
// If C++ is 2011 or higher, add 'override' qualifiers
#ifndef PUGIXML_OVERRIDE
# if __cplusplus >= 201103
# define PUGIXML_OVERRIDE override
# elif defined(_MSC_VER) && _MSC_VER >= 1700
# define PUGIXML_OVERRIDE override
# else
# define PUGIXML_OVERRIDE
# endif
#endif
// Character interface macros
#ifdef PUGIXML_WCHAR_MODE
# define PUGIXML_TEXT(t) L ## t
@@ -133,13 +171,13 @@ namespace pugi
// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
const unsigned int parse_eol = 0x0020;
// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
const unsigned int parse_wconv_attribute = 0x0040;
// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
const unsigned int parse_wnorm_attribute = 0x0080;
// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
const unsigned int parse_declaration = 0x0100;
@@ -158,6 +196,11 @@ namespace pugi
// is a valid document. This flag is off by default.
const unsigned int parse_fragment = 0x1000;
// This flag determines if plain character data is be stored in the parent element's value. This significantly changes the structure of
// the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments.
// This flag is off by default.
const unsigned int parse_embed_pcdata = 0x2000;
// The default parsing mode.
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
@@ -184,16 +227,16 @@ namespace pugi
};
// Formatting flags
// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
const unsigned int format_indent = 0x01;
// Write encoding-specific BOM to the output stream. This flag is off by default.
const unsigned int format_write_bom = 0x02;
// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
const unsigned int format_raw = 0x04;
// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
const unsigned int format_no_declaration = 0x08;
@@ -206,6 +249,9 @@ namespace pugi
// Write every attribute on a new line with appropriate indentation. This flag is off by default.
const unsigned int format_indent_attributes = 0x40;
// Don't output empty element tags, instead writing an explicit start and end tag even if there are no children. This flag is off by default.
const unsigned int format_no_empty_element_tags = 0x80;
// The default set of formatting flags.
// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
const unsigned int format_default = format_indent;
@@ -225,7 +271,7 @@ namespace pugi
class xml_node;
class xml_text;
#ifndef PUGIXML_NO_XPATH
class xpath_node;
class xpath_node_set;
@@ -268,7 +314,7 @@ namespace pugi
// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
xml_writer_file(void* file);
virtual void write(const void* data, size_t size);
virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
private:
void* file;
@@ -283,7 +329,7 @@ namespace pugi
xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
virtual void write(const void* data, size_t size);
virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
private:
std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
@@ -299,13 +345,13 @@ namespace pugi
private:
xml_attribute_struct* _attr;
typedef void (*unspecified_bool_type)(xml_attribute***);
public:
// Default constructor. Constructs an empty attribute.
xml_attribute();
// Constructs attribute from internal pointer
explicit xml_attribute(xml_attribute_struct* attr);
@@ -354,6 +400,8 @@ namespace pugi
// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
bool set_value(int rhs);
bool set_value(unsigned int rhs);
bool set_value(long rhs);
bool set_value(unsigned long rhs);
bool set_value(double rhs);
bool set_value(float rhs);
bool set_value(bool rhs);
@@ -367,6 +415,8 @@ namespace pugi
xml_attribute& operator=(const char_t* rhs);
xml_attribute& operator=(int rhs);
xml_attribute& operator=(unsigned int rhs);
xml_attribute& operator=(long rhs);
xml_attribute& operator=(unsigned long rhs);
xml_attribute& operator=(double rhs);
xml_attribute& operator=(float rhs);
xml_attribute& operator=(bool rhs);
@@ -417,7 +467,7 @@ namespace pugi
// Borland C++ workaround
bool operator!() const;
// Comparison operators (compares wrapped node pointers)
bool operator==(const xml_node& r) const;
bool operator!=(const xml_node& r) const;
@@ -438,7 +488,7 @@ namespace pugi
// Get node value, or "" if node is empty or it has no value
// Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
const char_t* value() const;
// Get attribute list
xml_attribute first_attribute() const;
xml_attribute last_attribute() const;
@@ -450,7 +500,7 @@ namespace pugi
// Get next/previous sibling in the children list of the parent node
xml_node next_sibling() const;
xml_node previous_sibling() const;
// Get parent node
xml_node parent() const;
@@ -478,7 +528,7 @@ namespace pugi
// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
bool set_name(const char_t* rhs);
bool set_value(const char_t* rhs);
// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
xml_attribute append_attribute(const char_t* name);
xml_attribute prepend_attribute(const char_t* name);
@@ -532,11 +582,11 @@ namespace pugi
template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
{
if (!_root) return xml_attribute();
for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
if (pred(attrib))
return attrib;
return xml_attribute();
}
@@ -544,11 +594,11 @@ namespace pugi
template <typename Predicate> xml_node find_child(Predicate pred) const
{
if (!_root) return xml_node();
for (xml_node node = first_child(); node; node = node.next_sibling())
if (pred(node))
return node;
return xml_node();
}
@@ -558,7 +608,7 @@ namespace pugi
if (!_root) return xml_node();
xml_node cur = first_child();
while (cur._root && cur._root != _root)
{
if (pred(cur)) return cur;
@@ -590,7 +640,7 @@ namespace pugi
// Recursively traverse subtree with xml_tree_walker
bool traverse(xml_tree_walker& walker);
#ifndef PUGIXML_NO_XPATH
// Select single node by evaluating XPath query. Returns first node from the resulting node set.
xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const;
@@ -601,11 +651,11 @@ namespace pugi
xpath_node_set select_nodes(const xpath_query& query) const;
// (deprecated: use select_node instead) Select single node by evaluating XPath query.
xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
xpath_node select_single_node(const xpath_query& query) const;
PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const;
#endif
// Print subtree using a writer object
void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
@@ -701,6 +751,8 @@ namespace pugi
// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
bool set(int rhs);
bool set(unsigned int rhs);
bool set(long rhs);
bool set(unsigned long rhs);
bool set(double rhs);
bool set(float rhs);
bool set(bool rhs);
@@ -714,6 +766,8 @@ namespace pugi
xml_text& operator=(const char_t* rhs);
xml_text& operator=(int rhs);
xml_text& operator=(unsigned int rhs);
xml_text& operator=(long rhs);
xml_text& operator=(unsigned long rhs);
xml_text& operator=(double rhs);
xml_text& operator=(float rhs);
xml_text& operator=(bool rhs);
@@ -867,11 +921,11 @@ namespace pugi
private:
int _depth;
protected:
// Get current traversal depth
int depth() const;
public:
xml_tree_walker();
virtual ~xml_tree_walker();
@@ -942,13 +996,14 @@ namespace pugi
char_t* _buffer;
char _memory[192];
// Non-copyable semantics
xml_document(const xml_document&);
const xml_document& operator=(const xml_document&);
xml_document& operator=(const xml_document&);
void create();
void destroy();
void _create();
void _destroy();
void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
public:
// Default constructor, makes empty document
@@ -957,6 +1012,12 @@ namespace pugi
// Destructor, invalidates all node/attribute handles to this document
~xml_document();
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
#endif
// Removes all nodes, leaving the empty document
void reset();
@@ -970,7 +1031,7 @@ namespace pugi
#endif
// (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
// Load document from zero-terminated string. No encoding conversions are applied.
xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
@@ -1051,7 +1112,7 @@ namespace pugi
// Non-copyable semantics
xpath_variable(const xpath_variable&);
xpath_variable& operator=(const xpath_variable&);
public:
// Get variable name
const char_t* name() const;
@@ -1095,10 +1156,10 @@ namespace pugi
xpath_variable_set(const xpath_variable_set& rhs);
xpath_variable_set& operator=(const xpath_variable_set& rhs);
#if __cplusplus >= 201103
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xpath_variable_set(xpath_variable_set&& rhs);
xpath_variable_set& operator=(xpath_variable_set&& rhs);
xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
#endif
// Add a new variable or get the existing one, if the types match
@@ -1139,29 +1200,29 @@ namespace pugi
// Destructor
~xpath_query();
#if __cplusplus >= 201103
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xpath_query(xpath_query&& rhs);
xpath_query& operator=(xpath_query&& rhs);
xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT;
xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT;
#endif
// Get query expression return type
xpath_value_type return_type() const;
// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
bool evaluate_boolean(const xpath_node& n) const;
// Evaluate expression as double value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
double evaluate_number(const xpath_node& n) const;
#ifndef PUGIXML_NO_STL
// Evaluate expression as string value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
string_t evaluate_string(const xpath_node& n) const;
#endif
// Evaluate expression as string value in the specified context; performs type conversion if necessary.
// At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
@@ -1188,7 +1249,7 @@ namespace pugi
// Borland C++ workaround
bool operator!() const;
};
#ifndef PUGIXML_NO_EXCEPTIONS
// XPath exception class
class PUGIXML_CLASS xpath_exception: public std::exception
@@ -1201,26 +1262,26 @@ namespace pugi
explicit xpath_exception(const xpath_parse_result& result);
// Get error message
virtual const char* what() const throw();
virtual const char* what() const throw() PUGIXML_OVERRIDE;
// Get parse result
const xpath_parse_result& result() const;
};
#endif
// XPath node class (either xml_node or xml_attribute)
class PUGIXML_CLASS xpath_node
{
private:
xml_node _node;
xml_attribute _attribute;
typedef void (*unspecified_bool_type)(xpath_node***);
public:
// Default constructor; constructs empty XPath node
xpath_node();
// Construct XPath node from XML node/attribute
xpath_node(const xml_node& node);
xpath_node(const xml_attribute& attribute, const xml_node& parent);
@@ -1228,13 +1289,13 @@ namespace pugi
// Get node/attribute, if any
xml_node node() const;
xml_attribute attribute() const;
// Get parent of contained node/attribute
xml_node parent() const;
// Safe bool conversion operator
operator unspecified_bool_type() const;
// Borland C++ workaround
bool operator!() const;
@@ -1260,13 +1321,13 @@ namespace pugi
type_sorted, // Sorted by document order (ascending)
type_sorted_reverse // Sorted by document order (descending)
};
// Constant iterator type
typedef const xpath_node* const_iterator;
// We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
typedef const xpath_node* iterator;
// Default constructor. Constructs empty set.
xpath_node_set();
@@ -1275,49 +1336,49 @@ namespace pugi
// Destructor
~xpath_node_set();
// Copy constructor/assignment operator
xpath_node_set(const xpath_node_set& ns);
xpath_node_set& operator=(const xpath_node_set& ns);
#if __cplusplus >= 201103
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xpath_node_set(xpath_node_set&& rhs);
xpath_node_set& operator=(xpath_node_set&& rhs);
xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
#endif
// Get collection type
type_t type() const;
// Get collection size
size_t size() const;
// Indexing operator
const xpath_node& operator[](size_t index) const;
// Collection iterators
const_iterator begin() const;
const_iterator end() const;
// Sort the collection in ascending/descending order by document order
void sort(bool reverse = false);
// Get first node in the collection by document order
xpath_node first() const;
// Check if collection is empty
bool empty() const;
private:
type_t _type;
xpath_node _storage;
xpath_node* _begin;
xpath_node* _end;
void _assign(const_iterator begin, const_iterator end, type_t type);
void _move(xpath_node_set& rhs);
void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT;
};
#endif
@@ -1325,7 +1386,7 @@ namespace pugi
// Convert wide string to UTF8
std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
// Convert UTF8 to wide string
std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
@@ -1333,13 +1394,13 @@ namespace pugi
// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
typedef void* (*allocation_function)(size_t size);
// Memory deallocation function interface
typedef void (*deallocation_function)(void* ptr);
// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
// Get current memory management functions
allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
@@ -1375,7 +1436,7 @@ namespace std
#endif
/**
* Copyright (c) 2006-2015 Arseny Kapoulkine
* Copyright (c) 2006-2018 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -1388,7 +1449,7 @@ namespace std
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

View File

@@ -1,6 +1,6 @@
pugixml 1.6 - an XML processing library
pugixml 1.9 - an XML processing library
Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
Report bugs and download new versions at http://pugixml.org/
This is the distribution of pugixml, which is a C++ XML processing library,
@@ -28,7 +28,7 @@ The distribution contains the following folders:
This library is distributed under the MIT License:
Copyright (c) 2006-2015 Arseny Kapoulkine
Copyright (c) 2006-2018 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation

View File

@@ -39,6 +39,7 @@ namespace QCD {
static const int Zdir = 2;
static const int Tdir = 3;
static const int Xp = 0;
static const int Yp = 1;
static const int Zp = 2;
@@ -420,15 +421,16 @@ namespace QCD {
//////////////////////////////////////////////
// Fermion <-> propagator assignements
//////////////////////////////////////////////
template <class Prop, class Ferm>
void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
//template <class Prop, class Ferm>
template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Nc; ++i)
for(int i = 0; i < Fimpl::Dimension; ++i)
{
pokeColour(pjs, peekColour(fj, i), i, c);
}
@@ -436,15 +438,16 @@ namespace QCD {
}
}
template <class Prop, class Ferm>
void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
//template <class Prop, class Ferm>
template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Nc; ++i)
for(int i = 0; i < Fimpl::Dimension; ++i)
{
pokeColour(fj, peekColour(pjs, i, c), i);
}
@@ -492,41 +495,17 @@ namespace QCD {
return traceIndex<ColourIndex>(lhs);
}
//////////////////////////////////////////
// Current types
//////////////////////////////////////////
GRID_SERIALIZABLE_ENUM(Current, undef,
Vector, 0,
Axial, 1,
Tadpole, 2);
} //namespace QCD
} // Grid
/*
<<<<<<< HEAD
#include <Grid/qcd/utils/SpaceTimeGrid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/spin/TwoSpinor.h>
#include <Grid/qcd/utils/LinalgUtils.h>
#include <Grid/qcd/utils/CovariantCshift.h>
// Include representations
#include <Grid/qcd/utils/SUn.h>
#include <Grid/qcd/utils/SUnAdjoint.h>
#include <Grid/qcd/utils/SUnTwoIndex.h>
#include <Grid/qcd/representations/hmc_types.h>
// Scalar field
#include <Grid/qcd/utils/ScalarObjs.h>
#include <Grid/qcd/action/Actions.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/qcd/hmc/integrators/Integrator.h>
#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
#include <Grid/qcd/observables/hmc_observable.h>
#include <Grid/qcd/hmc/HMC.h>
//#include <Grid/qcd/modules/mods.h>
=======
>>>>>>> develop
*/
#endif

View File

@@ -52,6 +52,35 @@ namespace QCD {
{
}
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
template<class Impl>
void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
{
int Ls = this->Ls;
FermionField tmp(this->FermionGrid());
tmp = solution5d;
conformable(solution5d._grid,this->FermionGrid());
conformable(exported4d._grid,this->GaugeGrid());
axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
axpby_ssp_pplus (tmp, 1., tmp , 1., solution5d, 0, Ls-1);
ExtractSlice(exported4d, tmp, 0, 0);
}
template<class Impl>
void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{
int Ls = this->Ls;
FermionField tmp(this->FermionGrid());
conformable(imported5d._grid,this->FermionGrid());
conformable(input4d._grid ,this->GaugeGrid());
tmp = zero;
InsertSlice(input4d, tmp, 0 , 0);
InsertSlice(input4d, tmp, Ls-1, 0);
axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
Dminus(tmp,imported5d);
}
template<class Impl>
void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
{
@@ -73,7 +102,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
this->DW(psi,tmp_f,DaggerYes);
for(int s=0;s<Ls;s++){
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
}
}

View File

@@ -83,8 +83,13 @@ namespace Grid {
virtual void M5D (const FermionField &psi, FermionField &chi);
virtual void M5Ddag(const FermionField &psi, FermionField &chi);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
virtual void Dminus(const FermionField &psi, FermionField &chi);
virtual void DminusDag(const FermionField &psi, FermionField &chi);
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl

View File

@@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(Simd::scalar_type);
a2 = a2+sizeof(typename Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;
@@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(Simd::scalar_type);
a2 = a2+sizeof(typename Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;

View File

@@ -295,6 +295,27 @@ namespace Grid {
assert((Ls&0x1)==1); // Odd Ls required
}
template<class Impl>
void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
{
int Ls = this->Ls;
conformable(solution5d._grid,this->FermionGrid());
conformable(exported4d._grid,this->GaugeGrid());
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
}
template<class Impl>
void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{
int Ls = this->Ls;
conformable(imported5d._grid,this->FermionGrid());
conformable(input4d._grid ,this->GaugeGrid());
FermionField tmp(this->FermionGrid());
tmp=zero;
InsertSlice(input4d, tmp, Ls-1, Ls-1);
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
this->Dminus(tmp,imported5d);
}
FermOpTemplateInstantiate(ContinuedFractionFermion5D);
}

View File

@@ -65,6 +65,14 @@ namespace Grid {
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
// virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case
// virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
// Constructors
ContinuedFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@@ -475,7 +475,7 @@ namespace QCD {
}
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type);
a2 = a2 + sizeof(typename Simd::scalar_type);
}
}

View File

@@ -50,11 +50,13 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
////////////////////////////////////////////
#include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
//#include <Grid/qcd/action/fermion/CloverFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
@@ -104,10 +106,33 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermi
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
// Twisted mass fermion
typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;

View File

@@ -70,7 +70,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
#define TwoIndexFermOpTemplateInstantiate(A) \
template class A<WilsonTwoIndexSymmetricImplF>; \
template class A<WilsonTwoIndexSymmetricImplD>;
template class A<WilsonTwoIndexSymmetricImplD>; \
template class A<WilsonTwoIndexAntiSymmetricImplF>; \
template class A<WilsonTwoIndexAntiSymmetricImplD>;
#define FermOp5dVecTemplateInstantiate(A) \
template class A<DomainWallVec5dImplF>; \

View File

@@ -47,6 +47,7 @@ namespace Grid {
INHERIT_IMPL_TYPES(Impl);
FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
virtual ~FermionOperator(void) = default;
virtual FermionField &tmp(void) = 0;
@@ -115,6 +116,34 @@ namespace Grid {
///////////////////////////////////////////////
virtual void ImportGauge(const GaugeField & _U)=0;
//////////////////////////////////////////////////////////////////////
// Conserved currents, either contract at sink or insert sequentially.
//////////////////////////////////////////////////////////////////////
virtual void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)=0;
virtual void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax)=0;
///////////////////////////////////////////////
// Physical field import/export
///////////////////////////////////////////////
virtual void Dminus(const FermionField &psi, FermionField &chi) { chi=psi; }
virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
{
imported = input;
};
virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported)
{
exported=solution;
};
};
}

View File

@@ -164,6 +164,7 @@ namespace QCD {
public:
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false;
static const int Nhcs = Options::Nhcs;
@@ -212,6 +213,13 @@ namespace QCD {
StencilImpl &St) {
mult(&phi(), &U(mu), &chi());
}
inline void multLinkProp(SitePropagator &phi,
const SiteDoubledGaugeField &U,
const SitePropagator &chi,
int mu) {
mult(&phi(), &U(mu), &chi());
}
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -254,8 +262,22 @@ namespace QCD {
GaugeLinkField link(mat._grid);
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
}
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
mat = outerProduct(B,A);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
mat = TraceIndex<SpinIndex>(P);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
int Ls=Btilde._grid->_fdimensions[0];
@@ -277,27 +299,28 @@ namespace QCD {
////////////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index, 5d redblack
////////////////////////////////////////////////////////////////////////////////////
template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > {
public:
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
static const int Dimension = Nrepresentation;
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=true;
static const int Nhcs = Options::Nhcs;
typedef typename Options::_Coeff_t Coeff_t;
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator;
@@ -333,14 +356,27 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
SiteGaugeLink UU;
for (int i = 0; i < Nrepresentation; i++) {
for (int j = 0; j < Nrepresentation; j++) {
for (int i = 0; i < Dimension; i++) {
for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
mult(&phi(), &UU(), &chi());
}
inline void multLinkProp(SitePropagator &phi,
const SiteDoubledGaugeField &U,
const SitePropagator &chi,
int mu) {
SiteGaugeLink UU;
for (int i = 0; i < Dimension; i++) {
for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
mult(&phi(), &UU(), &chi());
}
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
{
SiteScalarGaugeField ScalarUmu;
@@ -373,6 +409,19 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
assert(0);
}
inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
assert(0);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
assert(0);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
assert(0);
@@ -425,25 +474,26 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*?
////////////////////////////////////////////////////////////////////////////////////////
template <class S, int Nrepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
public:
static const int Dimension = Nrepresentation;
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const int Nhcs = Options::Nhcs;
static const bool LsVectorised=false;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Options::_Coeff_t Coeff_t;
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>;
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>;
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>;
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator;
@@ -537,7 +587,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
}
}
// Fixme: Gparity prop * link
inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U,
const SitePropagator &chi, int mu)
{
assert(0);
}
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -611,6 +666,25 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
return;
}
inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
//mat = outerProduct(Btilde, A);
assert(0);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
assert(0);
/*
auto tmp = TraceIndex<SpinIndex>(P);
parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) {
mat[ss]() = tmp[ss](0, 0) + conjugate(tmp[ss](1, 1));
}
*/
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde._grid->_fdimensions[0];
@@ -640,6 +714,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
typedef RealD _Coeff_t ;
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@@ -758,8 +833,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
GaugeLinkField link(mat._grid);
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
assert (0);
// Must never hit
@@ -775,6 +850,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
public:
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=true;
typedef RealD Coeff_t ;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@@ -951,29 +1027,33 @@ typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation, CoeffReal > Wilso
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation, CoeffReal > WilsonTwoIndexSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD; // Double
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double
typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH; // Float
typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF; // Double
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
typedef StaggeredImpl<vComplex, FundamentalRepresentation > StaggeredImplR; // Real.. whichever prec
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float

View File

@@ -569,6 +569,31 @@ void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax)
{
assert(0);
}
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);

View File

@@ -179,6 +179,22 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax);
};
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;

View File

@@ -619,6 +619,30 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
MooeeInv(in, out);
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax)
{
assert(0);
}
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);

View File

@@ -212,6 +212,21 @@ namespace QCD {
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax);
};
}}

View File

@@ -853,7 +853,7 @@ namespace QCD {
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type);
a2 = a2 + sizeof(typename Simd::scalar_type);
}
}

View File

@@ -396,6 +396,27 @@ namespace Grid {
amax=zolo_hi;
}
template<class Impl>
void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
{
int Ls = this->Ls;
conformable(solution5d._grid,this->FermionGrid());
conformable(exported4d._grid,this->GaugeGrid());
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
}
template<class Impl>
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{
int Ls = this->Ls;
conformable(imported5d._grid,this->FermionGrid());
conformable(input4d._grid ,this->GaugeGrid());
FermionField tmp(this->FermionGrid());
tmp=zero;
InsertSlice(input4d, tmp, Ls-1, Ls-1);
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
this->Dminus(tmp,imported5d);
}
// Constructors
template<class Impl>
PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

View File

@@ -70,6 +70,12 @@ namespace Grid {
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
// Constructors
PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@@ -0,0 +1,243 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
Copyright (C) 2017
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/Eigen/Dense>
#include <Grid/qcd/spin/Dirac.h>
namespace Grid
{
namespace QCD
{
// *NOT* EO
template <class Impl>
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out._grid);
// Wilson term
out.checkerboard = in.checkerboard;
this->Dhop(in, out, DaggerNo);
// Clover term
Mooee(in, temp);
out += temp;
return norm2(out);
}
template <class Impl>
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out._grid);
// Wilson term
out.checkerboard = in.checkerboard;
this->Dhop(in, out, DaggerYes);
// Clover term
MooeeDag(in, temp);
out += temp;
return norm2(out);
}
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
WilsonFermion<Impl>::ImportGauge(_Umu);
GridBase *grid = _Umu._grid;
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
// Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
int lvol = _Umu._grid->lSites();
int DimRep = Impl::Dimension;
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
std::vector<int> lcoor;
typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
for (int site = 0; site < lvol; site++)
{
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CloverTerm, lcoor);
Qxinv = zero;
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
}
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.checkerboard = in.checkerboard;
CloverFieldType *Clover;
assert(in.checkerboard == Odd || in.checkerboard == Even);
if (dag)
{
if (in._grid->_isCheckerBoarded)
{
if (in.checkerboard == Odd)
{
Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
}
else
{
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
}
out = *Clover * in;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = adj(*Clover) * in;
}
}
else
{
if (in._grid->_isCheckerBoarded)
{
if (in.checkerboard == Odd)
{
// std::cout << "Calling clover term Odd" << std::endl;
Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
}
else
{
// std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
}
out = *Clover * in;
// std::cout << GridLogMessage << "*Clover.checkerboard " << (*Clover).checkerboard << std::endl;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = *Clover * in;
}
}
} // MooeeInternal
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}
FermOpTemplateInstantiate(WilsonCloverFermion);
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
}
}

View File

@@ -0,0 +1,366 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: David Preti <>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
#define GRID_QCD_WILSON_CLOVER_FERMION_H
#include <Grid/Grid.h>
namespace Grid
{
namespace QCD
{
///////////////////////////////////////////////////////////////////
// Wilson Clover
//
// Operator ( with anisotropy coefficients):
//
// Q = 1 + (Nd-1)/xi_0 + m
// + W_t + (nu/xi_0) * W_s
// - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_s/xi_0) * sum_ss (sigma_ss F_ss) ]
//
// s spatial, t temporal directions.
// where W_t and W_s are the temporal and spatial components of the
// Wilson Dirac operator
//
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
template <class Impl>
class WilsonCloverFermion : public WilsonFermion<Impl>
{
public:
// Types definitions
INHERIT_IMPL_TYPES(Impl);
template <typename vtype>
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteCloverType;
typedef Lattice<SiteCloverType> CloverFieldType;
public:
typedef WilsonFermion<Impl> WilsonBase;
virtual void Instantiatable(void){};
// Constructors
WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
Fgrid,
Hgrid,
_mass, impl_p, clover_anisotropy),
CloverTerm(&Fgrid),
CloverTermInv(&Fgrid),
CloverTermEven(&Hgrid),
CloverTermOdd(&Hgrid),
CloverTermInvEven(&Hgrid),
CloverTermInvOdd(&Hgrid),
CloverTermDagEven(&Hgrid),
CloverTermDagOdd(&Hgrid),
CloverTermInvDagEven(&Hgrid),
CloverTermInvDagOdd(&Hgrid)
{
assert(Nd == 4); // require 4 dimensions
if (clover_anisotropy.isAnisotropic)
{
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
}
else
{
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if (csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if (csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
virtual RealD M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out);
virtual void Mooee(const FermionField &in, FermionField &out);
virtual void MooeeDag(const FermionField &in, FermionField &out);
virtual void MooeeInv(const FermionField &in, FermionField &out);
virtual void MooeeInvDag(const FermionField &in, FermionField &out);
virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
//virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void ImportGauge(const GaugeField &_Umu);
// Derivative parts unpreconditioned pseudofermions
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X._grid, Y._grid);
conformable(X._grid, force._grid);
GaugeLinkField force_mu(force._grid), lambda(force._grid);
GaugeField clover_force(force._grid);
PropagatorField Lambda(force._grid);
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
Impl::extractLinkField(U, this->Umu);
force = zero;
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = zero;
for (int mu = 0; mu < 4; mu++)
{
force_mu = zero;
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda._grid, U[0]._grid);
GaugeLinkField out(lambda._grid), tmp(lambda._grid);
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
private:
// here fixing the 4 dimensions, make it more general?
RealD csw_r; // Clover coefficient - spatial
RealD csw_t; // Clover coefficient - temporal
RealD diag_mass; // Mass term
CloverFieldType CloverTerm, CloverTermInv; // Clover term
CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = -F._odata[i]()();
T._odata[i]()(1, 0) = F._odata[i]()();
T._odata[i]()(2, 3) = -F._odata[i]()();
T._odata[i]()(3, 2) = F._odata[i]()();
}
return T;
}
CloverFieldType fillCloverXY(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
T._odata[i]()(1, 1) = timesI(F._odata[i]()());
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverXT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = timesI(F._odata[i]()());
T._odata[i]()(1, 0) = timesI(F._odata[i]()());
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverYT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = -(F._odata[i]()());
T._odata[i]()(1, 0) = (F._odata[i]()());
T._odata[i]()(2, 3) = (F._odata[i]()());
T._odata[i]()(3, 2) = -(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverZT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 0) = timesI(F._odata[i]()());
T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
}
return T;
}
};
}
}
#endif // GRID_QCD_WILSON_CLOVER_FERMION_H

View File

@@ -69,39 +69,47 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
projector::Proj(buf[o],in,mu,dag);
inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
SiteHalfSpinor tmp;
projector::Proj(tmp,in,mu,dag);
vstream(buf[o],tmp);
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
/*****************************************************/
inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1,
inline void Exchange(SiteHalfSpinor * __restrict__ mp,
const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor * __restrict__ vp1,
Integer type,Integer o){
exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0[o],vp1[o],type);
vstream(mp[2*o ],tmp1);
vstream(mp[2*o+1],tmp2);
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
inline void Decompress(SiteHalfSpinor *out,
SiteHalfSpinor *in, Integer o) {
inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor * __restrict__ in, Integer o) {
assert(0);
}
/*****************************************************/
/* Compress Exchange */
/*****************************************************/
inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1,
const SiteSpinor *in,
inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor * __restrict__ out1,
const SiteSpinor * __restrict__ in,
Integer j,Integer k, Integer m,Integer type){
SiteHalfSpinor temp1, temp2,temp3,temp4;
projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in[m],mu,dag);
exchange(out0[j],out1[j],temp1,temp2,type);
exchange(temp3,temp4,temp1,temp2,type);
vstream(out0[j],temp3);
vstream(out1[j],temp4);
}
/*****************************************************/
@@ -265,7 +273,6 @@ public:
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
}
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
WilsonStencil(GridBase *grid,
int npoints,

View File

@@ -47,7 +47,8 @@ int WilsonFermionStatic::HandOptDslash;
template <class Impl>
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p)
const ImplParams &p,
const WilsonAnisotropyCoefficients &anis)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
@@ -60,16 +61,41 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid)
_tmp(&Hgrid),
anisotropyCoeff(anis)
{
// Allocate the required comms buffer
ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
} else {
diag_mass = 4.0 + mass;
}
}
template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
GaugeField HUmu(_Umu._grid);
HUmu = _Umu * (-0.5);
//Here multiply the anisotropy coefficients
if (anisotropyCoeff.isAnisotropic)
{
for (int mu = 0; mu < Nd; mu++)
{
GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
if (mu != anisotropyCoeff.t_direction)
U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
}
}
else
{
HUmu = _Umu * (-0.5);
}
Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd, Umu);
@@ -83,14 +109,14 @@ template <class Impl>
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerNo);
return axpy_norm(out, 4 + mass, in, out);
return axpy_norm(out, diag_mass, in, out);
}
template <class Impl>
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerYes);
return axpy_norm(out, 4 + mass, in, out);
return axpy_norm(out, diag_mass, in, out);
}
template <class Impl>
@@ -114,7 +140,7 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
template <class Impl>
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
typename FermionField::scalar_type scal(4.0 + mass);
typename FermionField::scalar_type scal(diag_mass);
out = scal * in;
}
@@ -127,7 +153,7 @@ void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
template<class Impl>
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
out = (1.0/(4.0+mass))*in;
out = (1.0/(diag_mass))*in;
}
template<class Impl>
@@ -204,7 +230,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
Atilde = A;
Atilde = A;//redundant
st.HaloExchange(B, compressor);
@@ -429,6 +455,112 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
};
/*Change ends */
/*******************************************************************************
* Conserved current utilities for Wilson fermions, for contracting propagators
* to make a conserved current sink or inserting the conserved current
* sequentially.
******************************************************************************/
template <class Impl>
void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
Gamma g5(Gamma::Algebra::Gamma5);
conformable(_grid, q_in_1._grid);
conformable(_grid, q_in_2._grid);
conformable(_grid, q_out._grid);
PropagatorField tmp1(_grid), tmp2(_grid);
q_out = zero;
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
// Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
{
Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
q_in_2._odata[sU],
q_out._odata[sU],
Umu, sU, mu);
Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
tmp2._odata[sU],
q_out._odata[sU],
Umu, sU, mu);
}
}
template <class Impl>
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax)
{
conformable(_grid, q_in._grid);
conformable(_grid, q_out._grid);
Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
ComplexD i(0.0,1.0);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
// Momentum projection
ph = zero;
for(unsigned int mu = 0; mu < Nd - 1; mu++)
{
LatticeCoordinate(coor, mu);
ph = ph + mom[mu]*coor*((1./(_grid->_fdimensions[mu])));
}
ph = exp((RealD)(2*M_PI)*i*ph);
q_out = zero;
LatticeInteger coords(_grid);
LatticeCoordinate(coords, Tp);
// Need q(x + mu) and q(x - mu).
tmp = Cshift(q_in, mu, 1);
tmpFwd = tmp*ph;
tmp = ph*q_in;
tmpBwd = Cshift(tmp, mu, -1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
{
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vInteger t_mask = ((coords._odata[sU] >= tmin) &&
(coords._odata[sU] <= tmax));
Integer timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{
Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU],
q_out._odata[sU],
Umu, sU, mu, t_mask);
}
// Repeat for backward direction.
t_mask = ((coords._odata[sU] >= (tmin + tshift)) &&
(coords._odata[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{
Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU],
q_out._odata[sU],
Umu, sU, mu, t_mask);
}
}
}
FermOpTemplateInstantiate(WilsonFermion);
AdjointFermOpTemplateInstantiate(WilsonFermion);
TwoIndexFermOpTemplateInstantiate(WilsonFermion);

View File

@@ -44,6 +44,21 @@ class WilsonFermionStatic {
static const int npoint = 8;
};
struct WilsonAnisotropyCoefficients: Serializable
{
GRID_SERIALIZABLE_CLASS_MEMBERS(WilsonAnisotropyCoefficients,
bool, isAnisotropic,
int, t_direction,
double, xi_0,
double, nu);
WilsonAnisotropyCoefficients():
isAnisotropic(false),
t_direction(Nd-1),
xi_0(1.0),
nu(1.0){}
};
template <class Impl>
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
public:
@@ -65,8 +80,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out);
virtual RealD M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
@@ -123,8 +138,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
// Constructor
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p = ImplParams());
GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p = ImplParams(),
const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
// DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu);
@@ -138,6 +154,7 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
virtual RealD Mass(void) { return mass; }
virtual int isTrivialEE(void) { return 1; };
RealD mass;
RealD diag_mass;
GridBase *_grid;
GridBase *_cbgrid;
@@ -154,6 +171,24 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
WilsonAnisotropyCoefficients anisotropyCoeff;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax);
};
typedef WilsonFermion<WilsonImplF> WilsonFermionF;

View File

@@ -12,6 +12,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -701,6 +702,168 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
}
/*******************************************************************************
* Conserved current utilities for Wilson fermions, for contracting propagators
* to make a conserved current sink or inserting the conserved current
* sequentially.
******************************************************************************/
// Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
{ \
std::vector<typename SitePropagator::scalar_object> qSiteVec(Nsimd); \
extract(qSite, qSiteVec); \
for (int i = 0; i < Nsimd / 2; ++i) \
{ \
typename SitePropagator::scalar_object tmp = qSiteVec[i]; \
qSiteVec[i] = qSiteVec[Nsimd - i - 1]; \
qSiteVec[Nsimd - i - 1] = tmp; \
} \
merge(qSiteRev, qSiteVec); \
}
template <class Impl>
void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
conformable(q_in_1._grid, FermionGrid());
conformable(q_in_1._grid, q_in_2._grid);
conformable(_FourDimGrid, q_out._grid);
PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
unsigned int LLs = q_in_1._grid->_rdimensions[0];
q_out = zero;
// Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s),
// q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
tmp1 = Cshift(q_in_1, mu + 1, 1);
tmp2 = Cshift(q_in_2, mu + 1, 1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
{
unsigned int sF1 = sU * LLs;
unsigned int sF2 = (sU + 1) * LLs - 1;
for (unsigned int s = 0; s < LLs; ++s)
{
bool axial_sign = ((curr_type == Current::Axial) && \
(s < (LLs / 2)));
SitePropagator qSite2, qmuSite2;
// If vectorised in 5th dimension, reverse q2 vector to match up
// sites correctly.
if (Impl::LsVectorised)
{
REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
}
else
{
qSite2 = q_in_2._odata[sF2];
qmuSite2 = tmp2._odata[sF2];
}
Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1],
qSite2,
q_out._odata[sU],
Umu, sU, mu, axial_sign);
Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
qmuSite2,
q_out._odata[sU],
Umu, sU, mu, axial_sign);
sF1++;
sF2--;
}
}
}
template <class Impl>
void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax)
{
conformable(q_in._grid, FermionGrid());
conformable(q_in._grid, q_out._grid);
Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
tmp(FermionGrid());
ComplexD i(0.0, 1.0);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLs = q_in._grid->_rdimensions[0];
unsigned int LLt = GridDefaultLatt()[Tp];
// Momentum projection.
ph = zero;
for(unsigned int nu = 0; nu < Nd - 1; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + 1);
ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
}
ph = exp((RealD)(2*M_PI)*i*ph);
q_out = zero;
LatticeInteger coords(_FourDimGrid);
LatticeCoordinate(coords, Tp);
// Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
// by one.
tmp = Cshift(q_in, mu + 1, 1);
tmpFwd = tmp*ph;
tmp = ph*q_in;
tmpBwd = Cshift(tmp, mu + 1, -1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
{
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vInteger t_mask = ((coords._odata[sU] >= tmin) &&
(coords._odata[sU] <= tmax));
Integer timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{
unsigned int sF = sU * LLs;
for (unsigned int s = 0; s < LLs; ++s)
{
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sF],
q_out._odata[sF], Umu, sU,
mu, t_mask, axial_sign);
++sF;
}
}
// Repeat for backward direction.
t_mask = ((coords._odata[sU] >= (tmin + tshift)) &&
(coords._odata[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{
unsigned int sF = sU * LLs;
for (unsigned int s = 0; s < LLs; ++s)
{
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sF],
q_out._odata[sF], Umu, sU,
mu, t_mask, axial_sign);
++sF;
}
}
}
}
FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);

View File

@@ -214,6 +214,21 @@ namespace QCD {
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax);
};
}}

View File

@@ -281,6 +281,172 @@ void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal
vstream(out._odata[sF], result);
}
/*******************************************************************************
* Conserved current utilities for Wilson fermions, for contracting propagators
* to make a conserved current sink or inserting the conserved current
* sequentially. Common to both 4D and 5D.
******************************************************************************/
// N.B. Functions below assume a -1/2 factor within U.
#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
/*******************************************************************************
* Name: ContractConservedCurrentSiteFwd
* Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in_1 shifted in +ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign)
{
SitePropagator result, tmp;
Gamma g5(Gamma::Algebra::Gamma5);
Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
if (switch_sign)
{
q_out -= result;
}
else
{
q_out += result;
}
}
/*******************************************************************************
* Name: ContractConservedCurrentSiteBwd
* Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in_2 shifted in +ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign)
{
SitePropagator result, tmp;
Gamma g5(Gamma::Algebra::Gamma5);
Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
if (switch_sign)
{
q_out += result;
}
else
{
q_out -= result;
}
}
// G-parity requires more specialised implementation.
#define NO_CURR_SITE(Impl) \
template <> \
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
const SitePropagator &q_in_1, \
const SitePropagator &q_in_2, \
SitePropagator &q_out, \
DoubledGaugeField &U, \
unsigned int sU, \
unsigned int mu, \
bool switch_sign) \
{ \
assert(0); \
} \
template <> \
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
const SitePropagator &q_in_1, \
const SitePropagator &q_in_2, \
SitePropagator &q_out, \
DoubledGaugeField &U, \
unsigned int mu, \
unsigned int sU, \
bool switch_sign) \
{ \
assert(0); \
}
NO_CURR_SITE(GparityWilsonImplF);
NO_CURR_SITE(GparityWilsonImplD);
NO_CURR_SITE(GparityWilsonImplFH);
NO_CURR_SITE(GparityWilsonImplDF);
/*******************************************************************************
* Name: SeqConservedCurrentSiteFwd
* Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in shifted in +ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign)
{
SitePropagator result;
Impl::multLinkProp(result, U._odata[sU], q_in, mu);
result = WilsonCurrentFwd(result, mu);
// Zero any unwanted timeslice entries.
result = predicatedWhere(t_mask, result, 0.*result);
if (switch_sign)
{
q_out -= result;
}
else
{
q_out += result;
}
}
/*******************************************************************************
* Name: SeqConservedCurrentSiteFwd
* Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in shifted in -ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign)
{
SitePropagator result;
Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
result = WilsonCurrentBwd(result, mu);
// Zero any unwanted timeslice entries.
result = predicatedWhere(t_mask, result, 0.*result);
if (switch_sign)
{
q_out += result;
}
else
{
q_out -= result;
}
}
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);

View File

@@ -55,7 +55,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
public:
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{
@@ -99,7 +99,7 @@ public:
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
// no kernel choice
@@ -116,7 +116,7 @@ public:
}
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{
@@ -161,7 +161,7 @@ public:
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
@@ -180,6 +180,38 @@ public:
void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
//////////////////////////////////////////////////////////////////////////////
// Utilities for inserting Wilson conserved current.
//////////////////////////////////////////////////////////////////////////////
void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign = false);
void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign = false);
void SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign = false);
void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign = false);
private:
// Specialised variants
void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,

View File

@@ -30,181 +30,60 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER
#define LOAD_CHIMU_BODY(F) \
Chimu_00=ref(F)(0)(0); \
Chimu_01=ref(F)(0)(1); \
Chimu_02=ref(F)(0)(2); \
Chimu_10=ref(F)(1)(0); \
Chimu_11=ref(F)(1)(1); \
Chimu_12=ref(F)(1)(2); \
Chimu_20=ref(F)(2)(0); \
Chimu_21=ref(F)(2)(1); \
Chimu_22=ref(F)(2)(2); \
Chimu_30=ref(F)(3)(0); \
Chimu_31=ref(F)(3)(1); \
Chimu_32=ref(F)(3)(2)
#define LOAD_CHIMU \
{const SiteSpinor & ref (in._odata[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
Chimu_02=ref()(0)(2);\
Chimu_10=ref()(1)(0);\
Chimu_11=ref()(1)(1);\
Chimu_12=ref()(1)(2);\
Chimu_20=ref()(2)(0);\
Chimu_21=ref()(2)(1);\
Chimu_22=ref()(2)(2);\
Chimu_30=ref()(3)(0);\
Chimu_31=ref()(3)(1);\
Chimu_32=ref()(3)(2);}
#define LOAD_CHIMU(DIR,F,PERM) \
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
#define LOAD_CHI_BODY(F) \
Chi_00 = ref(F)(0)(0);\
Chi_01 = ref(F)(0)(1);\
Chi_02 = ref(F)(0)(2);\
Chi_10 = ref(F)(1)(0);\
Chi_11 = ref(F)(1)(1);\
Chi_12 = ref(F)(1)(2)
#define LOAD_CHI(DIR,F,PERM) \
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
//G-parity implementations using in-place intrinsic ops
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
//0h,1l -> 1l,0h
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
//Pulled fermion through forwards face, GPBC on upper component
//Need 0= 0l 1h 1= 1l 0h
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
//Pulled fermion through backwards face, GPBC on lower component
//Need 0= 1l 0h 1= 0l 1h
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(1)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
INTO = tmp2;
//0l 0h -> 0h 0l
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(0)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
INTO = tmp2;
#define LOAD_CHI_SETUP(DIR,F) \
g = F; \
direction = st._directions[DIR]; \
distance = st._distances[DIR]; \
sl = st._grid->_simd_layout[direction]; \
inplace_twist = 0; \
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
if(sl == 1){ \
g = (F+1) % 2; \
}else{ \
inplace_twist = 1; \
} \
}
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteSpinor &ref(in._odata[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHIMU_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteHalfSpinor &ref(buf[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHI_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
#define LOAD_CHI\
{const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = ref()(0)(0);\
Chi_01 = ref()(0)(1);\
Chi_02 = ref()(0)(2);\
Chi_10 = ref()(1)(0);\
Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);}
// To splat or not to splat depends on the implementation
#define MULT_2SPIN_BODY \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00; \
UChi_10 = U_00*Chi_10; \
UChi_01 = U_10*Chi_00; \
UChi_11 = U_10*Chi_10; \
UChi_02 = U_20*Chi_00; \
UChi_12 = U_20*Chi_10; \
UChi_00+= U_01*Chi_01; \
UChi_10+= U_01*Chi_11; \
UChi_01+= U_11*Chi_01; \
UChi_11+= U_11*Chi_11; \
UChi_02+= U_21*Chi_01; \
UChi_12+= U_21*Chi_11; \
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02; \
UChi_10+= U_00*Chi_12; \
UChi_01+= U_10*Chi_02; \
UChi_11+= U_10*Chi_12; \
UChi_02+= U_20*Chi_02; \
UChi_12+= U_20*Chi_12
#define MULT_2SPIN(A,F) \
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
#define MULT_2SPIN_GPARITY(A,F) \
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
#define MULT_2SPIN(A)\
{auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00;\
UChi_11 = U_10*Chi_10;\
UChi_02 = U_20*Chi_00;\
UChi_12 = U_20*Chi_10;\
UChi_00+= U_01*Chi_01;\
UChi_10+= U_01*Chi_11;\
UChi_01+= U_11*Chi_01;\
UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11;\
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02;\
UChi_11+= U_10*Chi_12;\
UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;}
#define PERMUTE_DIR(dir) \
@@ -428,87 +307,84 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_31-= UChi_11; \
result_32-= UChi_12;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI_IMPL(DIR,F,PERM); \
LOAD_CHI; \
} \
MULT_2SPIN_IMPL(DIR,F); \
MULT_2SPIN(DIR); \
RECON;
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
LOAD_CHI; \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN_IMPL(DIR,F); \
MULT_2SPIN(DIR); \
RECON; \
}
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
MULT_2SPIN_IMPL(DIR,F); \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
}
#define HAND_RESULT(ss,F) \
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out._odata[ss]); \
vstream(ref(F)(0)(0),result_00); \
vstream(ref(F)(0)(1),result_01); \
vstream(ref(F)(0)(2),result_02); \
vstream(ref(F)(1)(0),result_10); \
vstream(ref(F)(1)(1),result_11); \
vstream(ref(F)(1)(2),result_12); \
vstream(ref(F)(2)(0),result_20); \
vstream(ref(F)(2)(1),result_21); \
vstream(ref(F)(2)(2),result_22); \
vstream(ref(F)(3)(0),result_30); \
vstream(ref(F)(3)(1),result_31); \
vstream(ref(F)(3)(2),result_32); \
vstream(ref()(0)(0),result_00); \
vstream(ref()(0)(1),result_01); \
vstream(ref()(0)(2),result_02); \
vstream(ref()(1)(0),result_10); \
vstream(ref()(1)(1),result_11); \
vstream(ref()(1)(2),result_12); \
vstream(ref()(2)(0),result_20); \
vstream(ref()(2)(1),result_21); \
vstream(ref()(2)(2),result_22); \
vstream(ref()(3)(0),result_30); \
vstream(ref()(3)(1),result_31); \
vstream(ref()(3)(2),result_32); \
}
#define HAND_RESULT_EXT(ss,F) \
#define HAND_RESULT_EXT(ss) \
if (nmu){ \
SiteSpinor & ref (out._odata[ss]); \
ref(F)(0)(0)+=result_00; \
ref(F)(0)(1)+=result_01; \
ref(F)(0)(2)+=result_02; \
ref(F)(1)(0)+=result_10; \
ref(F)(1)(1)+=result_11; \
ref(F)(1)(2)+=result_12; \
ref(F)(2)(0)+=result_20; \
ref(F)(2)(1)+=result_21; \
ref(F)(2)(2)+=result_22; \
ref(F)(3)(0)+=result_30; \
ref(F)(3)(1)+=result_31; \
ref(F)(3)(2)+=result_32; \
ref()(0)(0)+=result_00; \
ref()(0)(1)+=result_01; \
ref()(0)(2)+=result_02; \
ref()(1)(0)+=result_10; \
ref()(1)(1)+=result_11; \
ref()(1)(2)+=result_12; \
ref()(2)(0)+=result_20; \
ref()(2)(1)+=result_21; \
ref()(2)(2)+=result_22; \
ref()(3)(0)+=result_30; \
ref()(3)(1)+=result_31; \
ref()(3)(2)+=result_32; \
}
@@ -587,18 +463,15 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl>
@@ -612,19 +485,16 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
@@ -639,20 +509,16 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl>
@@ -666,20 +532,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
@@ -695,20 +557,16 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
int offset,local,perm, ptype;
StencilEntry *SE;
int nmu=0;
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
template<class Impl>
@@ -723,193 +581,18 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
StencilEntry *SE;
int offset,local,perm, ptype;
int nmu=0;
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
////////////////////////////////////////////////
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
#define HAND_SPECIALISE_EMPTY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
int nmu=0; \
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
int nmu=0; \
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
}
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
////////////// Wilson ; uses this implementation /////////////////////
#define INSTANTIATE_THEM(A) \
@@ -930,8 +613,6 @@ INSTANTIATE_THEM(WilsonImplF);
INSTANTIATE_THEM(WilsonImplD);
INSTANTIATE_THEM(ZWilsonImplF);
INSTANTIATE_THEM(ZWilsonImplD);
INSTANTIATE_THEM(GparityWilsonImplF);
INSTANTIATE_THEM(GparityWilsonImplD);
INSTANTIATE_THEM(DomainWallVec5dImplF);
INSTANTIATE_THEM(DomainWallVec5dImplD);
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
@@ -940,11 +621,11 @@ INSTANTIATE_THEM(WilsonImplFH);
INSTANTIATE_THEM(WilsonImplDF);
INSTANTIATE_THEM(ZWilsonImplFH);
INSTANTIATE_THEM(ZWilsonImplDF);
INSTANTIATE_THEM(GparityWilsonImplFH);
INSTANTIATE_THEM(GparityWilsonImplDF);
INSTANTIATE_THEM(DomainWallVec5dImplFH);
INSTANTIATE_THEM(DomainWallVec5dImplDF);
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
}}

View File

@@ -0,0 +1,878 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#define REGISTER
#define LOAD_CHIMU_BODY(F) \
Chimu_00=ref(F)(0)(0); \
Chimu_01=ref(F)(0)(1); \
Chimu_02=ref(F)(0)(2); \
Chimu_10=ref(F)(1)(0); \
Chimu_11=ref(F)(1)(1); \
Chimu_12=ref(F)(1)(2); \
Chimu_20=ref(F)(2)(0); \
Chimu_21=ref(F)(2)(1); \
Chimu_22=ref(F)(2)(2); \
Chimu_30=ref(F)(3)(0); \
Chimu_31=ref(F)(3)(1); \
Chimu_32=ref(F)(3)(2)
#define LOAD_CHIMU(DIR,F,PERM) \
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
#define LOAD_CHI_BODY(F) \
Chi_00 = ref(F)(0)(0);\
Chi_01 = ref(F)(0)(1);\
Chi_02 = ref(F)(0)(2);\
Chi_10 = ref(F)(1)(0);\
Chi_11 = ref(F)(1)(1);\
Chi_12 = ref(F)(1)(2)
#define LOAD_CHI(DIR,F,PERM) \
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
//G-parity implementations using in-place intrinsic ops
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
//0h,1l -> 1l,0h
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
//Pulled fermion through forwards face, GPBC on upper component
//Need 0= 0l 1h 1= 1l 0h
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
//Pulled fermion through backwards face, GPBC on lower component
//Need 0= 1l 0h 1= 0l 1h
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(1)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
INTO = tmp2;
//0l 0h -> 0h 0l
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(0)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
INTO = tmp2;
#define LOAD_CHI_SETUP(DIR,F) \
g = F; \
direction = st._directions[DIR]; \
distance = st._distances[DIR]; \
sl = st._grid->_simd_layout[direction]; \
inplace_twist = 0; \
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
if(sl == 1){ \
g = (F+1) % 2; \
}else{ \
inplace_twist = 1; \
} \
}
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteSpinor &ref(in._odata[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHIMU_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteHalfSpinor &ref(buf[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHI_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
// To splat or not to splat depends on the implementation
#define MULT_2SPIN_BODY \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00; \
UChi_10 = U_00*Chi_10; \
UChi_01 = U_10*Chi_00; \
UChi_11 = U_10*Chi_10; \
UChi_02 = U_20*Chi_00; \
UChi_12 = U_20*Chi_10; \
UChi_00+= U_01*Chi_01; \
UChi_10+= U_01*Chi_11; \
UChi_01+= U_11*Chi_01; \
UChi_11+= U_11*Chi_11; \
UChi_02+= U_21*Chi_01; \
UChi_12+= U_21*Chi_11; \
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02; \
UChi_10+= U_00*Chi_12; \
UChi_01+= U_10*Chi_02; \
UChi_11+= U_10*Chi_12; \
UChi_02+= U_20*Chi_02; \
UChi_12+= U_20*Chi_12
#define MULT_2SPIN(A,F) \
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
#define MULT_2SPIN_GPARITY(A,F) \
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10);\
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_30);\
Chi_01 = Chimu_01+timesI(Chimu_31);\
Chi_02 = Chimu_02+timesI(Chimu_32);\
Chi_10 = Chimu_10+timesI(Chimu_20);\
Chi_11 = Chimu_11+timesI(Chimu_21);\
Chi_12 = Chimu_12+timesI(Chimu_22);
#define YP_PROJ \
Chi_00 = Chimu_00-Chimu_30;\
Chi_01 = Chimu_01-Chimu_31;\
Chi_02 = Chimu_02-Chimu_32;\
Chi_10 = Chimu_10+Chimu_20;\
Chi_11 = Chimu_11+Chimu_21;\
Chi_12 = Chimu_12+Chimu_22;
#define ZP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_20); \
Chi_01 = Chimu_01+timesI(Chimu_21); \
Chi_02 = Chimu_02+timesI(Chimu_22); \
Chi_10 = Chimu_10-timesI(Chimu_30); \
Chi_11 = Chimu_11-timesI(Chimu_31); \
Chi_12 = Chimu_12-timesI(Chimu_32);
#define TP_PROJ \
Chi_00 = Chimu_00+Chimu_20; \
Chi_01 = Chimu_01+Chimu_21; \
Chi_02 = Chimu_02+Chimu_22; \
Chi_10 = Chimu_10+Chimu_30; \
Chi_11 = Chimu_11+Chimu_31; \
Chi_12 = Chimu_12+Chimu_32;
// hspin(0)=fspin(0)-timesI(fspin(3));
// hspin(1)=fspin(1)-timesI(fspin(2));
#define XM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_30);\
Chi_01 = Chimu_01-timesI(Chimu_31);\
Chi_02 = Chimu_02-timesI(Chimu_32);\
Chi_10 = Chimu_10-timesI(Chimu_20);\
Chi_11 = Chimu_11-timesI(Chimu_21);\
Chi_12 = Chimu_12-timesI(Chimu_22);
#define YM_PROJ \
Chi_00 = Chimu_00+Chimu_30;\
Chi_01 = Chimu_01+Chimu_31;\
Chi_02 = Chimu_02+Chimu_32;\
Chi_10 = Chimu_10-Chimu_20;\
Chi_11 = Chimu_11-Chimu_21;\
Chi_12 = Chimu_12-Chimu_22;
#define ZM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_20); \
Chi_01 = Chimu_01-timesI(Chimu_21); \
Chi_02 = Chimu_02-timesI(Chimu_22); \
Chi_10 = Chimu_10+timesI(Chimu_30); \
Chi_11 = Chimu_11+timesI(Chimu_31); \
Chi_12 = Chimu_12+timesI(Chimu_32);
#define TM_PROJ \
Chi_00 = Chimu_00-Chimu_20; \
Chi_01 = Chimu_01-Chimu_21; \
Chi_02 = Chimu_02-Chimu_22; \
Chi_10 = Chimu_10-Chimu_30; \
Chi_11 = Chimu_11-Chimu_31; \
Chi_12 = Chimu_12-Chimu_32;
// fspin(0)=hspin(0);
// fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1));
// fspin(3)=timesMinusI(hspin(0));
#define XP_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesMinusI(UChi_10);\
result_21 = timesMinusI(UChi_11);\
result_22 = timesMinusI(UChi_12);\
result_30 = timesMinusI(UChi_00);\
result_31 = timesMinusI(UChi_01);\
result_32 = timesMinusI(UChi_02);
#define XP_RECON_ACCUM\
result_00+=UChi_00;\
result_01+=UChi_01;\
result_02+=UChi_02;\
result_10+=UChi_10;\
result_11+=UChi_11;\
result_12+=UChi_12;\
result_20-=timesI(UChi_10);\
result_21-=timesI(UChi_11);\
result_22-=timesI(UChi_12);\
result_30-=timesI(UChi_00);\
result_31-=timesI(UChi_01);\
result_32-=timesI(UChi_02);
#define XM_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesI(UChi_10);\
result_21 = timesI(UChi_11);\
result_22 = timesI(UChi_12);\
result_30 = timesI(UChi_00);\
result_31 = timesI(UChi_01);\
result_32 = timesI(UChi_02);
#define XM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_10);\
result_21+= timesI(UChi_11);\
result_22+= timesI(UChi_12);\
result_30+= timesI(UChi_00);\
result_31+= timesI(UChi_01);\
result_32+= timesI(UChi_02);
#define YP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_10;\
result_21+= UChi_11;\
result_22+= UChi_12;\
result_30-= UChi_00;\
result_31-= UChi_01;\
result_32-= UChi_02;
#define YM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_10;\
result_21-= UChi_11;\
result_22-= UChi_12;\
result_30+= UChi_00;\
result_31+= UChi_01;\
result_32+= UChi_02;
#define ZP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= timesI(UChi_00); \
result_21-= timesI(UChi_01); \
result_22-= timesI(UChi_02); \
result_30+= timesI(UChi_10); \
result_31+= timesI(UChi_11); \
result_32+= timesI(UChi_12);
#define ZM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_00); \
result_21+= timesI(UChi_01); \
result_22+= timesI(UChi_02); \
result_30-= timesI(UChi_10); \
result_31-= timesI(UChi_11); \
result_32-= timesI(UChi_12);
#define TP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_00; \
result_21+= UChi_01; \
result_22+= UChi_02; \
result_30+= UChi_10; \
result_31+= UChi_11; \
result_32+= UChi_12;
#define TM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_00; \
result_21-= UChi_01; \
result_22-= UChi_02; \
result_30-= UChi_10; \
result_31-= UChi_11; \
result_32-= UChi_12;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
MULT_2SPIN_IMPL(DIR,F); \
RECON;
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
}
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
nmu++; \
}
#define HAND_RESULT(ss,F) \
{ \
SiteSpinor & ref (out._odata[ss]); \
vstream(ref(F)(0)(0),result_00); \
vstream(ref(F)(0)(1),result_01); \
vstream(ref(F)(0)(2),result_02); \
vstream(ref(F)(1)(0),result_10); \
vstream(ref(F)(1)(1),result_11); \
vstream(ref(F)(1)(2),result_12); \
vstream(ref(F)(2)(0),result_20); \
vstream(ref(F)(2)(1),result_21); \
vstream(ref(F)(2)(2),result_22); \
vstream(ref(F)(3)(0),result_30); \
vstream(ref(F)(3)(1),result_31); \
vstream(ref(F)(3)(2),result_32); \
}
#define HAND_RESULT_EXT(ss,F) \
if (nmu){ \
SiteSpinor & ref (out._odata[ss]); \
ref(F)(0)(0)+=result_00; \
ref(F)(0)(1)+=result_01; \
ref(F)(0)(2)+=result_02; \
ref(F)(1)(0)+=result_10; \
ref(F)(1)(1)+=result_11; \
ref(F)(1)(2)+=result_12; \
ref(F)(2)(0)+=result_20; \
ref(F)(2)(1)+=result_21; \
ref(F)(2)(2)+=result_22; \
ref(F)(3)(0)+=result_30; \
ref(F)(3)(1)+=result_31; \
ref(F)(3)(2)+=result_32; \
}
#define HAND_DECLARATIONS(a) \
Simd result_00; \
Simd result_01; \
Simd result_02; \
Simd result_10; \
Simd result_11; \
Simd result_12; \
Simd result_20; \
Simd result_21; \
Simd result_22; \
Simd result_30; \
Simd result_31; \
Simd result_32; \
Simd Chi_00; \
Simd Chi_01; \
Simd Chi_02; \
Simd Chi_10; \
Simd Chi_11; \
Simd Chi_12; \
Simd UChi_00; \
Simd UChi_01; \
Simd UChi_02; \
Simd UChi_10; \
Simd UChi_11; \
Simd UChi_12; \
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21;
#define ZERO_RESULT \
result_00=zero; \
result_01=zero; \
result_02=zero; \
result_10=zero; \
result_11=zero; \
result_12=zero; \
result_20=zero; \
result_21=zero; \
result_22=zero; \
result_30=zero; \
result_31=zero; \
result_32=zero;
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
namespace Grid {
namespace QCD {
template<class Impl> void
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
int nmu=0;
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
int nmu=0;
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
int nmu=0; \
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
int nmu=0; \
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
}
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
////////////// Wilson ; uses this implementation /////////////////////
#define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);
INSTANTIATE_THEM(GparityWilsonImplF);
INSTANTIATE_THEM(GparityWilsonImplD);
INSTANTIATE_THEM(GparityWilsonImplFH);
INSTANTIATE_THEM(GparityWilsonImplDF);
}}

View File

@@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
RealD factor = 0.5 * beta / RealD(Nc);
//GaugeLinkField Umu(U._grid);
GaugeLinkField Umu(U._grid);
GaugeLinkField dSdU_mu(U._grid);
for (int mu = 0; mu < Nd; mu++) {
//Umu = PeekIndex<LorentzIndex>(U, mu);
Umu = PeekIndex<LorentzIndex>(U, mu);
// Staple in direction mu
//WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
//dSdU_mu = Ta(Umu * dSdU_mu) * factor;
WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
dSdU_mu = Ta(dSdU_mu) * factor;
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

View File

@@ -16,12 +16,12 @@ class ScalarImplTypes {
typedef iImplField<Simd> SiteField;
typedef SiteField SitePropagator;
typedef SiteField SiteComplex;
typedef Lattice<SiteField> Field;
typedef Field ComplexField;
typedef Field FermionField;
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
gaussian(pRNG, P);
}
@@ -47,54 +47,60 @@ class ScalarImplTypes {
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
U = 1.0;
}
static void MomentumSpacePropagator(Field &out, RealD m)
{
GridBase *grid = out._grid;
Field kmu(grid), one(grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> &l = grid->_fdimensions;
one = Complex(1.0,0.0);
out = m*m;
for(int mu = 0; mu < nd; mu++)
{
Real twoPiL = M_PI*2./l[mu];
LatticeCoordinate(kmu,mu);
kmu = 2.*sin(.5*twoPiL*kmu);
out = out + kmu*kmu;
}
out = one/out;
}
static void FreePropagator(const Field &in, Field &out,
const Field &momKernel)
{
FFT fft((GridCartesian *)in._grid);
Field inFT(in._grid);
fft.FFT_all_dim(inFT, in, FFT::forward);
inFT = inFT*momKernel;
fft.FFT_all_dim(out, inFT, FFT::backward);
}
static void FreePropagator(const Field &in, Field &out, RealD m)
{
Field momKernel(in._grid);
MomentumSpacePropagator(momKernel, m);
FreePropagator(in, out, momKernel);
}
};
#ifdef USE_FFT_ACCELERATION
#ifndef FFT_MASS
#error "USE_FFT_ACCELERATION is defined but not FFT_MASS"
#endif
#endif
template <class S, unsigned int N>
class ScalarAdjMatrixImplTypes {
public:
typedef S Simd;
typedef QCD::SU<N> Group;
template <typename vtype>
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
template <typename vtype>
@@ -103,24 +109,119 @@ class ScalarImplTypes {
typedef iImplField<Simd> SiteField;
typedef SiteField SitePropagator;
typedef iImplComplex<Simd> SiteComplex;
typedef Lattice<SiteField> Field;
typedef Lattice<SiteComplex> ComplexField;
typedef Field FermionField;
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
static void MomentaSquare(ComplexField &out)
{
GridBase *grid = out._grid;
const std::vector<int> &l = grid->FullDimensions();
ComplexField kmu(grid);
for (int mu = 0; mu < grid->Nd(); mu++)
{
Real twoPiL = M_PI * 2.0 / l[mu];
LatticeCoordinate(kmu, mu);
kmu = 2.0 * sin(0.5 * twoPiL * kmu);
out += kmu * kmu;
}
}
static void MomentumSpacePropagator(ComplexField &out, RealD m)
{
GridBase *grid = out._grid;
ComplexField one(grid);
one = Complex(1.0, 0.0);
out = m * m;
MomentaSquare(out);
out = one / out;
}
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
{
#ifndef USE_FFT_ACCELERATION
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
#else
Field Pgaussian(P._grid), Pp(P._grid);
ComplexField p2(P._grid); p2 = zero;
RealD M = FFT_MASS;
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
FFT theFFT((GridCartesian*)P._grid);
theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
MomentaSquare(p2);
p2 += M * M;
p2 = sqrt(p2);
Pp *= p2;
theFFT.FFT_all_dim(P, Pp, FFT::backward);
#endif //USE_FFT_ACCELERATION
}
static inline Field projectForce(Field& P) {return P;}
static inline void update_field(Field& P, Field& U, double ep) {
U += P*ep;
static inline void update_field(Field &P, Field &U, double ep)
{
#ifndef USE_FFT_ACCELERATION
double t0=usecond();
U += P * ep;
double t1=usecond();
double total_time = (t1-t0)/1e6;
std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;
#else
// FFT transform P(x) -> P(p)
// divide by (M^2+p^2) M external parameter (how to pass?)
// P'(p) = P(p)/(M^2+p^2)
// Transform back -> P'(x)
// U += P'(x)*ep
Field Pp(U._grid), P_FFT(U._grid);
static ComplexField p2(U._grid);
RealD M = FFT_MASS;
FFT theFFT((GridCartesian*)U._grid);
theFFT.FFT_all_dim(Pp, P, FFT::forward);
static bool first_call = true;
if (first_call)
{
// avoid recomputing
MomentumSpacePropagator(p2, M);
first_call = false;
}
Pp *= p2;
theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
U += P_FFT * ep;
#endif //USE_FFT_ACCELERATION
}
static inline RealD FieldSquareNorm(Field& U) {
return (TensorRemove(sum(trace(U*U))).real());
static inline RealD FieldSquareNorm(Field &U)
{
#ifndef USE_FFT_ACCELERATION
return (TensorRemove(sum(trace(U * U))).real());
#else
// In case of Fourier acceleration we have to:
// compute U(p)*U(p)/(M^2+p^2)) Parseval theorem
// 1 FFT needed U(x) -> U(p)
// M to be passed
FFT theFFT((GridCartesian*)U._grid);
Field Up(U._grid);
theFFT.FFT_all_dim(Up, U, FFT::forward);
RealD M = FFT_MASS;
ComplexField p2(U._grid);
MomentumSpacePropagator(p2, M);
Field Up2 = Up * p2;
// from the definition of the DFT we need to divide by the volume
return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
#endif //USE_FFT_ACCELERATION
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -146,7 +247,7 @@ class ScalarImplTypes {
typedef ScalarImplTypes<vComplex> ScalarImplCR;
typedef ScalarImplTypes<vComplexF> ScalarImplCF;
typedef ScalarImplTypes<vComplexD> ScalarImplCD;
// Hardcoding here the size of the matrices
typedef ScalarAdjMatrixImplTypes<vComplex, QCD::Nc> ScalarAdjImplR;
typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
@@ -155,7 +256,7 @@ class ScalarImplTypes {
template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours >;
template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours >;
template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours >;
//}
}

View File

@@ -30,119 +30,179 @@ directory
#ifndef SCALAR_INT_ACTION_H
#define SCALAR_INT_ACTION_H
// Note: this action can completely absorb the ScalarAction for real float fields
// use the scalarObjs to generalise the structure
namespace Grid {
// FIXME drop the QCD namespace everywhere here
namespace Grid
{
// FIXME drop the QCD namespace everywhere here
template <class Impl, int Ndim >
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
public:
INHERIT_FIELD_TYPES(Impl);
private:
RealD mass_square;
RealD lambda;
template <class Impl, int Ndim>
class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
{
public:
INHERIT_FIELD_TYPES(Impl);
private:
RealD mass_square;
RealD lambda;
RealD g;
const unsigned int N = Impl::Group::Dimension;
typedef typename Field::vector_object vobj;
typedef CartesianStencil<vobj,vobj> Stencil;
typedef typename Field::vector_object vobj;
typedef CartesianStencil<vobj, vobj> Stencil;
SimpleCompressor<vobj> compressor;
int npoint = 2*Ndim;
std::vector<int> directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions
std::vector<int> displacements;// = {1,1,1,1, -1,-1,-1,-1};
SimpleCompressor<vobj> compressor;
int npoint = 2 * Ndim;
std::vector<int> directions; //
std::vector<int> displacements; //
public:
ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
for (int mu = 0 ; mu < Ndim; mu++){
directions[mu] = mu; directions[mu+Ndim] = mu;
displacements[mu] = 1; displacements[mu+Ndim] = -1;
}
public:
ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), displacements(2 * Ndim, 0), directions(2 * Ndim, 0)
{
for (int mu = 0; mu < Ndim; mu++)
{
directions[mu] = mu;
directions[mu + Ndim] = mu;
displacements[mu] = 1;
displacements[mu + Ndim] = -1;
}
}
virtual std::string LogParameters() {
std::stringstream sstream;
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
return sstream.str();
}
virtual std::string LogParameters()
{
std::stringstream sstream;
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
sstream << GridLogMessage << "[ScalarAction] g : " << g << std::endl;
return sstream.str();
}
virtual std::string action_name() {return "ScalarAction";}
virtual std::string action_name() { return "ScalarAction"; }
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
virtual RealD S(const Field &p) {
assert(p._grid->Nd() == Ndim);
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
phisquared = p*p;
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
for (int mu = 0; mu < Ndim; mu++) {
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
int permute_type;
StencilEntry *SE;
vobj temp2;
const vobj *temp, *t_p;
SE = phiStencil.GetEntry(permute_type, mu, i);
t_p = &p._odata[i];
if ( SE->_is_local ) {
temp = &p._odata[SE->_offset];
if ( SE->_permute ) {
permute(temp2, *temp, permute_type);
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
} else {
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
}
} else {
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
}
}
// action -= pshift*p + p*pshift;
}
// NB the trace in the algebra is normalised to 1/2
// minus sign coming from the antihermitian fields
return -(TensorRemove(sum(trace(action)))).real();
};
virtual void deriv(const Field &p, Field &force) {
assert(p._grid->Nd() == Ndim);
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
// move this outside
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
for (int point = 0; point < npoint; point++) {
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
const vobj *temp;
vobj temp2;
int permute_type;
StencilEntry *SE;
SE = phiStencil.GetEntry(permute_type, point, i);
if ( SE->_is_local ) {
temp = &p._odata[SE->_offset];
if ( SE->_permute ) {
permute(temp2, *temp, permute_type);
force._odata[i] -= temp2;
} else {
force._odata[i] -= *temp;
}
} else {
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
}
}
virtual RealD S(const Field &p)
{
assert(p._grid->Nd() == Ndim);
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
phisquared = p * p;
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
for (int mu = 0; mu < Ndim; mu++)
{
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
parallel_for(int i = 0; i < p._grid->oSites(); i++)
{
int permute_type;
StencilEntry *SE;
vobj temp2;
const vobj *temp, *t_p;
SE = phiStencil.GetEntry(permute_type, mu, i);
t_p = &p._odata[i];
if (SE->_is_local)
{
temp = &p._odata[SE->_offset];
if (SE->_permute)
{
permute(temp2, *temp, permute_type);
action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
}
else
{
action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
}
}
else
{
action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
}
}
// action -= pshift*p + p*pshift;
}
// NB the trace in the algebra is normalised to 1/2
// minus sign coming from the antihermitian fields
return -(TensorRemove(sum(trace(action)))).real() * N / g;
};
} // namespace Grid
#endif // SCALAR_INT_ACTION_H
virtual void deriv(const Field &p, Field &force)
{
double t0 = usecond();
assert(p._grid->Nd() == Ndim);
force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
double interm_t = usecond();
// move this outside
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
double halo_t = usecond();
int chunk = 128;
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
// inverting the order of the loops slows down the code(! g++ 7)
// cannot try to reduce the number of force writes by factor npoint...
// use cache blocking
for (int point = 0; point < npoint; point++)
{
#pragma omp parallel
{
int permute_type;
StencilEntry *SE;
const vobj *temp;
#pragma omp for schedule(static, chunk)
for (int i = 0; i < p._grid->oSites(); i++)
{
SE = phiStencil.GetEntry(permute_type, point, i);
// prefetch next p?
if (SE->_is_local)
{
temp = &p._odata[SE->_offset];
if (SE->_permute)
{
vobj temp2;
permute(temp2, *temp, permute_type);
force._odata[i] -= temp2;
}
else
{
force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
}
}
else
{
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
}
}
}
}
force *= N / g;
double t1 = usecond();
double total_time = (t1 - t0) / 1e6;
double interm_time = (interm_t - t0) / 1e6;
double halo_time = (halo_t - interm_t) / 1e6;
double stencil_time = (t1 - halo_t) / 1e6;
std::cout << GridLogIntegrator << "Total time for force computation (s) : " << total_time << std::endl;
std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
std::cout << GridLogIntegrator << "Halo time in force computation (s) : " << halo_time << std::endl;
std::cout << GridLogIntegrator << "Stencil time in force computation (s) : " << stencil_time << std::endl;
double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
double Gflops = flops / (total_time * 1e9);
double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
std::cout << GridLogIntegrator << "Flops: " << flops << " - Gflop/s : " << Gflops << std::endl;
std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << " - Gflop/s NS: " << Gflops_no_stencil << std::endl;
}
};
} // namespace Grid
#endif // SCALAR_INT_ACTION_H

View File

@@ -211,7 +211,7 @@ typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
ScalarAdjGenericHMCRunner;
template <int Colours>
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;
} // namespace QCD
} // namespace Grid

View File

@@ -48,6 +48,22 @@ with this program; if not, write to the Free Software Foundation, Inc.,
} \
}
#define RegisterLoadCheckPointerMetadataFunction(NAME) \
template < class Metadata > \
void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \
if (!have_CheckPointer) { \
std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME \
<< std::endl; \
CP = std::unique_ptr<CheckpointerBaseModule>( \
new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_)); \
have_CheckPointer = true; \
} else { \
std::cout << GridLogError << "Checkpointer already loaded " \
<< std::endl; \
exit(1); \
} \
}
namespace Grid {
namespace QCD {
@@ -77,7 +93,7 @@ class HMCResourceManager {
bool have_CheckPointer;
// NOTE: operator << is not overloaded for std::vector<string>
// so thsi function is necessary
// so this function is necessary
void output_vector_string(const std::vector<std::string> &vs){
for (auto &i: vs)
std::cout << i << " ";
@@ -254,6 +270,7 @@ class HMCResourceManager {
RegisterLoadCheckPointerFunction(Nersc);
#ifdef HAVE_LIME
RegisterLoadCheckPointerFunction(ILDG);
RegisterLoadCheckPointerMetadataFunction(Scidac);
#endif
////////////////////////////////////////////////////////

View File

@@ -76,6 +76,14 @@ class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
}
}
void check_filename(const std::string &filename){
std::ifstream f(filename.c_str());
if(!f.good()){
std::cout << GridLogError << "Filename " << filename << " not found. Aborting. " << std::endl;
abort();
};
}
virtual void initialize(const CheckpointerParameters &Params) = 0;
virtual void CheckpointRestore(int traj, typename Impl::Field &U,

View File

@@ -93,6 +93,9 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);
BinarySimpleMunger<sobj_double, sobj> munge;

View File

@@ -136,6 +136,20 @@ class ILDGCPModule: public CheckPointerModule< ImplementationPolicy> {
};
template<class ImplementationPolicy, class Metadata>
class ScidacCPModule: public CheckPointerModule< ImplementationPolicy> {
typedef CheckPointerModule< ImplementationPolicy> CPBase;
Metadata M;
// acquire resource
virtual void initialize(){
this->CheckPointPtr.reset(new ScidacHmcCheckpointer<ImplementationPolicy, Metadata>(this->Par_, M));
}
public:
ScidacCPModule(typename CPBase::APar Par, Metadata M_):M(M_), CPBase(Par) {}
template <class ReaderClass>
ScidacCPModule(Reader<ReaderClass>& Reader) : Parametrized<typename CPBase::APar>(Reader), M(Reader){};
};
#endif

View File

@@ -34,6 +34,7 @@ directory
#include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h>
#include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h>
#include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h>
#include <Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h>
//#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>

View File

@@ -74,10 +74,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
GridBase *grid = U._grid;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
IldgWriter _IldgWriter;
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(config);
_IldgWriter.writeConfiguration(U, traj, config, config);
_IldgWriter.close();
@@ -95,6 +95,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);

View File

@@ -69,6 +69,9 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> {
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);
FieldMetaData header;
NerscIO::readRNGState(sRNG, pRNG, header, rng);

View File

@@ -0,0 +1,125 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/ScidacCheckpointer.h
Copyright (C) 2018
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef SCIDAC_CHECKPOINTER
#define SCIDAC_CHECKPOINTER
#ifdef HAVE_LIME
#include <iostream>
#include <sstream>
#include <string>
namespace Grid {
namespace QCD {
// For generic fields
template <class Implementation, class Metadata>
class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
private:
CheckpointerParameters Params;
Metadata MData;
typedef typename Implementation::Field Field;
public:
//INHERIT_GIMPL_TYPES(Implementation);
ScidacHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata& M_):MData(M_) { initialize(Params_); }
void initialize(const CheckpointerParameters &Params_) {
Params = Params_;
// check here that the format is valid
int ieee32big = (Params.format == std::string("IEEE32BIG"));
int ieee32 = (Params.format == std::string("IEEE32"));
int ieee64big = (Params.format == std::string("IEEE64BIG"));
int ieee64 = (Params.format == std::string("IEEE64"));
if (!(ieee64big || ieee32 || ieee32big || ieee64)) {
std::cout << GridLogError << "Unrecognized file format " << Params.format
<< std::endl;
std::cout << GridLogError
<< "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
<< std::endl;
exit(1);
}
}
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
GridBase *grid = U._grid;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
ScidacWriter _ScidacWriter(grid->IsBoss());
_ScidacWriter.open(config);
_ScidacWriter.writeScidacFieldRecord(U, MData);
_ScidacWriter.close();
std::cout << GridLogMessage << "Written Scidac Configuration on " << config
<< " checksum " << std::hex << nersc_csum<<"/"
<< scidac_csuma<<"/" << scidac_csumb
<< std::dec << std::endl;
}
};
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
Metadata md_content;
ScidacReader _ScidacReader;
_ScidacReader.open(config);
_ScidacReader.readScidacFieldRecord(U,md_content); // format from the header
_ScidacReader.close();
std::cout << GridLogMessage << "Read Scidac Configuration from " << config
<< " checksum " << std::hex
<< nersc_csum<<"/"
<< scidac_csuma<<"/"
<< scidac_csumb
<< std::dec << std::endl;
};
};
}
}
#endif // HAVE_LIME
#endif // ILDG_CHECKPOINTER

View File

@@ -114,18 +114,26 @@ class Integrator {
// input U actually not used in the fundamental case
// Fundamental updates, include smearing
for (int a = 0; a < as[level].actions.size(); ++a) {
for (int a = 0; a < as[level].actions.size(); ++a) {
double start_full = usecond();
Field force(U._grid);
conformable(U._grid, Mom._grid);
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
double start_force = usecond();
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
force = FieldImplementation::projectForce(force); // Ta for gauge fields
double end_force = usecond();
Real force_abs = std::sqrt(norm2(force)/U._grid->gSites());
std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl;
std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
Mom -= force * ep;
double end_full = usecond();
double time_full = (end_full - start_full) / 1e3;
double time_force = (end_force - start_force) / 1e3;
std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)" << std::endl;
}
// Force from the other representations

View File

@@ -92,6 +92,19 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
PlaquetteMod(): ObsBase(NoParameters()){}
};
template < class Impl >
class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase;
using ObsBase::ObsBase; // for constructors
// acquire resource
virtual void initialize(){
this->ObservablePtr.reset(new PolyakovLogger<Impl>());
}
public:
PolyakovMod(): ObsBase(NoParameters()){}
};
template < class Impl >
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{

View File

@@ -45,5 +45,7 @@ class HmcObservable {
#include "plaquette.h"
#include "topological_charge.h"
#include "polyakov_loop.h"
#endif // HMC_OBSERVABLE_H

View File

@@ -0,0 +1,68 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/modules/polyakov_line.h
Copyright (C) 2017
Author: David Preti <david.preti@csic.es>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef HMC_POLYAKOV_H
#define HMC_POLYAKOV_H
namespace Grid {
namespace QCD {
// this is only defined for a gauge theory
template <class Impl>
class PolyakovLogger : public HmcObservable<typename Impl::Field> {
public:
// here forces the Impl to be of gauge fields
// if not the compiler will complain
INHERIT_GIMPL_TYPES(Impl);
// necessary for HmcObservable compatibility
typedef typename Impl::Field Field;
void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
ComplexD polyakov = WilsonLoops<Impl>::avgPolyakovLoop(U);
int def_prec = std::cout.precision();
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Polyakov Loop: [ " << traj << " ] "<< polyakov << std::endl;
std::cout.precision(def_prec);
}
};
} // namespace QCD
} // namespace Grid
#endif // HMC_POLYAKOV_H

View File

@@ -23,6 +23,7 @@ class AdjointRep {
typedef typename SU_Adjoint<ncolour>::LatticeAdjMatrix LatticeMatrix;
typedef typename SU_Adjoint<ncolour>::LatticeAdjField LatticeField;
static const int Dimension = ncolour * ncolour - 1;
static const bool isFundamental = false;
LatticeField U;

View File

@@ -19,6 +19,7 @@ template <int ncolour>
class FundamentalRep {
public:
static const int Dimension = ncolour;
static const bool isFundamental = true;
// typdef to be used by the Representations class in HMC to get the
// types for the higher representation fields

View File

@@ -29,6 +29,7 @@ class TwoIndexRep {
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
static const int Dimension = ncolour * (ncolour + S) / 2;
static const bool isFundamental = false;
LatticeField U;

View File

@@ -6,30 +6,33 @@
#ifndef GAUGE_CONFIG_
#define GAUGE_CONFIG_
namespace Grid {
namespace Grid
{
namespace QCD {
namespace QCD
{
//trivial class for no smearing
template< class Impl >
class NoSmearing {
//trivial class for no smearing
template <class Impl>
class NoSmearing
{
public:
INHERIT_FIELD_TYPES(Impl);
Field* ThinField;
Field *ThinField;
NoSmearing(): ThinField(NULL) {}
NoSmearing() : ThinField(NULL) {}
void set_Field(Field& U) { ThinField = &U; }
void set_Field(Field &U) { ThinField = &U; }
void smeared_force(Field&) const {}
void smeared_force(Field &) const {}
Field& get_SmearedU() { return *ThinField; }
Field &get_SmearedU() { return *ThinField; }
Field& get_U(bool smeared = false) {
Field &get_U(bool smeared = false)
{
return *ThinField;
}
};
/*!
@@ -44,32 +47,36 @@ public:
It stores a list of smeared configurations.
*/
template <class Gimpl>
class SmearedConfiguration {
public:
class SmearedConfiguration
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
private:
private:
const unsigned int smearingLevels;
Smear_Stout<Gimpl> StoutSmearing;
std::vector<GaugeField> SmearedSet;
// Member functions
//====================================================================
void fill_smearedSet(GaugeField& U) {
ThinLinks = &U; // attach the smearing routine to the field U
void fill_smearedSet(GaugeField &U)
{
ThinLinks = &U; // attach the smearing routine to the field U
// check the pointer is not null
if (ThinLinks == NULL)
std::cout << GridLogError
<< "[SmearedConfiguration] Error in ThinLinks pointer\n";
if (smearingLevels > 0) {
if (smearingLevels > 0)
{
std::cout << GridLogDebug
<< "[SmearedConfiguration] Filling SmearedSet\n";
GaugeField previous_u(ThinLinks->_grid);
previous_u = *ThinLinks;
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
{
StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
previous_u = SmearedSet[smearLvl];
@@ -81,9 +88,10 @@ class SmearedConfiguration {
}
}
//====================================================================
GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
const GaugeField& GaugeK) const {
GridBase* grid = GaugeK._grid;
GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime,
const GaugeField &GaugeK) const
{
GridBase *grid = GaugeK._grid;
GaugeField C(grid), SigmaK(grid), iLambda(grid);
GaugeLinkField iLambda_mu(grid);
GaugeLinkField iQ(grid), e_iQ(grid);
@@ -94,7 +102,8 @@ class SmearedConfiguration {
SigmaK = zero;
iLambda = zero;
for (int mu = 0; mu < Nd; mu++) {
for (int mu = 0; mu < Nd; mu++)
{
Cmu = peekLorentz(C, mu);
GaugeKmu = peekLorentz(GaugeK, mu);
SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
@@ -104,20 +113,22 @@ class SmearedConfiguration {
pokeLorentz(iLambda, iLambda_mu, mu);
}
StoutSmearing.derivative(SigmaK, iLambda,
GaugeK); // derivative of SmearBase
GaugeK); // derivative of SmearBase
return SigmaK;
}
/*! @brief Returns smeared configuration at level 'Level' */
const GaugeField& get_smeared_conf(int Level) const {
const GaugeField &get_smeared_conf(int Level) const
{
return SmearedSet[Level];
}
//====================================================================
void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
const GaugeLinkField& GaugeK) const {
GridBase* grid = iQ._grid;
void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ,
const GaugeLinkField &iQ, const GaugeLinkField &Sigmap,
const GaugeLinkField &GaugeK) const
{
GridBase *grid = iQ._grid;
GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
GaugeLinkField unity(grid);
unity = 1.0;
@@ -206,15 +217,15 @@ class SmearedConfiguration {
}
//====================================================================
public:
GaugeField*
ThinLinks; /*!< @brief Pointer to the thin
links configuration */
public:
GaugeField *
ThinLinks; /* Pointer to the thin links configuration */
/*! @brief Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
Smear_Stout<Gimpl>& Stout)
: smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
/* Standard constructor */
SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear,
Smear_Stout<Gimpl> &Stout)
: smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
{
for (unsigned int i = 0; i < smearingLevels; ++i)
SmearedSet.push_back(*(new GaugeField(UGrid)));
}
@@ -223,21 +234,29 @@ class SmearedConfiguration {
SmearedConfiguration()
: smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
// attach the smeared routines to the thin links U and fill the smeared set
void set_Field(GaugeField& U) { fill_smearedSet(U); }
void set_Field(GaugeField &U)
{
double start = usecond();
fill_smearedSet(U);
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << "Smearing in " << time << " ms" << std::endl;
}
//====================================================================
void smeared_force(GaugeField& SigmaTilde) const {
if (smearingLevels > 0) {
void smeared_force(GaugeField &SigmaTilde) const
{
if (smearingLevels > 0)
{
double start = usecond();
GaugeField force = SigmaTilde; // actually = U*SigmaTilde
GaugeLinkField tmp_mu(SigmaTilde._grid);
for (int mu = 0; mu < Nd; mu++) {
for (int mu = 0; mu < Nd; mu++)
{
// to get just SigmaTilde
tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
peekLorentz(force, mu);
tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu);
pokeLorentz(force, tmp_mu, mu);
}
@@ -246,33 +265,43 @@ class SmearedConfiguration {
force = AnalyticSmearedForce(force, *ThinLinks);
for (int mu = 0; mu < Nd; mu++) {
for (int mu = 0; mu < Nd; mu++)
{
tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
pokeLorentz(SigmaTilde, tmp_mu, mu);
}
} // if smearingLevels = 0 do nothing
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;
} // if smearingLevels = 0 do nothing
}
//====================================================================
GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
GaugeField& get_U(bool smeared = false) {
GaugeField &get_U(bool smeared = false)
{
// get the config, thin links by default
if (smeared) {
if (smearingLevels) {
if (smeared)
{
if (smearingLevels)
{
RealD impl_plaq =
WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
<< std::endl;
return get_SmearedU();
} else {
}
else
{
RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
<< std::endl;
return *ThinLinks;
}
} else {
}
else
{
RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
<< std::endl;

View File

@@ -173,8 +173,8 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
std::cout << "Time to evolve " << diff.count() << " s\n";
#endif
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " "
<< energyDensityPlaquette(step,out) << std::endl;
<< step << " " << tau(step) << " "
<< energyDensityPlaquette(step,out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "
@@ -193,8 +193,8 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " "
<< energyDensityPlaquette(out) << std::endl;
<< step << " " << taus << " "
<< energyDensityPlaquette(out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "

View File

@@ -746,7 +746,7 @@ template<typename GaugeField,typename GaugeMat>
}
}
template<typename GaugeField>
static void ColdConfiguration(GridParallelRNG &pRNG,GaugeField &out){
static void ColdConfiguration(GaugeField &out){
typedef typename GaugeField::vector_type vector_type;
typedef iSUnMatrix<vector_type> vMatrixType;
typedef Lattice<vMatrixType> LatticeMatrixType;
@@ -757,6 +757,10 @@ template<typename GaugeField,typename GaugeMat>
PokeIndex<LorentzIndex>(out,Umu,mu);
}
}
template<typename GaugeField>
static void ColdConfiguration(GridParallelRNG &pRNG,GaugeField &out){
ColdConfiguration(out);
}
template<typename LatticeMatrixType>
static void taProj( const LatticeMatrixType &in, LatticeMatrixType &out){

View File

@@ -123,6 +123,28 @@ public:
return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
}
//////////////////////////////////////////////////
// average over all x,y,z the temporal loop
//////////////////////////////////////////////////
static ComplexD avgPolyakovLoop(const GaugeField &Umu) { //assume Nd=4
GaugeMat Ut(Umu._grid), P(Umu._grid);
ComplexD out;
int T = Umu._grid->GlobalDimensions()[3];
int X = Umu._grid->GlobalDimensions()[0];
int Y = Umu._grid->GlobalDimensions()[1];
int Z = Umu._grid->GlobalDimensions()[2];
Ut = peekLorentz(Umu,3); //Select temporal direction
P = Ut;
for (int t=1;t<T;t++){
P = Gimpl::CovShiftForward(Ut,3,P);
}
RealD norm = 1.0/(Nc*X*Y*Z*T);
out = sum(trace(P))*norm;
return out;
}
//////////////////////////////////////////////////
// average over traced single links
//////////////////////////////////////////////////
@@ -190,6 +212,7 @@ public:
// For the force term
/*
static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
GridBase *grid = Umu._grid;
std::vector<GaugeMat> U(Nd, grid);
@@ -203,7 +226,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
for (int nu = 0; nu < Nd; nu++) {
if (nu != mu) {
// this is ~10% faster than the Staple
// this is ~10% faster than the Staple -- PAB: so what it gives the WRONG answers for other BC's!
tmp1 = Cshift(U[nu], mu, 1);
tmp2 = Cshift(U[mu], nu, 1);
staple += tmp1* adj(U[nu]*tmp2);
@@ -213,7 +236,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
}
staple = U[mu]*staple;
}
*/
//////////////////////////////////////////////////
// the sum over all staples on each site
//////////////////////////////////////////////////
@@ -291,9 +314,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
}
}
//////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// the sum over all staples on each site in direction mu,nu, lower part
//////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
static void StapleLower(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
int nu) {
if (nu != mu) {
@@ -315,7 +338,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
//
staple = Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
mu);
}
}
@@ -325,7 +350,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){
// Fmn +--<--+ Ut +--<--+
// | | | |
// (x)+-->--+ +-->--+(x)
// (x)+-->--+ +-->--+(x) - h.c.
// | | | |
// +--<--+ +--<--+
@@ -335,7 +360,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
GaugeMat v = Vup - Vdn;
GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu); // some redundant copies
GaugeMat vu = v*u;
FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
//FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
FS = (u*v + Cshift(vu, mu, -1));
FS = 0.125*(FS - adj(FS));
}
static Real TopologicalCharge(GaugeLorentz &U){
@@ -360,6 +387,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
return TensorRemove(Tq).real();
}
//////////////////////////////////////////////////////
// Similar to above for rectangle is required
//////////////////////////////////////////////////////

View File

@@ -31,113 +31,10 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
#define GRID_SERIALISATION_ABSTRACT_READER_H
#include <type_traits>
#include <Grid/tensors/Tensors.h>
#include <Grid/serialisation/VectorUtils.h>
namespace Grid {
// Vector IO utilities ///////////////////////////////////////////////////////
// helper function to read space-separated values
template <typename T>
std::vector<T> strToVec(const std::string s)
{
std::istringstream sstr(s);
T buf;
std::vector<T> v;
while(!sstr.eof())
{
sstr >> buf;
v.push_back(buf);
}
return v;
}
// output to streams for vectors
template < class T >
inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
{
os << "[";
for (auto &x: v)
{
os << x << " ";
}
if (v.size() > 0)
{
os << "\b";
}
os << "]";
return os;
}
// Vector element trait //////////////////////////////////////////////////////
template <typename T>
struct element
{
typedef T type;
static constexpr bool is_number = false;
};
template <typename T>
struct element<std::vector<T>>
{
typedef typename element<T>::type type;
static constexpr bool is_number = std::is_arithmetic<T>::value
or is_complex<T>::value
or element<T>::is_number;
};
// Vector flattening utility class ////////////////////////////////////////////
// Class to flatten a multidimensional std::vector
template <typename V>
class Flatten
{
public:
typedef typename element<V>::type Element;
public:
explicit Flatten(const V &vector);
const V & getVector(void);
const std::vector<Element> & getFlatVector(void);
const std::vector<size_t> & getDim(void);
private:
void accumulate(const Element &e);
template <typename W>
void accumulate(const W &v);
void accumulateDim(const Element &e);
template <typename W>
void accumulateDim(const W &v);
private:
const V &vector_;
std::vector<Element> flatVector_;
std::vector<size_t> dim_;
};
// Class to reconstruct a multidimensional std::vector
template <typename V>
class Reconstruct
{
public:
typedef typename element<V>::type Element;
public:
Reconstruct(const std::vector<Element> &flatVector,
const std::vector<size_t> &dim);
const V & getVector(void);
const std::vector<Element> & getFlatVector(void);
const std::vector<size_t> & getDim(void);
private:
void fill(std::vector<Element> &v);
template <typename W>
void fill(W &v);
void resize(std::vector<Element> &v, const unsigned int dim);
template <typename W>
void resize(W &v, const unsigned int dim);
private:
V vector_;
const std::vector<Element> &flatVector_;
std::vector<size_t> dim_;
size_t ind_{0};
unsigned int dimInd_{0};
};
// Pair IO utilities /////////////////////////////////////////////////////////
// helper function to parse input in the format "<obj1 obj2>"
template <typename T1, typename T2>
@@ -151,15 +48,15 @@ namespace Grid {
do
{
is.get(c);
} while (c != '<' && !is.eof());
if (c == '<')
} while (c != '(' && !is.eof());
if (c == '(')
{
int start = is.tellg();
do
{
is.get(c);
} while (c != '>' && !is.eof());
if (c == '>')
} while (c != ')' && !is.eof());
if (c == ')')
{
int end = is.tellg();
int psize = end - start - 1;
@@ -182,7 +79,7 @@ namespace Grid {
template <class T1, class T2>
inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
{
os << "<" << p.first << " " << p.second << ">";
os << "(" << p.first << " " << p.second << ")";
return os;
}
@@ -205,6 +102,12 @@ namespace Grid {
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
write(const std::string& s, const U &output);
template <typename U>
void write(const std::string &s, const iScalar<U> &output);
template <typename U, int N>
void write(const std::string &s, const iVector<U, N> &output);
template <typename U, int N>
void write(const std::string &s, const iMatrix<U, N> &output);
private:
T *upcast;
};
@@ -224,6 +127,12 @@ namespace Grid {
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
read(const std::string& s, U &output);
template <typename U>
void read(const std::string &s, iScalar<U> &output);
template <typename U, int N>
void read(const std::string &s, iVector<U, N> &output);
template <typename U, int N>
void read(const std::string &s, iMatrix<U, N> &output);
protected:
template <typename U>
void fromString(U &output, const std::string &s);
@@ -237,203 +146,9 @@ namespace Grid {
};
template<typename T> struct isWriter {
static const bool value = false;
};
// Generic writer interface
// serializable base class
class Serializable
{
public:
template <typename T>
static inline void write(Writer<T> &WR,const std::string &s,
const Serializable &obj)
{}
template <typename T>
static inline void read(Reader<T> &RD,const std::string &s,
Serializable &obj)
{}
friend inline std::ostream & operator<<(std::ostream &os,
const Serializable &obj)
{
return os;
}
};
// Flatten class template implementation /////////////////////////////////////
template <typename V>
void Flatten<V>::accumulate(const Element &e)
{
flatVector_.push_back(e);
}
template <typename V>
template <typename W>
void Flatten<V>::accumulate(const W &v)
{
for (auto &e: v)
{
accumulate(e);
}
}
template <typename V>
void Flatten<V>::accumulateDim(const Element &e) {};
template <typename V>
template <typename W>
void Flatten<V>::accumulateDim(const W &v)
{
dim_.push_back(v.size());
accumulateDim(v[0]);
}
template <typename V>
Flatten<V>::Flatten(const V &vector)
: vector_(vector)
{
accumulate(vector_);
accumulateDim(vector_);
}
template <typename V>
const V & Flatten<V>::getVector(void)
{
return vector_;
}
template <typename V>
const std::vector<typename Flatten<V>::Element> &
Flatten<V>::getFlatVector(void)
{
return flatVector_;
}
template <typename V>
const std::vector<size_t> & Flatten<V>::getDim(void)
{
return dim_;
}
// Reconstruct class template implementation /////////////////////////////////
template <typename V>
void Reconstruct<V>::fill(std::vector<Element> &v)
{
for (auto &e: v)
{
e = flatVector_[ind_++];
}
}
template <typename V>
template <typename W>
void Reconstruct<V>::fill(W &v)
{
for (auto &e: v)
{
fill(e);
}
}
template <typename V>
void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
{
v.resize(dim_[dim]);
}
template <typename V>
template <typename W>
void Reconstruct<V>::resize(W &v, const unsigned int dim)
{
v.resize(dim_[dim]);
for (auto &e: v)
{
resize(e, dim + 1);
}
}
template <typename V>
Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
const std::vector<size_t> &dim)
: flatVector_(flatVector)
, dim_(dim)
{
resize(vector_, 0);
fill(vector_);
}
template <typename V>
const V & Reconstruct<V>::getVector(void)
{
return vector_;
}
template <typename V>
const std::vector<typename Reconstruct<V>::Element> &
Reconstruct<V>::getFlatVector(void)
{
return flatVector_;
}
template <typename V>
const std::vector<size_t> & Reconstruct<V>::getDim(void)
{
return dim_;
}
// Generic writer interface //////////////////////////////////////////////////
template <typename T>
inline void push(Writer<T> &w, const std::string &s) {
w.push(s);
}
template <typename T>
inline void push(Writer<T> &w, const char *s)
{
w.push(std::string(s));
}
template <typename T>
inline void pop(Writer<T> &w)
{
w.pop();
}
template <typename T, typename U>
inline void write(Writer<T> &w, const std::string& s, const U &output)
{
w.write(s, output);
}
// Generic reader interface
template <typename T>
inline bool push(Reader<T> &r, const std::string &s)
{
return r.push(s);
}
template <typename T>
inline bool push(Reader<T> &r, const char *s)
{
return r.push(std::string(s));
}
template <typename T>
inline void pop(Reader<T> &r)
{
r.pop();
}
template <typename T, typename U>
inline void read(Reader<T> &r, const std::string &s, U &output)
{
r.read(s, output);
}
// Writer template implementation ////////////////////////////////////////////
// Writer template implementation
template <typename T>
Writer<T>::Writer(void)
{
@@ -467,6 +182,27 @@ namespace Grid {
{
upcast->writeDefault(s, output);
}
template <typename T>
template <typename U>
void Writer<T>::write(const std::string &s, const iScalar<U> &output)
{
upcast->writeDefault(s, tensorToVec(output));
}
template <typename T>
template <typename U, int N>
void Writer<T>::write(const std::string &s, const iVector<U, N> &output)
{
upcast->writeDefault(s, tensorToVec(output));
}
template <typename T>
template <typename U, int N>
void Writer<T>::write(const std::string &s, const iMatrix<U, N> &output)
{
upcast->writeDefault(s, tensorToVec(output));
}
// Reader template implementation
template <typename T>
@@ -502,7 +238,37 @@ namespace Grid {
{
upcast->readDefault(s, output);
}
template <typename T>
template <typename U>
void Reader<T>::read(const std::string &s, iScalar<U> &output)
{
typename TensorToVec<iScalar<U>>::type v;
upcast->readDefault(s, v);
vecToTensor(output, v);
}
template <typename T>
template <typename U, int N>
void Reader<T>::read(const std::string &s, iVector<U, N> &output)
{
typename TensorToVec<iVector<U, N>>::type v;
upcast->readDefault(s, v);
vecToTensor(output, v);
}
template <typename T>
template <typename U, int N>
void Reader<T>::read(const std::string &s, iMatrix<U, N> &output)
{
typename TensorToVec<iMatrix<U, N>>::type v;
upcast->readDefault(s, v);
vecToTensor(output, v);
}
template <typename T>
template <typename U>
void Reader<T>::fromString(U &output, const std::string &s)
@@ -521,6 +287,76 @@ namespace Grid {
abort();
}
}
// serializable base class ///////////////////////////////////////////////////
class Serializable
{
public:
template <typename T>
static inline void write(Writer<T> &WR,const std::string &s,
const Serializable &obj)
{}
template <typename T>
static inline void read(Reader<T> &RD,const std::string &s,
Serializable &obj)
{}
friend inline std::ostream & operator<<(std::ostream &os,
const Serializable &obj)
{
return os;
}
};
// Generic writer interface //////////////////////////////////////////////////
template <typename T>
inline void push(Writer<T> &w, const std::string &s) {
w.push(s);
}
template <typename T>
inline void push(Writer<T> &w, const char *s)
{
w.push(std::string(s));
}
template <typename T>
inline void pop(Writer<T> &w)
{
w.pop();
}
template <typename T, typename U>
inline void write(Writer<T> &w, const std::string& s, const U &output)
{
w.write(s, output);
}
// Generic reader interface //////////////////////////////////////////////////
template <typename T>
inline bool push(Reader<T> &r, const std::string &s)
{
return r.push(s);
}
template <typename T>
inline bool push(Reader<T> &r, const char *s)
{
return r.push(std::string(s));
}
template <typename T>
inline void pop(Reader<T> &r)
{
r.pop();
}
template <typename T, typename U>
inline void read(Reader<T> &r, const std::string &s, U &output)
{
r.read(s, output);
}
}
#endif

View File

@@ -55,6 +55,11 @@ void Hdf5Writer::writeDefault(const std::string &s, const char *x)
writeDefault(s, sx);
}
Group & Hdf5Writer::getGroup(void)
{
return group_;
}
// Reader implementation ///////////////////////////////////////////////////////
Hdf5Reader::Hdf5Reader(const std::string &fileName)
: fileName_(fileName)
@@ -103,3 +108,8 @@ void Hdf5Reader::readDefault(const std::string &s, std::string &x)
x.resize(strType.getSize());
attribute.read(strType, &(x[0]));
}
Group & Hdf5Reader::getGroup(void)
{
return group_;
}

View File

@@ -5,6 +5,7 @@
#include <string>
#include <vector>
#include <H5Cpp.h>
#include <Grid/tensors/Tensors.h>
#include "Hdf5Type.h"
#ifndef H5_NO_NAMESPACE
@@ -37,6 +38,7 @@ namespace Grid
template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
writeDefault(const std::string &s, const std::vector<U> &x);
H5NS::Group & getGroup(void);
private:
template <typename U>
void writeSingleAttribute(const U &x, const std::string &name,
@@ -64,6 +66,7 @@ namespace Grid
template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
readDefault(const std::string &s, std::vector<U> &x);
H5NS::Group & getGroup(void);
private:
template <typename U>
void readSingleAttribute(U &x, const std::string &name,

View File

@@ -25,7 +25,7 @@
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <Grid/Grid.h>
using namespace Grid;
using namespace std;

View File

@@ -125,7 +125,11 @@ static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){
}\
template <typename T>\
static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \
push(RD,s);\
if (!push(RD,s))\
{\
std::cout << Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl;\
return;\
};\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
pop(RD);\
}\

Some files were not shown because too many files have changed in this diff Show More