mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'develop' into feature/hadrons

Antonin Portelli 2018-05-28 11:52:47 +02:00
commit 200d35b38a
31 changed files with 1973 additions and 679 deletions

View File

@@ -51,7 +51,7 @@ namespace Grid {
       virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
       virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
-      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
       virtual void HermOp(const Field &in, Field &out)=0;
     };
@@ -309,36 +309,59 @@ namespace Grid {
     class SchurStaggeredOperator : public SchurOperatorBase<Field> {
      protected:
       Matrix &_Mat;
+      Field tmp;
+      RealD mass;
+      double tMpc;
+      double tIP;
+      double tMeo;
+      double taxpby_norm;
+      uint64_t ncall;
      public:
-      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
+      void Report(void)
+      {
+        std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
+        std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< tIP /ncall<<" usec "<<std::endl;
+        std::cout << GridLogMessage << " Mpc.MeoMoe        "<< tMeo/ncall<<" usec "<<std::endl;
+        std::cout << GridLogMessage << " Mpc.axpby_norm    "<< taxpby_norm/ncall<<" usec "<<std::endl;
+      }
+      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
+      {
+        assert( _Mat.isTrivialEE() );
+        mass = _Mat.Mass();
+        tMpc=0;
+        tIP =0;
+        tMeo=0;
+        taxpby_norm=0;
+        ncall=0;
+      }
       virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-        GridLogIterative.TimingMode(1);
-        std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
+        ncall++;
+        tMpc-=usecond();
         n2 = Mpc(in,out);
-        std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
+        tMpc+=usecond();
+        tIP-=usecond();
        ComplexD dot= innerProduct(in,out);
-        std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
+        tIP+=usecond();
        n1 = real(dot);
       }
       virtual void HermOp(const Field &in, Field &out){
-        std::cout << GridLogIterative << " HermOp "<<std::endl;
-        Mpc(in,out);
+        ncall++;
+        tMpc-=usecond();
+        _Mat.Meooe(in,out);
+        _Mat.Meooe(out,tmp);
+        tMpc+=usecond();
+        taxpby_norm-=usecond();
+        axpby(out,-1.0,mass*mass,tmp,in);
+        taxpby_norm+=usecond();
       }
       virtual RealD Mpc (const Field &in, Field &out) {
-        Field tmp(in._grid);
-        Field tmp2(in._grid);
-        std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
-        _Mat.Mooee(in,out);
-        _Mat.Mooee(out,tmp);
-        std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
+        tMeo-=usecond();
        _Mat.Meooe(in,out);
-        _Mat.Meooe(out,tmp2);
-        std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
-        RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
-        std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
+        _Mat.Meooe(out,tmp);
+        tMeo+=usecond();
+        taxpby_norm-=usecond();
+        RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
+        taxpby_norm+=usecond();
        return nn;
       }
       virtual RealD MpcDag (const Field &in, Field &out){
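A reading of the rewritten operator, for context: with a trivial even-even term (the constructor asserts _Mat.isTrivialEE() and caches m = _Mat.Mass()), the Schur operator computed by Mpc is

\[ \hat{M}\,\phi_e \;=\; m^2\,\phi_e \;-\; M_{eo} M_{oe}\,\phi_e , \]

realised as two Meooe applications followed by the fused axpby_norm with coefficients \((-1,\,m^2)\), which also returns \(\lVert \hat{M}\phi_e \rVert^2\) from the same memory pass.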

View File

@@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
   void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
     psi.checkerboard = src.checkerboard;
     conformable(psi, src);
@@ -69,7 +70,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
     Linop.HermOpAndNorm(psi, mmp, d, b);
     r = src - mmp;
     p = r;
@@ -96,38 +96,44 @@ class ConjugateGradient : public OperatorFunction<Field> {
               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
     GridStopWatch LinalgTimer;
+    GridStopWatch InnerTimer;
+    GridStopWatch AxpyNormTimer;
+    GridStopWatch LinearCombTimer;
     GridStopWatch MatrixTimer;
     GridStopWatch SolverTimer;
 
     SolverTimer.Start();
     int k;
-    for (k = 1; k <= MaxIterations; k++) {
+    for (k = 1; k <= MaxIterations*1000; k++) {
       c = cp;
 
       MatrixTimer.Start();
-      Linop.HermOpAndNorm(p, mmp, d, qq);
+      Linop.HermOp(p, mmp);
       MatrixTimer.Stop();
 
       LinalgTimer.Start();
+      //  RealD    qqck = norm2(mmp);
+      //  ComplexD dck  = innerProduct(p,mmp);
+      InnerTimer.Start();
+      ComplexD dc = innerProduct(p,mmp);
+      InnerTimer.Stop();
+      d = dc.real();
       a = c / d;
       b_pred = a * (a * qq - d) / c;
 
+      AxpyNormTimer.Start();
       cp = axpy_norm(r, -a, mmp, r);
+      AxpyNormTimer.Stop();
       b = cp / c;
 
-      // Fuse these loops ; should be really easy
-      psi = a * p + psi;
-      p = p * b + r;
+      LinearCombTimer.Start();
+      parallel_for(int ss=0;ss<src._grid->oSites();ss++){
+        vstream(psi[ss], a * p[ss] + psi[ss]);
+        vstream(p  [ss], b * p[ss] + r[ss]);
+      }
+      LinearCombTimer.Stop();
       LinalgTimer.Stop();
 
       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                 << " residual " << cp << " target " << rsq << std::endl;
+      std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
+      std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
 
       // Stopping condition
       if (cp <= rsq) {
@@ -148,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
         std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
         std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
         std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
 
         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
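A minimal scalar sketch of the loop fusion the commit performs (plain arrays and OpenMP, not Grid's expression templates or vstream over SIMD sites): p is loaded once and both psi and p are updated in a single sweep, which is what the parallel_for block above does per vectorised site.

    // fused_cg_update is an illustrative helper, not a Grid function
    void fused_cg_update(double a, double b,
                         double *psi, double *p, const double *r, long n)
    {
    #pragma omp parallel for
      for (long ss = 0; ss < n; ss++) {
        double ps = p[ss];            // single load of the search direction
        psi[ss] = a * ps + psi[ss];   // psi <- a*p + psi
        p[ss]   = b * ps + r[ss];     // p   <- b*p + r
      }
    }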

View File

@@ -43,6 +43,7 @@ namespace Grid {
 public:
   RealD   Tolerance;
   Integer MaxIterations;
+  Integer IterationsToComplete; // Number of iterations the CG took to finish. Filled in upon completion
   int verbose;
   MultiShiftFunction shifts;
@@ -163,7 +164,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
   for(int s=0;s<nshift;s++) {
     axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
   }
 
+  ///////////////////////////////////////
+  // Timers
+  ///////////////////////////////////////
+  GridStopWatch AXPYTimer;
+  GridStopWatch ShiftTimer;
+  GridStopWatch QRTimer;
+  GridStopWatch MatrixTimer;
+  GridStopWatch SolverTimer;
+  SolverTimer.Start();
 
   // Iteration loop
   int k;
@@ -171,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
   for (k=1;k<=MaxIterations;k++){
 
     a = c /cp;
+    AXPYTimer.Start();
     axpy(p,a,p,r);
+    AXPYTimer.Stop();
 
     // Note to self - direction ps is iterated separately
     // for each shift. Does not appear to have any scope
@@ -180,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
     // However SAME r is used. Could load "r" and update
     // ALL ps[s]. 2/3 Bandwidth saving
     // New Kernel: Load r, vector of coeffs, vector of pointers ps
+    AXPYTimer.Start();
     for(int s=0;s<nshift;s++){
       if ( ! converged[s] ) {
         if (s==0){
@@ -190,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
         }
       }
     }
+    AXPYTimer.Stop();
 
     cp=c;
-    Linop.HermOpAndNorm(p,mmp,d,qq);
+    MatrixTimer.Start();
+    //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
+    // The below is faster on KNL
+    Linop.HermOp(p,mmp);
+    d=real(innerProduct(p,mmp));
+    MatrixTimer.Stop();
 
+    AXPYTimer.Start();
     axpy(mmp,mass[0],p,mmp);
+    AXPYTimer.Stop();
     RealD rn = norm2(p);
     d += rn*mass[0];
 
     bp=b;
     b=-cp/d;
 
+    AXPYTimer.Start();
     c=axpy_norm(r,b,mmp,r);
+    AXPYTimer.Stop();
 
     // Toggle the recurrence history
     bs[0] = b;
     iz = 1-iz;
+    ShiftTimer.Start();
     for(int s=1;s<nshift;s++){
       if((!converged[s])){
         RealD z0 = z[s][1-iz];
@@ -215,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
         bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
       }
     }
+    ShiftTimer.Stop();
 
     for(int s=0;s<nshift;s++){
       int ss = s;
@@ -257,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
     if ( all_converged ){
 
+      SolverTimer.Stop();
+
       std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
       std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
@@ -269,8 +298,19 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
        RealD cn = norm2(src);
        std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
+
+      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
+      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tAXPY       " << AXPYTimer.Elapsed()   <<std::endl;
+      std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+      std::cout << GridLogMessage << "\tShift      " << ShiftTimer.Elapsed()  <<std::endl;
+
+      IterationsToComplete = k;
+
      return;
    }
  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
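With \(m_0 =\) mass[0], the HermOp-plus-reductions replacement above assembles the same denominator as the old HermOpAndNorm path:

\[ d \;=\; \mathrm{Re}\langle p,\,M p\rangle \;+\; m_0 \lVert p\rVert^2 \;=\; \mathrm{Re}\langle p,\,(M+m_0)\,p\rangle , \]

and after the axpy, mmp holds \((M+m_0)\,p\) for the residual update c = axpy_norm(r,b,mmp,r).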

View File

@@ -244,19 +244,11 @@ namespace Grid {
   template<class sobj,class vobj> strong_inline
   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpy(ret,a,x,y);
-    return norm2(ret);
+    return axpy_norm_fast(ret,a,x,y);
   }
   template<class sobj,class vobj> strong_inline
   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpby(ret,a,b,x,y);
-    return norm2(ret); // FIXME implement parallel norm in ss loop
+    return axpby_norm_fast(ret,a,b,x,y);
   }
 }

View File

@@ -33,7 +33,7 @@ namespace Grid {
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-  ComplexD nrm = innerProduct(arg,arg);
+  auto nrm = innerProduct(arg,arg);
   return std::real(nrm);
 }
@@ -43,31 +43,84 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
-  scalar_type nrm;
   GridBase *grid = left._grid;
+  const int pad = 8;
 
-  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
+  ComplexD inner;
+  Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
 
   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
     int nwork, mywork, myoff;
     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
 
-    decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
+    decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
     for(int ss=myoff;ss<mywork+myoff; ss++){
-      vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
+      vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
     }
-    sumarray[thr]=TensorRemove(vnrm) ;
+    // All threads sum across SIMD; reduce serial work at end
+    // one write per cacheline with streaming store
+    ComplexD tmp = Reduce(TensorRemove(vinner)) ;
+    vstream(sumarray[thr*pad],tmp);
   }
 
-  vector_type vvnrm; vvnrm=zero; // sum across threads
+  inner=0.0;
   for(int i=0;i<grid->SumArraySize();i++){
-    vvnrm = vvnrm+sumarray[i];
+    inner = inner+sumarray[i*pad];
   }
-  nrm = Reduce(vvnrm);// sum across simd
-  right._grid->GlobalSum(nrm);
-  return nrm;
+  right._grid->GlobalSum(inner);
+  return inner;
 }
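The pad = 8 stride and the single vstream per thread exist so that each partial sum lands on its own cache line and is written exactly once. A standalone sketch of the same false-sharing-avoidance pattern (plain OpenMP and std::complex rather than the Grid types; padded_sum is an illustrative name):

    #include <complex>
    #include <vector>
    #include <omp.h>

    std::complex<double> padded_sum(const std::complex<double> *data, int n)
    {
      const int pad = 8;                               // partials at least a cache line apart
      int nthr = omp_get_max_threads();
      std::vector<std::complex<double>> partial(nthr * pad, 0.0);
      #pragma omp parallel
      {
        int thr = omp_get_thread_num();
        std::complex<double> acc = 0.0;                // private to thread; sub summation
        #pragma omp for
        for (int i = 0; i < n; i++) acc += data[i];
        partial[thr * pad] = acc;                      // one write per cache line
      }
      std::complex<double> sum = 0.0;                  // serial sum across threads
      for (int t = 0; t < nthr; t++) sum += partial[t * pad];
      return sum;
    }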
+/////////////////////////
+// Fast axpby_norm
+// z = a x + b y
+// return norm z
+/////////////////////////
+template<class sobj,class vobj> strong_inline RealD
+axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+  sobj one(1.0);
+  return axpby_norm_fast(z,a,one,x,y);
+}
+
+template<class sobj,class vobj> strong_inline RealD
+axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+  const int pad = 8;
+  z.checkerboard = x.checkerboard;
+  conformable(z,x);
+  conformable(x,y);
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  RealD nrm;
+
+  GridBase *grid = x._grid;
+
+  Vector<RealD> sumarray(grid->SumArraySize()*pad);
+
+  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
+    int nwork, mywork, myoff;
+    GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
+
+    // private to thread; sub summation
+    decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+      vnrm = vnrm + innerProductD(tmp,tmp);
+      vstream(z._odata[ss],tmp);
+    }
+    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
+  }
+
+  nrm = 0.0; // sum across threads; linear in thread count but fast
+  for(int i=0;i<grid->SumArraySize();i++){
+    nrm = nrm+sumarray[i*pad];
+  }
+  z._grid->GlobalSum(nrm);
+  return nrm;
+}
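Call sites keep the old two-pass semantics; a usage sketch (z, a, b, x, y are illustrative names):

    RealD nn = axpby_norm_fast(z, a, b, x, y); // z updated, ||z||^2 returned, one memory pass
    // previously equivalent to two passes:  axpby(z,a,b,x,y);  RealD nn = norm2(z);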
 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)

View File

@@ -49,7 +49,8 @@ inline double usecond(void) {
 typedef std::chrono::system_clock          GridClock;
 typedef std::chrono::time_point<GridClock> GridTimePoint;
-typedef std::chrono::milliseconds          GridTime;
+typedef std::chrono::milliseconds          GridMillisecs;
+typedef std::chrono::microseconds          GridTime;
 typedef std::chrono::microseconds          GridUsecs;
 
 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
@@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli
   stream << time.count()<<" ms";
   return stream;
 }
+inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
+{
+  stream << time.count()<<" usec";
+  return stream;
+}
 
 class GridStopWatch {
  private:
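A usage sketch: GridStopWatch::Elapsed() returns a GridTime, which is now microseconds, so timer prints pick up the "usec" operator<< added above (the timed region here is illustrative):

    GridStopWatch Timer;
    Timer.Start();
    // ... timed region ...
    Timer.Stop();
    std::cout << GridLogMessage << "\tTimed " << Timer.Elapsed() << std::endl; // e.g. "1234 usec"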

View File

@@ -64,9 +64,12 @@ namespace Grid {
       virtual RealD  M    (const FermionField &in, FermionField &out)=0;
       virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;
 
-      // half checkerboard operations
+      // Query the even even properties to make algorithmic decisions
       virtual int    ConstEE(void)     { return 1; }; // clover returns zero as EE depends on gauge field
+      virtual int    isTrivialEE(void) { return 0; };
+      virtual RealD  Mass(void)        { return 0.0; };
 
+      // half checkerboard operations
       virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
       virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
       virtual void   Mooee       (const FermionField &in, FermionField &out)=0;

View File

@@ -764,7 +764,12 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
     inline void loadLinkElement(Simd &reg, ref &memory) {
       reg = memory;
     }
+
+    inline void InsertGaugeField(DoubledGaugeField &U_ds,
+                                 const GaugeLinkField &U,int mu)
+    {
+      PokeIndex<LorentzIndex>(U_ds, U, mu);
+    }
     inline void DoubleStore(GridBase *GaugeGrid,
                             DoubledGaugeField &UUUds, // for Naik term
                             DoubledGaugeField &Uds,
@@ -803,8 +808,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        U    = U    *phases;
        Udag = Udag *phases;
 
-       PokeIndex<LorentzIndex>(Uds, U, mu);
-       PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
+       InsertGaugeField(Uds,U,mu);
+       InsertGaugeField(Uds,Udag,mu+4);
+       // PokeIndex<LorentzIndex>(Uds, U, mu);
+       // PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
 
        // 3 hop based on thin links. Crazy huh ?
        U  = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -816,8 +823,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        UUU    = UUU    *phases;
        UUUdag = UUUdag *phases;
 
-       PokeIndex<LorentzIndex>(UUUds, UUU, mu);
-       PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4);
+       InsertGaugeField(UUUds,UUU,mu);
+       InsertGaugeField(UUUds,UUUdag,mu+4);
       }
     }
@@ -910,6 +917,23 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
       mac(&phi(), &UU(), &chi());
     }
 
+    inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
+    {
+      GridBase *GaugeGrid = U_ds._grid;
+      parallel_for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+
+        SiteScalarGaugeLink   ScalarU;
+        SiteDoubledGaugeField ScalarUds;
+
+        std::vector<int> lcoor;
+        GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+        peekLocalSite(ScalarUds, U_ds, lcoor);
+
+        peekLocalSite(ScalarU, U, lcoor);
+        ScalarUds(mu) = ScalarU();
+        pokeLocalSite(ScalarUds, U_ds, lcoor); // write the updated site back, as the per-site loops it replaces did
+      }
+    }
     inline void DoubleStore(GridBase *GaugeGrid,
                            DoubledGaugeField &UUUds, // for Naik term
                            DoubledGaugeField &Uds,
@@ -951,23 +975,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        U    = U    *phases;
        Udag = Udag *phases;
 
-       for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
-         SiteScalarGaugeLink   ScalarU;
-         SiteDoubledGaugeField ScalarUds;
-
-         std::vector<int> lcoor;
-         GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
-         peekLocalSite(ScalarUds, Uds, lcoor);
-
-         peekLocalSite(ScalarU, U, lcoor);
-         ScalarUds(mu) = ScalarU();
-
-         peekLocalSite(ScalarU, Udag, lcoor);
-         ScalarUds(mu + 4) = ScalarU();
-
-         pokeLocalSite(ScalarUds, Uds, lcoor);
-       }
+       InsertGaugeField(Uds,U,mu);
+       InsertGaugeField(Uds,Udag,mu+4);
 
        // 3 hop based on thin links. Crazy huh ?
        U  = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -979,24 +988,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        UUU    = UUU    *phases;
        UUUdag = UUUdag *phases;
 
-       for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
-
-         SiteScalarGaugeLink   ScalarU;
-         SiteDoubledGaugeField ScalarUds;
-
-         std::vector<int> lcoor;
-         GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
-
-         peekLocalSite(ScalarUds, UUUds, lcoor);
-         peekLocalSite(ScalarU, UUU, lcoor);
-         ScalarUds(mu) = ScalarU();
-
-         peekLocalSite(ScalarU, UUUdag, lcoor);
-         ScalarUds(mu + 4) = ScalarU();
-
-         pokeLocalSite(ScalarUds, UUUds, lcoor);
-       }
+       InsertGaugeField(UUUds,UUU,mu);
+       InsertGaugeField(UUUds,UUUdag,mu+4);
       }
     }
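Both InsertGaugeField variants fill one of the eight direction slots of a doubled gauge field; the packing convention DoubleStore relies on, sketched with illustrative field names (Uthin, Uds):

    for (int mu = 0; mu < Nd; mu++) {
      auto U    = PeekIndex<LorentzIndex>(Uthin, mu); // U_mu(x), forward hop
      auto Udag = adj(Cshift(U, mu, -1));             // U_mu^dag(x-mu), backward hop
      InsertGaugeField(Uds, U,    mu);                // slots 0..3: forward links
      InsertGaugeField(Uds, Udag, mu + 4);            // slots 4..7: backward links
    }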

View File

@@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
 template <class Impl>
 ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
                                                          RealD _mass,
+                                                         RealD _c1, RealD _c2,RealD _u0,
                                                          const ImplParams &p)
     : Kernels(p),
       _grid(&Fgrid),
@@ -62,6 +63,16 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
       UUUmuOdd(&Hgrid) ,
       _tmp(&Hgrid)
 {
+  int vol4;
+  int LLs=1;
+  c1=_c1;
+  c2=_c2;
+  u0=_u0;
+  vol4= _grid->oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+  vol4= _cbgrid->oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+  StencilOdd.BuildSurfaceList(LLs,vol4);
 }
 
 template <class Impl>
@@ -69,22 +80,10 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
                                                          GridRedBlackCartesian &Hgrid, RealD _mass,
                                                          RealD _c1, RealD _c2,RealD _u0,
                                                          const ImplParams &p)
-    : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
+    : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
 {
-  c1=_c1;
-  c2=_c2;
-  u0=_u0;
   ImportGauge(_Uthin,_Ufat);
 }
-template <class Impl>
-ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
-                                                         GridRedBlackCartesian &Hgrid, RealD _mass,
-                                                         const ImplParams &p)
-    : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
-{
-  ImportGaugeSimple(_Utriple,_Ufat);
-}
 
 ////////////////////////////////////////////////////////////
 // Momentum space propagator should be
@@ -98,11 +97,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Ufat,
 // of above link to implement fourier based solver.
 ////////////////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)
-{
-  ImportGauge(_Uthin,_Uthin);
-};
-template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
 {
   /////////////////////////////////////////////////////////////////
@@ -125,6 +119,20 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl
     PokeIndex<LorentzIndex>(Umu, -U, mu+4);
   }
 
+  CopyGaugeCheckerboards();
+}
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
+{
+  Umu   = _U;
+  UUUmu = _UUU;
+  CopyGaugeCheckerboards();
+}
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
+{
   pickCheckerboard(Even, UmuEven,  Umu);
   pickCheckerboard(Odd,  UmuOdd ,  Umu);
   pickCheckerboard(Even, UUUmuEven,UUUmu);
@@ -160,10 +168,7 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
   }
 
-  pickCheckerboard(Even, UmuEven,  Umu);
-  pickCheckerboard(Odd,  UmuOdd ,  Umu);
-  pickCheckerboard(Even, UUUmuEven, UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd,  UUUmu);
+  CopyGaugeCheckerboards();
 }
 
 /////////////////////////////
@@ -322,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+  DhopCalls+=2;
   conformable(in._grid, _grid);  // verifies full grid
   conformable(in._grid, out._grid);
@@ -332,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+  DhopCalls+=1;
   conformable(in._grid, _cbgrid);    // verifies half grid
   conformable(in._grid, out._grid);  // drops the cb check
@@ -343,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
+  DhopCalls+=1;
   conformable(in._grid, _cbgrid);    // verifies half grid
   conformable(in._grid, out._grid);  // drops the cb check
@@ -374,25 +382,193 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
                                                   DoubledGaugeField &U,
                                                   DoubledGaugeField &UUU,
                                                   const FermionField &in,
-                                                  FermionField &out, int dag) {
+                                                  FermionField &out, int dag)
+{
+#ifdef GRID_OMP
+  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+  else
+#endif
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+}
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
+                                                                 DoubledGaugeField &U,
+                                                                 DoubledGaugeField &UUU,
+                                                                 const FermionField &in,
+                                                                 FermionField &out, int dag)
+{
+#ifdef GRID_OMP
+  Compressor compressor;
+  int len = U._grid->oSites();
+  const int LLs = 1;
+
+  DhopTotalTime -= usecond();
+
+  DhopFaceTime -= usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  st.CommsMergeSHM(compressor);
+  DhopFaceTime += usecond();
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime -= usecond();
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms);
+
+    if (tid >= ncomms) {
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = len;
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+          // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
+        }
+      }
+    } else {
+      st.CommunicateThreaded();
+    }
+  }
+  DhopComputeTime += usecond();
+
+  // First to enter, last to leave timing
+  DhopFaceTime -= usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime += usecond(); // balances the preceding -= (the page showed a second -=, which would corrupt the counter)
+
+  DhopComputeTime2 -= usecond();
+  if (dag == DaggerYes) {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
+    }
+  } else {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
+    }
+  }
+  DhopComputeTime2 += usecond();
+#else
+  assert(0);
+#endif
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
+                                                             DoubledGaugeField &U,
+                                                             DoubledGaugeField &UUU,
+                                                             const FermionField &in,
+                                                             FermionField &out, int dag)
+{
   assert((dag == DaggerNo) || (dag == DaggerYes));
+
+  DhopTotalTime -= usecond();
+
+  DhopCommTime -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
+  DhopCommTime += usecond();
+
+  DhopComputeTime -= usecond();
   if (dag == DaggerYes) {
-    PARALLEL_FOR_LOOP
-    for (int sss = 0; sss < in._grid->oSites(); sss++) {
+    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
     }
   } else {
-    PARALLEL_FOR_LOOP
-    for (int sss = 0; sss < in._grid->oSites(); sss++) {
+    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
       Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
     }
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 };
+
+////////////////////////////////////////////////////////////////
+// Reporting
+////////////////////////////////////////////////////////////////
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::Report(void)
+{
+  std::vector<int> latt = GridDefaultLatt();
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : "
+            << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : "
+            << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime /Calls       : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _grid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"     <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven" <<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd"  <<std::endl;  StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
 
 ////////////////////////////////////////////////////////
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
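The myblock/myn arithmetic in DhopInternalOverlappedComms deals the n sites out to the compute threads as evenly as possible once the comms threads are peeled off. A standalone model of that partition (my_range is an illustrative helper name, not part of Grid):

    #include <cassert>

    // Thread tid in [ncomms, nthreads) computes sites [myblock, myblock+myn).
    // The first `rem` workers get chunk+1 sites, the rest get chunk, so the
    // ranges tile [0,n) exactly.
    void my_range(int tid, int nthreads, int ncomms, int n,
                  int &myblock, int &myn)
    {
      assert(nthreads > ncomms);
      int workers = nthreads - ncomms;
      int ttid    = tid - ncomms;   // worker index, 0..workers-1
      int chunk   = n / workers;
      int rem     = n % workers;
      if (ttid < rem) { myblock = ttid * chunk + ttid; myn = chunk + 1; }
      else            { myblock = ttid * chunk + rem;  myn = chunk;     }
    }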

View File

@@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
   FermionField _tmp;
   FermionField &tmp(void) { return _tmp; }
 
+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+
   ///////////////////////////////////////////////////////////////
   // Implement the abstract base
   ///////////////////////////////////////////////////////////////
@@ -105,25 +117,34 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
                     const FermionField &in, FermionField &out, int dag);
+  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                               const FermionField &in, FermionField &out, int dag);
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                                   const FermionField &in, FermionField &out, int dag);
 
-  // Constructor
+  //////////////////////////////////////////////////////////////////////////
+  // Grid own interface Constructor
+  //////////////////////////////////////////////////////////////////////////
   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
                            GridRedBlackCartesian &Hgrid, RealD _mass,
-                           RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
-                           const ImplParams &p = ImplParams());
-  ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
-                           GridRedBlackCartesian &Hgrid, RealD _mass,
+                           RealD _c1, RealD _c2,RealD _u0,
                            const ImplParams &p = ImplParams());
 
+  //////////////////////////////////////////////////////////////////////////
+  // MILC constructor no gauge fields
+  //////////////////////////////////////////////////////////////////////////
   ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
+                           RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
                            const ImplParams &p = ImplParams());
 
   // DoubleStore impl dependent
-  void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
-  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
-  void ImportGauge(const GaugeField &_Uthin);
+  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+  void ImportGauge      (const GaugeField &_Uthin ,const GaugeField &_Ufat);
+  void ImportGaugeSimple(const GaugeField &_UUU   ,const GaugeField &_U);
+  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+  DoubledGaugeField &GetU(void)   { return Umu ; } ;
+  DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  void CopyGaugeCheckerboards(void);
 
   ///////////////////////////////////////////////////////////////
   // Data members require to support the functionality
@@ -132,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
   // protected:
 public:
   // any other parameters of action ???
+  virtual int   isTrivialEE(void) { return 1; };
+  virtual RealD Mass(void) { return mass; }
   RealD mass;
   RealD u0;
   RealD c1;
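A hedged usage sketch of the MILC-facing path declared above: construct with grids and coefficients only, then hand over externally prepared doubled fields (all variable names here are illustrative):

    ImprovedStaggeredFermion<StaggeredImplR> Ds(Fgrid, Hgrid, mass, c1, c2, u0, params);
    Ds.ImportGaugeSimple(UUUmu_prebuilt, Umu_prebuilt); // triple- and one-link fields,
                                                        // phases/scales already applied
    RealD m = Ds.Mass();                                // queried by SchurStaggeredOperator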

View File

@@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3,
 // 5d lattice for DWF.
 template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
-                                                             GridCartesian         &FiveDimGrid,
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
                                                              GridRedBlackCartesian &FiveDimRedBlackGrid,
                                                              GridCartesian         &FourDimGrid,
                                                              GridRedBlackCartesian &FourDimRedBlackGrid,
@@ -121,16 +120,74 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian &Five
     assert(FiveDimGrid._simd_layout[0]    ==1);
   }
 
-  // Allocate the required comms buffer
+  int LLs = FiveDimGrid._rdimensions[0];
+  int vol4= FourDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol4);
+  vol4=FourDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol4);
+  StencilOdd.BuildSurfaceList(LLs,vol4);
+}
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
+{
+  pickCheckerboard(Even, UmuEven,  Umu);
+  pickCheckerboard(Odd,  UmuOdd ,  Umu);
+  pickCheckerboard(Even, UUUmuEven,UUUmu);
+  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
+}
+template<class Impl>
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
+                                                             GridCartesian         &FiveDimGrid,
+                                                             GridRedBlackCartesian &FiveDimRedBlackGrid,
+                                                             GridCartesian         &FourDimGrid,
+                                                             GridRedBlackCartesian &FourDimRedBlackGrid,
+                                                             RealD _mass,
+                                                             RealD _c1,RealD _c2, RealD _u0,
+                                                             const ImplParams &p) :
+  ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
+                             FourDimGrid,FourDimRedBlackGrid,
+                             _mass,_c1,_c2,_u0,p)
+{
   ImportGauge(_Uthin,_Ufat);
 }
+///////////////////////////////////////////////////
+// For MILC use; pass three link U's and 1 link U
+///////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
 {
-  ImportGauge(_Uthin,_Uthin);
-};
+  /////////////////////////////////////////////////////////////////
+  // Trivial import; phases and fattening and such like preapplied
+  /////////////////////////////////////////////////////////////////
+  for (int mu = 0; mu < Nd; mu++) {
+    auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
+    Impl::InsertGaugeField(UUUmu,U,mu);
+    U = adj( Cshift(U, mu, -3));
+    Impl::InsertGaugeField(UUUmu,-U,mu+4);
+    U = PeekIndex<LorentzIndex>(_Ufat, mu);
+    Impl::InsertGaugeField(Umu,U,mu);
+    U = adj( Cshift(U, mu, -1));
+    Impl::InsertGaugeField(Umu,-U,mu+4);
+  }
+  CopyGaugeCheckerboards();
+}
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
+{
+  /////////////////////////////////////////////////////////////////
+  // Trivial import; phases and fattening and such like preapplied
+  /////////////////////////////////////////////////////////////////
+  Umu   = _U;
+  UUUmu = _UUU;
+  CopyGaugeCheckerboards();
+}
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
 {
@@ -159,10 +216,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
   }
 
-  pickCheckerboard(Even, UmuEven,  Umu);
-  pickCheckerboard(Odd,  UmuOdd ,  Umu);
-  pickCheckerboard(Even, UUUmuEven, UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd,  UUUmu);
+  CopyGaugeCheckerboards();
 }
 
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
@@ -223,6 +277,162 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
   assert(0);
 }
 
+/*CHANGE */
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
+                                                    DoubledGaugeField & U,DoubledGaugeField & UUU,
+                                                    const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
+  else
+#endif
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
+}
+
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
+                                                                   DoubledGaugeField & U,DoubledGaugeField & UUU,
+                                                                   const FermionField &in, FermionField &out,int dag)
+{
+#ifdef GRID_OMP
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+
+  Compressor compressor;
+
+  int LLs = in._grid->_rdimensions[0];
+  int len = U._grid->oSites();
+
+  DhopFaceTime-=usecond();
+  st.Prepare();
+  st.HaloGather(in,compressor);
+  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();
+
+  double ctime=0;
+  double ptime=0;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
+  {
+    int tid = omp_get_thread_num();
+    int nthreads = omp_get_num_threads();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid  = tid - ncomms;
+      int n     = U._grid->oSites(); // 4d vol
+      int chunk = n / nthreads;
+      int rem   = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
+
+      // do the compute
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+          // Interior = 1; Exterior = 0; must implement for staggered
+          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          // Interior = 1; Exterior = 0;
+          int sU = ss;
+          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
+        }
+      }
+      ptime = usecond() - start;
+    } else {
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
+    }
+  }
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;
+
+  // First to enter, last to leave timing
+  st.CollateThreads();
+
+  DhopFaceTime-=usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime+=usecond();
+
+  DhopComputeTime2-=usecond();
+  if (dag == DaggerYes) {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
+    }
+  } else {
+    int sz=st.surface_list.size();
+    parallel_for (int ss = 0; ss < sz; ss++) {
+      int sU = st.surface_list[ss];
+      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
+    }
+  }
+  DhopComputeTime2+=usecond();
+#else
+  assert(0);
+#endif
+}
+
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+                                                               DoubledGaugeField & U,DoubledGaugeField & UUU,
+                                                               const FermionField &in, FermionField &out,int dag)
+{
+  Compressor compressor;
+  int LLs = in._grid->_rdimensions[0];
+
+  //double t1=usecond();
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
+  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
+
+  DhopComputeTime -= usecond();
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  if (dag == DaggerYes) {
+    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU=ss;
+      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
+    }
+  } else {
+    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
+      int sU=ss;
+      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
+    }
+  }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
+  //double t2=usecond();
+  //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
+  //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
+  //std::cout << __FILE__ << " " << __func__ << " Comm Time " << DhopCommTime << std::endl;
+  //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
+}
+/*CHANGE END*/
+
+/* ORG
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                                     DoubledGaugeField & U,DoubledGaugeField & UUU,
@@ -254,6 +464,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
   DhopComputeTime += usecond();
   DhopTotalTime   += usecond();
 }
+*/
 
 template<class Impl>
@@ -336,6 +547,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
   DhopTotalTime   = 0;
   DhopCommTime    = 0;
   DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();

View File

@@ -64,6 +64,8 @@ namespace QCD {
     double DhopCalls;
     double DhopCommTime;
    double DhopComputeTime;
+    double DhopComputeTime2;
+    double DhopFaceTime;
 
     ///////////////////////////////////////////////////////////////
     // Implement the abstract base
@@ -119,7 +121,27 @@ namespace QCD {
                       FermionField &out,
                       int dag);
 
+    void DhopInternalOverlappedComms(StencilImpl & st,
+                                     LebesgueOrder &lo,
+                                     DoubledGaugeField &U,
+                                     DoubledGaugeField &UUU,
+                                     const FermionField &in,
+                                     FermionField &out,
+                                     int dag);
+
+    void DhopInternalSerialComms(StencilImpl & st,
+                                 LebesgueOrder &lo,
+                                 DoubledGaugeField &U,
+                                 DoubledGaugeField &UUU,
+                                 const FermionField &in,
+                                 FermionField &out,
+                                 int dag);
+
     // Constructors
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // Grid internal interface -- Thin link and fat link, with coefficients
+    ////////////////////////////////////////////////////////////////////////////////////////////////
     ImprovedStaggeredFermion5D(GaugeField &_Uthin,
                                GaugeField &_Ufat,
                                GridCartesian &FiveDimGrid,
@@ -127,17 +149,37 @@ namespace QCD {
                                GridCartesian &FourDimGrid,
                                GridRedBlackCartesian &FourDimRedBlackGrid,
                                double _mass,
-                               RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
+                               RealD _c1, RealD _c2,RealD _u0,
                                const ImplParams &p= ImplParams());
 
-    // DoubleStore
-    void ImportGauge(const GaugeField &_U);
-    void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
+    ////////////////////////////////////////////////////////////////////////////////////////////////
+    ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
+                               GridRedBlackCartesian &FiveDimRedBlackGrid,
+                               GridCartesian &FourDimGrid,
+                               GridRedBlackCartesian &FourDimRedBlackGrid,
+                               double _mass,
+                               RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
+                               const ImplParams &p= ImplParams());
+
+    // DoubleStore gauge field in operator
+    void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
+    void ImportGauge      (const GaugeField &_Uthin ,const GaugeField &_Ufat);
+    void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
+    void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
+
+    // Give a reference; can be used to do an assignment or copy back out after import
+    // if Carleton wants to cache them and not use the ImportSimple
+    DoubledGaugeField &GetU(void)   { return Umu ; } ;
+    DoubledGaugeField &GetUUU(void) { return UUUmu; };
+    void CopyGaugeCheckerboards(void);
 
     ///////////////////////////////////////////////////////////////
     // Data members require to support the functionality
     ///////////////////////////////////////////////////////////////
 public:
+    virtual int   isTrivialEE(void) { return 1; };
+    virtual RealD Mass(void) { return mass; }
 
     GridBase *_FourDimGrid;
     GridBase *_FourDimRedBlackGrid;

View File

@ -32,223 +32,241 @@ namespace Grid {
namespace QCD { namespace QCD {
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else { \
chi_p = &buf[SE->_offset]; \
} \
multLink(Uchi, U._odata[sU], *chi_p, Dir);
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
template <class Impl> template <class Impl>
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){}; StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
//////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Generic implementation; move to different file? // Generic implementation; move to different file?
//////////////////////////////////////////// // Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
SiteSpinor *buf, int sF, DoubledGaugeField &U, DoubledGaugeField &UUU,
int sU, const FermionField &in, SiteSpinor &out,int threeLink) { SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out, int dag) {
const SiteSpinor *chi_p; const SiteSpinor *chi_p;
SiteSpinor chi; SiteSpinor chi;
SiteSpinor Uchi; SiteSpinor Uchi;
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
int skew = 0; int skew;
if (threeLink) skew=8;
///////////////////////////
// Xp
///////////////////////////
SE = st.GetEntry(ptype, Xp+skew, sF); for(int s=0;s<LLs;s++){
if (SE->_is_local) { int sF=LLs*sU+s;
if (SE->_permute) { skew = 0;
chi_p = &chi; GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
permute(chi, in._odata[SE->_offset], ptype); GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
} else { GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
chi_p = &in._odata[SE->_offset]; GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
} GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
} else { GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
chi_p = &buf[SE->_offset]; GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
} }
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
///////////////////////////
// Yp
///////////////////////////
SE = st.GetEntry(ptype, Yp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
///////////////////////////
// Zp
///////////////////////////
SE = st.GetEntry(ptype, Zp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
///////////////////////////
// Tp
///////////////////////////
SE = st.GetEntry(ptype, Tp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
///////////////////////////
// Xm
///////////////////////////
SE = st.GetEntry(ptype, Xm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
///////////////////////////
// Ym
///////////////////////////
SE = st.GetEntry(ptype, Ym+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
///////////////////////////
// Zm
///////////////////////////
SE = st.GetEntry(ptype, Zm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
///////////////////////////
// Tm
///////////////////////////
SE = st.GetEntry(ptype, Tm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
vstream(out, Uchi);
}; };
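The dag branch above amounts to a global sign flip: both the one-link and the three-link (Naik) hopping terms are anti-Hermitian, so Dhop^dagger = -Dhop and the daggered operator needs no second stencil pass, only the negation of Uchi before the vstream.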
///////////////////////////////////////////////////
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
}
};
///////////////////////////////////////////////////
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int nmu=0;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
if ( nmu ) {
if ( dag ) {
out._odata[sF] = out._odata[sF] - Uchi;
} else {
out._odata[sF] = out._odata[sF] + Uchi;
}
}
}
};
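The interior and exterior kernels obey a strict contract: the Int pass overwrites each site with vstream, so every site must receive at least one interior contribution (hence the "no 1^4 subvols" assumption in the hand-unrolled variant below), and the Ext pass then accumulates into out only where nmu counted an off-node leg. A toy model of that two-pass update (array sizes and values arbitrary):

#include <vector>

int main() {
  std::vector<double> out(8, -1.0), interior(8, 2.0), exterior(8, 0.5);
  std::vector<int>    nmu = {0,0,0,1,0,0,0,1};  // off-node legs per site
  for (int s = 0; s < 8; s++)                   // Int pass: vstream overwrites
    out[s] = interior[s];
  for (int s = 0; s < 8; s++)                   // Ext pass: add halo terms only
    if (nmu[s]) out[s] += exterior[s];
  return 0;
}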
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out) { const FermionField &in, FermionField &out,
SiteSpinor naik; int interior,int exterior)
SiteSpinor naive; {
int oneLink =0;
int threeLink=1;
int dag=1; int dag=1;
switch(Opt) { DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
#ifdef AVX512 };
//FIXME; move the sign into the Asm routine
case OptInlineAsm: template <class Impl>
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
for(int s=0;s<LLs;s++) { SiteSpinor *buf, int LLs, int sU,
int sF=s+LLs*sU; const FermionField &in, FermionField &out,
out._odata[sF]=-out._odata[sF]; int interior,int exterior)
} {
break; int dag=0;
#endif DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
case OptHandUnroll:
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
break;
case OptGeneric:
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =-naive-naik;
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
}; };
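Routing both DhopSiteDag (dag=1) and DhopSite (dag=0) through the single dag-parameterised overload below keeps the Opt dispatch in one switch; the old code carried two copies that had already started to drift, with the dagger path negating the output by hand after the fact.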
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out) int sU, const FermionField &in, FermionField &out,
int dag,int interior,int exterior)
{ {
int oneLink =0;
int threeLink=1;
SiteSpinor naik;
SiteSpinor naive;
int dag=0;
switch(Opt) { switch(Opt) {
#ifdef AVX512 #ifdef AVX512
case OptInlineAsm: case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break; break;
#endif #endif
case OptHandUnroll: case OptHandUnroll:
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break; break;
case OptGeneric: case OptGeneric:
for(int s=0;s<LLs;s++){ if ( interior && exterior ) {
int sF=LLs*sU+s; DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
// assert(sF<in._odata.size()); } else if ( interior ) {
// assert(sU< U._odata.size()); DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
// assert(sF>=0); assert(sU>=0); } else if ( exterior ) {
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =naive+naik;
} }
break; break;
default: default:
@ -38,8 +38,9 @@ namespace QCD {
class StaggeredKernelsStatic { class StaggeredKernelsStatic {
public: public:
enum { OptGeneric, OptHandUnroll, OptInlineAsm }; enum { OptGeneric, OptHandUnroll, OptInlineAsm };
// S-direction is INNERMOST and takes no part in the parity. enum { CommsAndCompute, CommsThenCompute };
static int Opt; // these are a temporary hack static int Opt;
static int Comms;
}; };
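A sketch of how a driver configures these statics; the Comms assignment mirrors what Grid_init does for the --comms-overlap flag further below, and the Opt value is whichever kernel variant the platform supports:

  QCD::StaggeredKernelsStatic::Opt   = QCD::StaggeredKernelsStatic::OptHandUnroll;
  QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;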
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic { template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
@ -53,24 +54,62 @@ public:
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp); int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, ///////////////////////////////////////////////////////////////////////////////////////
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); // Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, ///////////////////////////////////////////////////////////////////////////////////////
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink); // Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf, void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
int LLs, int sU, const FermionField &in, FermionField &out, int dag); DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, void DhopSite(StencilImpl &st, LebesgueOrder &lo,
int LLs, int sU, const FermionField &in, FermionField &out); DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, const FermionField &in, FermionField &out, int dag, int interior,int exterior);
int sF, int sU, const FermionField &in, FermionField &out);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
int LLs, int sU, const FermionField &in, FermionField &out);
public: public:
@ -560,16 +560,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSTORE(2,%0,pUChi_02) \ VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" ); : : "r" (out) : "memory" );
#define nREDUCE(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) \
VADD(UChi_30,UChi_20,UChi_30) \
VADD(UChi_31,UChi_21,UChi_31) \
VADD(UChi_32,UChi_22,UChi_32) \
VADD(UChi_00,UChi_30,UChi_00) \
VADD(UChi_01,UChi_31,UChi_01) \
VADD(UChi_02,UChi_32,UChi_02) ); \
asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define REDUCEa(out) \ #define REDUCEa(out) \
asm ( \ asm ( \
VADD(UChi_00,UChi_10,UChi_00) \ VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \ VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) ); \ VADD(UChi_02,UChi_12,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
// FIXME is sign right in the VSUB ?
#define nREDUCEa(out) \
asm ( \ asm ( \
VSTORE(0,%0,pUChi_00) \ VADD(UChi_00,UChi_10,UChi_00) \
VSTORE(1,%0,pUChi_01) \ VADD(UChi_01,UChi_11,UChi_01) \
VSTORE(2,%0,pUChi_02) \ VADD(UChi_02,UChi_12,UChi_02) ); \
: : "r" (out) : "memory" ); asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
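A scalar model of the negated reduction; the FIXME above questions the VSUB operand order, and the intent only comes out right if VSUB(a,b,dst) computes dst = b - a (an assumption here), so that zeroing Chi_00 first turns the subtraction into a negation:

#include <cassert>

int main() {
  double UChi = 5.0;
  double Chi  = 0.0;     // VZERO(Chi_00)
  UChi = Chi - UChi;     // VSUB(UChi_00,Chi_00,UChi_00) under that assumption
  assert(UChi == -5.0);  // the daggered kernel wants the negated sum
  return 0;
}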
#define PERMUTE_DIR(dir) \ #define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\ permute##dir(Chi_0,Chi_0);\
@ -581,10 +618,9 @@ namespace QCD {
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U, DoubledGaugeField &UUU,
DoubledGaugeField &UUU, SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int LLs, const FermionField &in, FermionField &out,int dag)
int sU, const FermionField &in, FermionField &out)
{ {
assert(0); assert(0);
}; };
@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <simd/Intel512single.h> #include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U, DoubledGaugeField &UUU,
DoubledGaugeField &UUU, SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int LLs, const FermionField &in, FermionField &out,int dag)
int sU, const FermionField &in, FermionField &out)
{ {
#ifdef AVX512 #ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t gauge0,gauge1,gauge2,gauge3;
@ -685,7 +720,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF]; addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0); if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
} }
#else #else
assert(0); assert(0);
@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
#include <simd/Intel512double.h> #include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U, DoubledGaugeField &UUU,
DoubledGaugeField &UUU, SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int LLs, const FermionField &in, FermionField &out,int dag)
int sU, const FermionField &in, FermionField &out)
{ {
#ifdef AVX512 #ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t gauge0,gauge1,gauge2,gauge3;
@ -734,7 +772,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF]; addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0); if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
} }
#else #else
assert(0); assert(0);
@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
#include <simd/Intel512single.h> #include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U, DoubledGaugeField &UUU,
DoubledGaugeField &UUU, SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int LLs, const FermionField &in, FermionField &out,int dag)
int sU, const FermionField &in, FermionField &out)
{ {
#ifdef AVX512 #ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t gauge0,gauge1,gauge2,gauge3;
@ -832,7 +873,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
MULT_ADD_XYZT(gauge2,gauge3); MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF]; addr0 = (uint64_t) &out._odata[sF];
REDUCEa(addr0); if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
} }
#else #else
assert(0); assert(0);
@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
#include <simd/Intel512double.h> #include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U, DoubledGaugeField &UUU,
DoubledGaugeField &UUU, SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int LLs, const FermionField &in, FermionField &out,int dag)
int sU, const FermionField &in, FermionField &out)
{ {
#ifdef AVX512 #ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3; uint64_t gauge0,gauge1,gauge2,gauge3;
@ -897,7 +941,11 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
MULT_ADD_XYZT(gauge2,gauge3); MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF]; addr0 = (uint64_t) &out._odata[sF];
REDUCEa(addr0); if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
} }
#else #else
assert(0); assert(0);
@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
DoubledGaugeField &U, \ DoubledGaugeField &U, \
DoubledGaugeField &UUU, \ DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, \ SiteSpinor *buf, int LLs, \
int sU, const FermionField &in, FermionField &out); int sU, const FermionField &in, FermionField &out,int dag);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF); KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid.h>
#define REGISTER
#define LOAD_CHI(b) \ #define LOAD_CHI(b) \
const SiteSpinor & ref (b[offset]); \ const SiteSpinor & ref (b[offset]); \
@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
UChi ## _1 += U_12*Chi_2;\ UChi ## _1 += U_12*Chi_2;\
UChi ## _2 += U_22*Chi_2; UChi ## _2 += U_22*Chi_2;
#define MULT_ADD(A,UChi) \ #define MULT_ADD(U,A,UChi) \
auto & ref(U._odata[sU](A)); \ auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \ Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \ Impl::loadLinkElement(U_10,ref()(1,0)); \
@ -82,241 +81,319 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define PERMUTE_DIR(dir) \ #define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\ permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1);\ permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2); permute##dir(Chi_2,Chi_2);
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else { \
LOAD_CHI(buf); \
}
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT(Dir,even); \
}
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT_ADD(U,Dir,even); \
}
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else if ( st.same_node[Dir] ) { \
LOAD_CHI(buf); \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
MULT_ADD(U,Dir,even); \
}
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
{ LOAD_CHI(buf); } \
{ MULT_ADD(U,Dir,even); } \
}
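The three leg macros differ only in when they compute. A scalar summary of the gating, where same_node means the neighbour's data is reachable through shared memory and is_local that it sits on this rank:

enum LegKind { Full, InteriorOnly, ExteriorOnly };

bool leg_computes(LegKind k, bool is_local, bool same_node) {
  switch (k) {
  case Full:         return true;                    // HAND_STENCIL_LEG
  case InteriorOnly: return is_local || same_node;   // HAND_STENCIL_LEG_INT
  case ExteriorOnly: return !is_local && !same_node; // HAND_STENCIL_LEG_EXT
  }
  return false;
}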
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
SiteSpinor *buf, int LLs, DoubledGaugeField &U,DoubledGaugeField &UUU,
int sU, const FermionField &in, FermionField &out, int dag) SiteSpinor *buf, int LLs, int sU,
{ const FermionField &in, FermionField &out,int dag)
SiteSpinor naik;
SiteSpinor naive;
int oneLink =0;
int threeLink=1;
int skew(0);
Real scale(1.0);
if(dag) scale = -1.0;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =scale*(naive+naik);
}
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteSpinor *buf, int sF,
int sU, const FermionField &in, SiteSpinor &out,int threeLink)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
REGISTER Simd even_0; // 12 regs on knc Simd even_0; // 12 regs on knc
REGISTER Simd even_1; Simd even_1;
REGISTER Simd even_2; Simd even_2;
REGISTER Simd odd_0; // 12 regs on knc Simd odd_0; // 12 regs on knc
REGISTER Simd odd_1; Simd odd_1;
REGISTER Simd odd_2; Simd odd_2;
REGISTER Simd Chi_0; // two spinor; 6 regs Simd Chi_0; // two spinor; 6 regs
REGISTER Simd Chi_1; Simd Chi_1;
REGISTER Simd Chi_2; Simd Chi_2;
REGISTER Simd U_00; // two rows of U matrix Simd U_00; // two rows of U matrix
REGISTER Simd U_10; Simd U_10;
REGISTER Simd U_20; Simd U_20;
REGISTER Simd U_01; Simd U_01;
REGISTER Simd U_11; Simd U_11;
REGISTER Simd U_21; // 2 reg left. Simd U_21; // 2 reg left.
REGISTER Simd U_02; Simd U_02;
REGISTER Simd U_12; Simd U_12;
REGISTER Simd U_22; Simd U_22;
int skew = 0;
if (threeLink) skew=8;
SiteSpinor result;
int offset,local,perm, ptype; int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
int skew;
// Xp for(int s=0;s<LLs;s++){
SE=st.GetEntry(ptype,Xp+skew,sF); int sF=s+LLs*sU;
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT(Xp,even);
}
// Yp
SE=st.GetEntry(ptype,Yp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT(Yp,odd);
}
skew = 0;
// Zp HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
SE=st.GetEntry(ptype,Zp+skew,sF); HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
offset = SE->_offset; HAND_STENCIL_LEG (U,Zp,1,skew,even);
local = SE->_is_local; HAND_STENCIL_LEG (U,Tp,0,skew,odd);
perm = SE->_permute; HAND_STENCIL_LEG (U,Xm,3,skew,even);
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
if ( local ) { HAND_STENCIL_LEG (U,Zm,1,skew,even);
LOAD_CHI(in._odata); HAND_STENCIL_LEG (U,Tm,0,skew,odd);
if ( perm) { skew = 8;
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc... HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
} }
} else { vstream(out._odata[sF],result);
LOAD_CHI(buf);
} }
{
MULT_ADD(Zp,even);
}
// Tp
SE=st.GetEntry(ptype,Tp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Tp,odd);
}
// Xm
SE=st.GetEntry(ptype,Xm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Xm,even);
}
// Ym
SE=st.GetEntry(ptype,Ym+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Ym,odd);
}
// Zm
SE=st.GetEntry(ptype,Zm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Zm,even);
}
// Tm
SE=st.GetEntry(ptype,Tm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Tm,odd);
}
vstream(out()()(0),even_0+odd_0);
vstream(out()()(1),even_1+odd_1);
vstream(out()()(2),even_2+odd_2);
} }
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
skew = 0;
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
vstream(out._odata[sF],result);
}
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
int nmu=0;
skew = 0;
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
// Add sum of all exterior connected stencil legs
if ( nmu ) {
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
out._odata[sF] = out._odata[sF] + result;
}
}
}
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \ DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, \ SiteSpinor *buf, int LLs, int sU, \
int sU, const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
SiteSpinor *buf, int sF, \
int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF); DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF); DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
}} }
}
@ -274,41 +274,16 @@ public:
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl; if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
} }
std::vector<int> same_node;
std::vector<int> surface_list;
WilsonStencil(GridBase *grid, WilsonStencil(GridBase *grid,
int npoints, int npoints,
int checkerboard, int checkerboard,
const std::vector<int> &directions, const std::vector<int> &directions,
const std::vector<int> &distances) const std::vector<int> &distances)
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
same_node(npoints)
{ {
ZeroCountersi(); ZeroCountersi();
surface_list.resize(0);
}; };
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
template < class compressor> template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
@ -369,23 +344,23 @@ public:
int dag = compress.dag; int dag = compress.dag;
int face_idx=0; int face_idx=0;
if ( dag ) { if ( dag ) {
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else { } else {
assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
} }
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);
@ -349,15 +349,98 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
} }
}; }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) { FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes)); #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
#endif
DhopInternalSerial(st,lo,U,in,out,dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
#ifdef GRID_OMP
Compressor compressor;
int len = U._grid->oSites();
const int LLs = 1;
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
if (dag == DaggerYes) {
for (int sss = myblock; sss < myblock+myn; ++sss) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
for (int sss = myblock; sss < myblock+myn; ++sss) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} //else
} else {
st.CommunicateThreaded();
}
Compressor compressor(dag);
if (dag == DaggerYes) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
}
} //pragma
#else
assert(0);
#endif
};
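The block/remainder arithmetic above is a standard even split of the site loop over the compute threads left after ncomms threads peel off to drive communications. A standalone model (thread counts and length arbitrary) showing each compute thread gets a contiguous, disjoint range:

#include <cstdio>

int main() {
  int nthreads = 8, ncomms = 2, len = 103;       // len plays the role of oSites()
  for (int tid = ncomms; tid < nthreads; tid++) {
    int nwork = nthreads - ncomms;
    int ttid  = tid - ncomms;
    int chunk = len / nwork, rem = len % nwork;
    int myblock = (ttid < rem) ? ttid * (chunk + 1) : ttid * chunk + rem;
    int myn     = (ttid < rem) ? chunk + 1          : chunk;
    std::printf("tid %d computes sites [%d,%d)\n", tid, myblock, myblock + myn);
  }
  return 0;
}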
template <class Impl>
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);
@ -371,6 +454,7 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
} }
} }
}; };
/******************************************************************************* /*******************************************************************************
* Conserved current utilities for Wilson fermions, for contracting propagators * Conserved current utilities for Wilson fermions, for contracting propagators
@ -130,6 +130,12 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
// Constructor // Constructor
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass, GridRedBlackCartesian &Hgrid, RealD _mass,
@ -145,6 +151,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
// protected: // protected:
public: public:
virtual RealD Mass(void) { return mass; }
virtual int isTrivialEE(void) { return 1; };
RealD mass; RealD mass;
RealD diag_mass; RealD diag_mass;
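Exposing Mass() and isTrivialEE() gives Schur-style wrappers, such as the SchurStaggeredOperator used in the CG test below, a generic way to ask whether the even-even block is a constant they can fold into a single fused axpby instead of applying Mooee explicitly; isTrivialEE()==1 advertises that the shortcut is safe.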
@ -446,8 +446,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
} }
} }
ptime = usecond() - start; ptime = usecond() - start;
} } else {
{
double start = usecond(); double start = usecond();
st.CommunicateThreaded(); st.CommunicateThreaded();
ctime = usecond() - start; ctime = usecond() - start;
@ -53,7 +53,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
typedef FermionOperator<Impl> Base; typedef FermionOperator<Impl> Base;
public: public:
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
@ -70,27 +70,27 @@ public:
break; break;
#endif #endif
case OptHandUnroll: case OptHandUnroll:
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out); if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out); else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out); else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
sF++; sF++;
} }
sU++; sU++;
} }
break; break;
case OptGeneric: case OptGeneric:
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out); if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out); else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out); else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
else assert(0); else assert(0);
sF++; sF++;
} }
sU++; sU++;
} }
break; break;
default: default:
assert(0); assert(0);
@ -232,6 +232,7 @@ private:
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out); int sF, int sU, const FermionField &in, FermionField &out);
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
@ -150,7 +150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
std::vector<int> _distances; std::vector<int> _distances;
std::vector<int> _comm_buf_size; std::vector<int> _comm_buf_size;
std::vector<int> _permute_type; std::vector<int> _permute_type;
std::vector<int> same_node;
std::vector<int> surface_list;
Vector<StencilEntry> _entries; Vector<StencilEntry> _entries;
std::vector<Packet> Packets; std::vector<Packet> Packets;
std::vector<Merge> Mergers; std::vector<Merge> Mergers;
@ -201,7 +203,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
int dimension = _directions[point]; int dimension = _directions[point];
int displacement = _distances[point]; int displacement = _distances[point];
assert( (displacement==1) || (displacement==-1));
int pd = _grid->_processors[dimension]; int pd = _grid->_processors[dimension];
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
@ -216,9 +218,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
if ( ! comm_dim ) return 1; if ( ! comm_dim ) return 1;
int nbr_proc; int nbr_proc;
if (displacement==1) nbr_proc = 1; if (displacement>0) nbr_proc = 1;
else nbr_proc = pd-1; else nbr_proc = pd-1;
// FIXME this logic needs to be sorted for three link term
// assert( (displacement==1) || (displacement==-1));
// Present hack only works for >= 4^4 subvol per node
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
} }
}; };
// Move interior/exterior split into the generic stencil
// FIXME Explicit Ls in interface is a pain. Should just use a vol
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
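BuildSurfaceList, hoisted here from WilsonStencil so that staggered stencils can share it, records the 4d sites owning at least one leg that is neither node-local nor shared-memory reachable. A hypothetical consumer (object name and loop body illustrative):

  // Given a constructed stencil:
  //   myStencil.BuildSurfaceList(Ls, vol4);        // vol4 = 4d oSites
  //   for (int site4d : myStencil.surface_list) {  // only sites touching the halo
  //     ...exterior-only work for site4d...
  //   }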
CartesianStencil(GridBase *grid, CartesianStencil(GridBase *grid,
int npoints, int npoints,
int checkerboard, int checkerboard,
@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
comm_bytes_thr(npoints), comm_bytes_thr(npoints),
comm_enter_thr(npoints), comm_enter_thr(npoints),
comm_leave_thr(npoints), comm_leave_thr(npoints),
comm_time_thr(npoints) comm_time_thr(npoints),
same_node(npoints)
{ {
face_table_computed=0; face_table_computed=0;
_npoints = npoints; _npoints = npoints;
@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
_directions = directions; _directions = directions;
_distances = distances; _distances = distances;
_unified_buffer_size=0; _unified_buffer_size=0;
surface_list.resize(0);
int osites = _grid->oSites(); int osites = _grid->oSites();
@ -368,8 +368,10 @@ void Grid_init(int *argc,char ***argv)
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute; QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;
} else { } else {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
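With the two added assignments the existing --comms-overlap flag now drives the staggered kernels as well as the Wilson ones, so a run such as ./Benchmark_staggered --comms-overlap (binary name illustrative) selects CommsAndCompute for both fermion types without any source change.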
@ -141,6 +141,7 @@ int main (int argc, char ** argv)
t1=usecond(); t1=usecond();
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl; std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -160,7 +161,8 @@ int main (int argc, char ** argv)
localConvert(sresult,tmp); localConvert(sresult,tmp);
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl; std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl; std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -181,6 +183,7 @@ int main (int argc, char ** argv)
localConvert(sresult,tmp); localConvert(sresult,tmp);
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl; std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_wilson.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG4(UGrid);
GridParallelRNG pRNG5(FGrid);
pRNG4.SeedFixedIntegers(seeds);
pRNG5.SeedFixedIntegers(seeds);
typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField;
typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField;
typename ImprovedStaggeredFermion5DF::ImplParams params;
FermionField src (FGrid);
random(pRNG5,src);
/*
std::vector<int> site({0,1,2,0,0});
ColourVector cv = zero;
cv()()(0)=1.0;
src = zero;
pokeSite(cv,src,site);
*/
FermionField result(FGrid); result=zero;
FermionField tmp(FGrid); tmp=zero;
FermionField err(FGrid); err=zero;
FermionField phi (FGrid); random(pRNG5,phi);
FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeFieldF Umu(UGrid);
SU3::HotConfiguration(pRNG4,Umu);
/*
for(int mu=1;mu<4;mu++){
auto tmp = PeekIndex<LorentzIndex>(Umu,mu);
tmp = zero;
PokeIndex<LorentzIndex>(Umu,tmp,mu);
}
*/
double volume=Ls;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermion5DF Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params);
ImprovedStaggeredFermionVec5dF sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params);
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
int ncall=1000;
int ncall1=1000;
double t0(0),t1(0);
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // per site: 16 SU(3) mat-vecs at 3*(6+8+8)=66 flops, plus 15 colour-vector adds at 3*2=6 flops: 66*16 + 90 == 1146
std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
t0=usecond();
for(int i=0;i<ncall1;i++){
Ds.Dhop(src,result,0);
}
t1=usecond();
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
#ifdef AVX512
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
#else
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
#endif
t0=usecond();
for(int i=0;i<ncall1;i++){
Ds.Dhop(src,tmp,0);
}
t1=usecond();
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = tmp-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
FermionField ssrc (sFGrid); localConvert(src,ssrc);
FermionField sresult(sFGrid); sresult=zero;
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
t0=usecond();
for(int i=0;i<ncall1;i++){
sDs.Dhop(ssrc,sresult,0);
}
t1=usecond();
localConvert(sresult,tmp);
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
#ifdef AVX512
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
#else
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
#endif
err = tmp-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
int extra=1;
t0=usecond();
for(int i=0;i<ncall1*extra;i++){
sDs.Dhop(ssrc,sresult,0);
}
t1=usecond();
localConvert(sresult,tmp);
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
err = tmp-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Grid_finalize();
}

View File

@ -74,8 +74,16 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
RealD mass=0.003;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
@ -87,14 +95,26 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "****************************************************************** "<<std::endl; std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl; std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl; std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass); ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d); SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
FermionField src4d(UGrid); random(pRNG,src4d); FermionField src4d(UGrid); random(pRNG,src4d);
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d); FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
FermionField result4d_o(UrbGrid); FermionField result4d_o(UrbGrid);
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + 90 == 1146 flops per site
result4d_o=zero;
{
double t1=usecond();
CG(HermOp4d,src4d_o,result4d_o);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp4d.Report();
}
Ds4d.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -103,7 +123,17 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
{
double t1=usecond();
CG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=CG.IterationsToComplete*Ls;
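// The 5d staggered operator carries Ls independent right-hand sides,
// so each CG iteration performs Ls Deo+Doe applications.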
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -112,7 +142,18 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
{
double t1=usecond();
mCG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=mCG.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -121,7 +162,17 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
{
double t1=usecond();
BCGrQ(HermOp,src_o,result_o);
double t2=usecond();
double ncall=BCGrQ.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

View File

@ -74,7 +74,16 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
RealD mass=0.003;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
@ -86,11 +95,23 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
FermionField src4d(UGrid); random(pRNG,src4d);
FermionField result4d(UGrid); result4d=zero;
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + 90 == 1146 flops per site
{
double t1=usecond();
CG(HermOp4d,src4d,result4d);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -98,9 +119,18 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl; std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero; result=zero;
{
Ds.ZeroCounters(); Ds.ZeroCounters();
double t1=usecond();
CG(HermOp,src,result); CG(HermOp,src,result);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
Ds.Report();
}
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -108,7 +138,16 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
Ds.ZeroCounters();
{
double t1=usecond();
mCG(HermOp,src,result);
double t2=usecond();
double ncall=mCG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -117,7 +156,16 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
Ds.ZeroCounters();
{
double t1=usecond();
BCGrQ(HermOp,src,result);
double t2=usecond();
double ncall=BCGrQ.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

View File

@ -71,7 +71,10 @@ int main (int argc, char ** argv)
}
RealD mass=0.003;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
FermionField res_o(&RBGrid);
FermionField src_o(&RBGrid);
@ -80,7 +83,19 @@ int main (int argc, char ** argv)
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
double t1=usecond();
CG(HermOpEO,src_o,res_o);
double t2=usecond();
// Schur solver: uses DeoDoe => volume * 1146
double ncall=CG.IterationsToComplete;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146 flops per site
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
FermionField tmp(&RBGrid);

View File

@ -65,7 +65,10 @@ int main (int argc, char ** argv)
FermionField resid(&Grid);
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);

View File

@ -73,7 +73,10 @@ int main (int argc, char ** argv)
}
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-6,10000);

View File

@ -0,0 +1,121 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_wilson_cg_unprec.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
typename ImprovedStaggeredFermionR::ImplParams params;
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
////////////////////////////////////////
// sqrt
////////////////////////////////////////
double lo=0.001;
double hi=1.0;
int precision=64;
int degree=10;
AlgRemez remez(lo,hi,precision);
remez.generateApprox(degree,1,2);
MultiShiftFunction Sqrt(remez,1.0e-6,false);
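// remez.generateApprox(degree,1,2) builds a degree-10 rational
// approximation to x^(1/2) on [lo,hi]; MultiShiftFunction stores its
// partial-fraction poles and residues, and the multishift CG below
// solves all the shifted systems within a single Krylov sequence.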
std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
////////////////////////////////////////////
// Setup staggered
////////////////////////////////////////////
RealD mass=0.003;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
FermionField src(&Grid); random(pRNG,src);
FermionField src_o(&RBGrid);
pickCheckerboard(Odd,src_o,src);
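// Solve on the odd checkerboard only: SchurStaggeredOperator applies
// the even-odd preconditioned operator to a single-parity field.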
/////////////////////////////////
//Multishift CG
/////////////////////////////////
std::vector<FermionField> result(degree,&RBGrid);
ConjugateGradientMultiShift<FermionField> MSCG(10000,Sqrt);
double deodoe_flops=(1205+15*degree)*volume; // approximate flops per site per iteration
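// Rough estimate (an assumption, not a measured count): each multishift
// CG iteration applies HermOpEO once and updates the shifted solution
// and search vectors for every pole, so the cost grows with degree.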
double t1=usecond();
MSCG(HermOpEO,src_o,result);
double t2=usecond();
double ncall=MSCG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
// HermOpEO.Report();
Grid_finalize();
}