mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'feature/staggered-comms-compute' into develop
This commit is contained in:
commit
013ea4e8d1
@ -51,7 +51,7 @@ namespace Grid {
|
||||
|
||||
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
|
||||
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
|
||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
|
||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
|
||||
virtual void HermOp(const Field &in, Field &out)=0;
|
||||
};
|
||||
|
||||
@ -309,36 +309,59 @@ namespace Grid {
|
||||
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
|
||||
protected:
|
||||
Matrix &_Mat;
|
||||
Field tmp;
|
||||
RealD mass;
|
||||
double tMpc;
|
||||
double tIP;
|
||||
double tMeo;
|
||||
double taxpby_norm;
|
||||
uint64_t ncall;
|
||||
public:
|
||||
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
|
||||
void Report(void)
|
||||
{
|
||||
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
|
||||
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
|
||||
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
|
||||
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
|
||||
}
|
||||
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
|
||||
{
|
||||
assert( _Mat.isTrivialEE() );
|
||||
mass = _Mat.Mass();
|
||||
tMpc=0;
|
||||
tIP =0;
|
||||
tMeo=0;
|
||||
taxpby_norm=0;
|
||||
ncall=0;
|
||||
}
|
||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||
GridLogIterative.TimingMode(1);
|
||||
std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
|
||||
ncall++;
|
||||
tMpc-=usecond();
|
||||
n2 = Mpc(in,out);
|
||||
std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
|
||||
tMpc+=usecond();
|
||||
tIP-=usecond();
|
||||
ComplexD dot= innerProduct(in,out);
|
||||
std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
|
||||
tIP+=usecond();
|
||||
n1 = real(dot);
|
||||
}
|
||||
virtual void HermOp(const Field &in, Field &out){
|
||||
std::cout << GridLogIterative << " HermOp "<<std::endl;
|
||||
Mpc(in,out);
|
||||
ncall++;
|
||||
tMpc-=usecond();
|
||||
_Mat.Meooe(in,out);
|
||||
_Mat.Meooe(out,tmp);
|
||||
tMpc+=usecond();
|
||||
taxpby_norm-=usecond();
|
||||
axpby(out,-1.0,mass*mass,tmp,in);
|
||||
taxpby_norm+=usecond();
|
||||
}
|
||||
virtual RealD Mpc (const Field &in, Field &out) {
|
||||
Field tmp(in._grid);
|
||||
Field tmp2(in._grid);
|
||||
|
||||
std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
|
||||
_Mat.Mooee(in,out);
|
||||
_Mat.Mooee(out,tmp);
|
||||
std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
|
||||
|
||||
tMeo-=usecond();
|
||||
_Mat.Meooe(in,out);
|
||||
_Mat.Meooe(out,tmp2);
|
||||
std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
|
||||
|
||||
RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
|
||||
std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
|
||||
_Mat.Meooe(out,tmp);
|
||||
tMeo+=usecond();
|
||||
taxpby_norm-=usecond();
|
||||
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
|
||||
taxpby_norm+=usecond();
|
||||
return nn;
|
||||
}
|
||||
virtual RealD MpcDag (const Field &in, Field &out){
|
||||
|
@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
||||
|
||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
||||
|
||||
|
||||
psi.checkerboard = src.checkerboard;
|
||||
conformable(psi, src);
|
||||
|
||||
@ -70,7 +71,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
||||
|
||||
Linop.HermOpAndNorm(psi, mmp, d, b);
|
||||
|
||||
|
||||
r = src - mmp;
|
||||
p = r;
|
||||
|
||||
@ -96,38 +96,44 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
||||
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
||||
|
||||
GridStopWatch LinalgTimer;
|
||||
GridStopWatch InnerTimer;
|
||||
GridStopWatch AxpyNormTimer;
|
||||
GridStopWatch LinearCombTimer;
|
||||
GridStopWatch MatrixTimer;
|
||||
GridStopWatch SolverTimer;
|
||||
|
||||
SolverTimer.Start();
|
||||
int k;
|
||||
for (k = 1; k <= MaxIterations; k++) {
|
||||
for (k = 1; k <= MaxIterations*1000; k++) {
|
||||
c = cp;
|
||||
|
||||
MatrixTimer.Start();
|
||||
Linop.HermOpAndNorm(p, mmp, d, qq);
|
||||
Linop.HermOp(p, mmp);
|
||||
MatrixTimer.Stop();
|
||||
|
||||
LinalgTimer.Start();
|
||||
// RealD qqck = norm2(mmp);
|
||||
// ComplexD dck = innerProduct(p,mmp);
|
||||
|
||||
InnerTimer.Start();
|
||||
ComplexD dc = innerProduct(p,mmp);
|
||||
InnerTimer.Stop();
|
||||
d = dc.real();
|
||||
a = c / d;
|
||||
b_pred = a * (a * qq - d) / c;
|
||||
|
||||
AxpyNormTimer.Start();
|
||||
cp = axpy_norm(r, -a, mmp, r);
|
||||
AxpyNormTimer.Stop();
|
||||
b = cp / c;
|
||||
|
||||
// Fuse these loops ; should be really easy
|
||||
psi = a * p + psi;
|
||||
p = p * b + r;
|
||||
|
||||
LinearCombTimer.Start();
|
||||
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
|
||||
vstream(psi[ss], a * p[ss] + psi[ss]);
|
||||
vstream(p [ss], b * p[ss] + r[ss]);
|
||||
}
|
||||
LinearCombTimer.Stop();
|
||||
LinalgTimer.Stop();
|
||||
|
||||
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
||||
<< " residual " << cp << " target " << rsq << std::endl;
|
||||
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
|
||||
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
|
||||
|
||||
// Stopping condition
|
||||
if (cp <= rsq) {
|
||||
@ -148,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
||||
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
|
||||
|
||||
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
||||
|
||||
|
@ -43,6 +43,7 @@ namespace Grid {
|
||||
public:
|
||||
RealD Tolerance;
|
||||
Integer MaxIterations;
|
||||
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
||||
int verbose;
|
||||
MultiShiftFunction shifts;
|
||||
|
||||
@ -164,6 +165,15 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
||||
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
|
||||
}
|
||||
|
||||
///////////////////////////////////////
|
||||
// Timers
|
||||
///////////////////////////////////////
|
||||
GridStopWatch AXPYTimer;
|
||||
GridStopWatch ShiftTimer;
|
||||
GridStopWatch QRTimer;
|
||||
GridStopWatch MatrixTimer;
|
||||
GridStopWatch SolverTimer;
|
||||
SolverTimer.Start();
|
||||
|
||||
// Iteration loop
|
||||
int k;
|
||||
@ -171,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
||||
for (k=1;k<=MaxIterations;k++){
|
||||
|
||||
a = c /cp;
|
||||
AXPYTimer.Start();
|
||||
axpy(p,a,p,r);
|
||||
AXPYTimer.Stop();
|
||||
|
||||
// Note to self - direction ps is iterated seperately
|
||||
// for each shift. Does not appear to have any scope
|
||||
@ -180,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
||||
// However SAME r is used. Could load "r" and update
|
||||
// ALL ps[s]. 2/3 Bandwidth saving
|
||||
// New Kernel: Load r, vector of coeffs, vector of pointers ps
|
||||
AXPYTimer.Start();
|
||||
for(int s=0;s<nshift;s++){
|
||||
if ( ! converged[s] ) {
|
||||
if (s==0){
|
||||
@ -190,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
||||
}
|
||||
}
|
||||
}
|
||||
AXPYTimer.Stop();
|
||||
|
||||
cp=c;
|
||||
MatrixTimer.Start();
|
||||
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
|
||||
// The below is faster on KNL
|
||||
Linop.HermOp(p,mmp);
|
||||
d=real(innerProduct(p,mmp));
|
||||
|
||||
Linop.HermOpAndNorm(p,mmp,d,qq);
|
||||
MatrixTimer.Stop();
|
||||
|
||||
AXPYTimer.Start();
|
||||
axpy(mmp,mass[0],p,mmp);
|
||||
AXPYTimer.Stop();
|
||||
RealD rn = norm2(p);
|
||||
d += rn*mass[0];
|
||||
|
||||
bp=b;
|
||||
b=-cp/d;
|
||||
|
||||
AXPYTimer.Start();
|
||||
c=axpy_norm(r,b,mmp,r);
|
||||
AXPYTimer.Stop();
|
||||
|
||||
// Toggle the recurrence history
|
||||
bs[0] = b;
|
||||
iz = 1-iz;
|
||||
ShiftTimer.Start();
|
||||
for(int s=1;s<nshift;s++){
|
||||
if((!converged[s])){
|
||||
RealD z0 = z[s][1-iz];
|
||||
@ -215,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
||||
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
|
||||
}
|
||||
}
|
||||
ShiftTimer.Stop();
|
||||
|
||||
for(int s=0;s<nshift;s++){
|
||||
int ss = s;
|
||||
@ -257,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
||||
|
||||
if ( all_converged ){
|
||||
|
||||
SolverTimer.Stop();
|
||||
|
||||
|
||||
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
|
||||
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
|
||||
|
||||
@ -269,8 +298,19 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
||||
RealD cn = norm2(src);
|
||||
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
|
||||
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
|
||||
|
||||
IterationsToComplete = k;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
// ugly hack
|
||||
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
|
||||
|
@ -244,19 +244,11 @@ namespace Grid {
|
||||
|
||||
template<class sobj,class vobj> strong_inline
|
||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||
ret.checkerboard = x.checkerboard;
|
||||
conformable(ret,x);
|
||||
conformable(x,y);
|
||||
axpy(ret,a,x,y);
|
||||
return norm2(ret);
|
||||
return axpy_norm_fast(ret,a,x,y);
|
||||
}
|
||||
template<class sobj,class vobj> strong_inline
|
||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||
ret.checkerboard = x.checkerboard;
|
||||
conformable(ret,x);
|
||||
conformable(x,y);
|
||||
axpby(ret,a,b,x,y);
|
||||
return norm2(ret); // FIXME implement parallel norm in ss loop
|
||||
return axpby_norm_fast(ret,a,b,x,y);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -33,7 +33,7 @@ namespace Grid {
|
||||
// Deterministic Reduction operations
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
||||
ComplexD nrm = innerProduct(arg,arg);
|
||||
auto nrm = innerProduct(arg,arg);
|
||||
return std::real(nrm);
|
||||
}
|
||||
|
||||
@ -43,32 +43,85 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
||||
{
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_typeD vector_type;
|
||||
scalar_type nrm;
|
||||
|
||||
GridBase *grid = left._grid;
|
||||
const int pad = 8;
|
||||
|
||||
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
|
||||
ComplexD inner;
|
||||
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
|
||||
|
||||
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||
int nwork, mywork, myoff;
|
||||
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
|
||||
|
||||
decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
|
||||
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
|
||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
|
||||
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
|
||||
}
|
||||
sumarray[thr]=TensorRemove(vnrm) ;
|
||||
// All threads sum across SIMD; reduce serial work at end
|
||||
// one write per cacheline with streaming store
|
||||
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
|
||||
vstream(sumarray[thr*pad],tmp);
|
||||
}
|
||||
|
||||
vector_type vvnrm; vvnrm=zero; // sum across threads
|
||||
inner=0.0;
|
||||
for(int i=0;i<grid->SumArraySize();i++){
|
||||
vvnrm = vvnrm+sumarray[i];
|
||||
inner = inner+sumarray[i*pad];
|
||||
}
|
||||
nrm = Reduce(vvnrm);// sum across simd
|
||||
right._grid->GlobalSum(nrm);
|
||||
right._grid->GlobalSum(inner);
|
||||
return inner;
|
||||
}
|
||||
|
||||
/////////////////////////
|
||||
// Fast axpby_norm
|
||||
// z = a x + b y
|
||||
// return norm z
|
||||
/////////////////////////
|
||||
template<class sobj,class vobj> strong_inline RealD
|
||||
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
sobj one(1.0);
|
||||
return axpby_norm_fast(z,a,one,x,y);
|
||||
}
|
||||
|
||||
template<class sobj,class vobj> strong_inline RealD
|
||||
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
const int pad = 8;
|
||||
z.checkerboard = x.checkerboard;
|
||||
conformable(z,x);
|
||||
conformable(x,y);
|
||||
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_typeD vector_type;
|
||||
RealD nrm;
|
||||
|
||||
GridBase *grid = x._grid;
|
||||
|
||||
Vector<RealD> sumarray(grid->SumArraySize()*pad);
|
||||
|
||||
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||
int nwork, mywork, myoff;
|
||||
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
|
||||
|
||||
// private to thread; sub summation
|
||||
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
|
||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
|
||||
vnrm = vnrm + innerProductD(tmp,tmp);
|
||||
vstream(z._odata[ss],tmp);
|
||||
}
|
||||
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
|
||||
}
|
||||
|
||||
nrm = 0.0; // sum across threads; linear in thread count but fast
|
||||
for(int i=0;i<grid->SumArraySize();i++){
|
||||
nrm = nrm+sumarray[i*pad];
|
||||
}
|
||||
z._grid->GlobalSum(nrm);
|
||||
return nrm;
|
||||
}
|
||||
|
||||
|
||||
template<class Op,class T1>
|
||||
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
||||
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
|
||||
|
@ -49,7 +49,8 @@ inline double usecond(void) {
|
||||
|
||||
typedef std::chrono::system_clock GridClock;
|
||||
typedef std::chrono::time_point<GridClock> GridTimePoint;
|
||||
typedef std::chrono::milliseconds GridTime;
|
||||
typedef std::chrono::milliseconds GridMillisecs;
|
||||
typedef std::chrono::microseconds GridTime;
|
||||
typedef std::chrono::microseconds GridUsecs;
|
||||
|
||||
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
|
||||
@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli
|
||||
stream << time.count()<<" ms";
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
|
||||
{
|
||||
stream << time.count()<<" usec";
|
||||
return stream;
|
||||
}
|
||||
|
||||
class GridStopWatch {
|
||||
private:
|
||||
|
@ -63,9 +63,12 @@ namespace Grid {
|
||||
virtual RealD M (const FermionField &in, FermionField &out)=0;
|
||||
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
|
||||
|
||||
// half checkerboard operaions
|
||||
// Query the even even properties to make algorithmic decisions
|
||||
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
|
||||
virtual int isTrivialEE(void) { return 0; };
|
||||
virtual RealD Mass(void) {return 0.0;};
|
||||
|
||||
// half checkerboard operaions
|
||||
virtual void Meooe (const FermionField &in, FermionField &out)=0;
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
|
||||
virtual void Mooee (const FermionField &in, FermionField &out)=0;
|
||||
|
@ -765,6 +765,11 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
reg = memory;
|
||||
}
|
||||
|
||||
inline void InsertGaugeField(DoubledGaugeField &U_ds,
|
||||
const GaugeLinkField &U,int mu)
|
||||
{
|
||||
PokeIndex<LorentzIndex>(U_ds, U, mu);
|
||||
}
|
||||
inline void DoubleStore(GridBase *GaugeGrid,
|
||||
DoubledGaugeField &UUUds, // for Naik term
|
||||
DoubledGaugeField &Uds,
|
||||
@ -803,8 +808,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
U = U *phases;
|
||||
Udag = Udag *phases;
|
||||
|
||||
PokeIndex<LorentzIndex>(Uds, U, mu);
|
||||
PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
||||
InsertGaugeField(Uds,U,mu);
|
||||
InsertGaugeField(Uds,Udag,mu+4);
|
||||
// PokeIndex<LorentzIndex>(Uds, U, mu);
|
||||
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
||||
|
||||
// 3 hop based on thin links. Crazy huh ?
|
||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||
@ -816,8 +823,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
UUU = UUU *phases;
|
||||
UUUdag = UUUdag *phases;
|
||||
|
||||
PokeIndex<LorentzIndex>(UUUds, UUU, mu);
|
||||
PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4);
|
||||
InsertGaugeField(UUUds,UUU,mu);
|
||||
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||
|
||||
}
|
||||
}
|
||||
@ -910,6 +917,23 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
mac(&phi(), &UU(), &chi());
|
||||
}
|
||||
|
||||
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
||||
{
|
||||
GridBase *GaugeGrid = U_ds._grid;
|
||||
parallel_for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
||||
|
||||
SiteScalarGaugeLink ScalarU;
|
||||
SiteDoubledGaugeField ScalarUds;
|
||||
|
||||
std::vector<int> lcoor;
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
peekLocalSite(ScalarUds, U_ds, lcoor);
|
||||
|
||||
peekLocalSite(ScalarU, U, lcoor);
|
||||
ScalarUds(mu) = ScalarU();
|
||||
|
||||
}
|
||||
}
|
||||
inline void DoubleStore(GridBase *GaugeGrid,
|
||||
DoubledGaugeField &UUUds, // for Naik term
|
||||
DoubledGaugeField &Uds,
|
||||
@ -951,23 +975,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
U = U *phases;
|
||||
Udag = Udag *phases;
|
||||
|
||||
|
||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
||||
SiteScalarGaugeLink ScalarU;
|
||||
SiteDoubledGaugeField ScalarUds;
|
||||
|
||||
std::vector<int> lcoor;
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
peekLocalSite(ScalarUds, Uds, lcoor);
|
||||
|
||||
peekLocalSite(ScalarU, U, lcoor);
|
||||
ScalarUds(mu) = ScalarU();
|
||||
|
||||
peekLocalSite(ScalarU, Udag, lcoor);
|
||||
ScalarUds(mu + 4) = ScalarU();
|
||||
|
||||
pokeLocalSite(ScalarUds, Uds, lcoor);
|
||||
}
|
||||
InsertGaugeField(Uds,U,mu);
|
||||
InsertGaugeField(Uds,Udag,mu+4);
|
||||
|
||||
// 3 hop based on thin links. Crazy huh ?
|
||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||
@ -979,24 +988,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
||||
UUU = UUU *phases;
|
||||
UUUdag = UUUdag *phases;
|
||||
|
||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
||||
|
||||
SiteScalarGaugeLink ScalarU;
|
||||
SiteDoubledGaugeField ScalarUds;
|
||||
|
||||
std::vector<int> lcoor;
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
|
||||
peekLocalSite(ScalarUds, UUUds, lcoor);
|
||||
|
||||
peekLocalSite(ScalarU, UUU, lcoor);
|
||||
ScalarUds(mu) = ScalarU();
|
||||
|
||||
peekLocalSite(ScalarU, UUUdag, lcoor);
|
||||
ScalarUds(mu + 4) = ScalarU();
|
||||
|
||||
pokeLocalSite(ScalarUds, UUUds, lcoor);
|
||||
}
|
||||
InsertGaugeField(UUUds,UUU,mu);
|
||||
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
|
||||
template <class Impl>
|
||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
||||
RealD _mass,
|
||||
RealD _c1, RealD _c2,RealD _u0,
|
||||
const ImplParams &p)
|
||||
: Kernels(p),
|
||||
_grid(&Fgrid),
|
||||
@ -62,6 +63,16 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
|
||||
UUUmuOdd(&Hgrid) ,
|
||||
_tmp(&Hgrid)
|
||||
{
|
||||
int vol4;
|
||||
int LLs=1;
|
||||
c1=_c1;
|
||||
c2=_c2;
|
||||
u0=_u0;
|
||||
vol4= _grid->oSites();
|
||||
Stencil.BuildSurfaceList(LLs,vol4);
|
||||
vol4= _cbgrid->oSites();
|
||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -69,22 +80,10 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1, RealD _c2,RealD _u0,
|
||||
const ImplParams &p)
|
||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
|
||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
|
||||
{
|
||||
c1=_c1;
|
||||
c2=_c2;
|
||||
u0=_u0;
|
||||
ImportGauge(_Uthin,_Ufat);
|
||||
}
|
||||
template <class Impl>
|
||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
const ImplParams &p)
|
||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
|
||||
{
|
||||
ImportGaugeSimple(_Utriple,_Ufat);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Momentum space propagator should be
|
||||
@ -98,11 +97,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,Gaug
|
||||
// of above link to implmement fourier based solver.
|
||||
////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)
|
||||
{
|
||||
ImportGauge(_Uthin,_Uthin);
|
||||
};
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||
{
|
||||
/////////////////////////////////////////////////////////////////
|
||||
@ -125,6 +119,20 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl
|
||||
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
|
||||
|
||||
}
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||
{
|
||||
|
||||
Umu = _U;
|
||||
UUUmu = _UUU;
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
|
||||
{
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||
@ -160,10 +168,7 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
|
||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||
}
|
||||
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||
pickCheckerboard(Even, UUUmuEven, UUUmu);
|
||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
@ -322,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
|
||||
DhopCalls+=2;
|
||||
conformable(in._grid, _grid); // verifies full grid
|
||||
conformable(in._grid, out._grid);
|
||||
|
||||
@ -332,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
|
||||
DhopCalls+=1;
|
||||
conformable(in._grid, _cbgrid); // verifies half grid
|
||||
conformable(in._grid, out._grid); // drops the cb check
|
||||
|
||||
@ -343,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
|
||||
DhopCalls+=1;
|
||||
conformable(in._grid, _cbgrid); // verifies half grid
|
||||
conformable(in._grid, out._grid); // drops the cb check
|
||||
|
||||
@ -374,25 +382,193 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
Compressor compressor;
|
||||
int len = U._grid->oSites();
|
||||
const int LLs = 1;
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
|
||||
DhopFaceTime -= usecond();
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
st.CommsMergeSHM(compressor);
|
||||
DhopFaceTime += usecond();
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
DhopComputeTime -= usecond();
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
|
||||
if (tid >= ncomms) {
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = len;
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
|
||||
// do the compute
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
// Interior = 1; Exterior = 0; must implement for staggered
|
||||
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
// Interior = 1; Exterior = 0;
|
||||
int sU = ss;
|
||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
st.CommunicateThreaded();
|
||||
}
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
|
||||
// First to enter, last to leave timing
|
||||
DhopFaceTime -= usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime -= usecond();
|
||||
|
||||
DhopComputeTime2 -= usecond();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
|
||||
}
|
||||
} else {
|
||||
int sz=st.surface_list.size();
|
||||
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
|
||||
}
|
||||
}
|
||||
DhopComputeTime2 += usecond();
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
|
||||
DhopCommTime -= usecond();
|
||||
Compressor compressor;
|
||||
st.HaloExchange(in, compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
DhopComputeTime -= usecond();
|
||||
if (dag == DaggerYes) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
||||
}
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Reporting
|
||||
////////////////////////////////////////////////////////////////
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::Report(void)
|
||||
{
|
||||
std::vector<int> latt = GridDefaultLatt();
|
||||
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = _grid->_Nprocessors;
|
||||
RealD NN = _grid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
|
||||
<< DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
|
||||
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
|
||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
|
||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
|
||||
// Average the compute time
|
||||
_grid->GlobalSum(DhopComputeTime);
|
||||
DhopComputeTime/=NP;
|
||||
|
||||
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||
|
||||
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
|
||||
{
|
||||
DhopCalls = 0;
|
||||
DhopTotalTime = 0;
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
DhopFaceTime = 0;
|
||||
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Conserved current - not yet implemented.
|
||||
////////////////////////////////////////////////////////
|
||||
|
@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
||||
FermionField _tmp;
|
||||
FermionField &tmp(void) { return _tmp; }
|
||||
|
||||
////////////////////////////////////////
|
||||
// Performance monitoring
|
||||
////////////////////////////////////////
|
||||
void Report(void);
|
||||
void ZeroCounters(void);
|
||||
double DhopTotalTime;
|
||||
double DhopCalls;
|
||||
double DhopCommTime;
|
||||
double DhopComputeTime;
|
||||
double DhopComputeTime2;
|
||||
double DhopFaceTime;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
///////////////////////////////////////////////////////////////
|
||||
@ -105,25 +117,34 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
// Constructor
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Grid own interface Constructor
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
|
||||
const ImplParams &p = ImplParams());
|
||||
|
||||
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1, RealD _c2,RealD _u0,
|
||||
const ImplParams &p = ImplParams());
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// MILC constructor no gauge fields
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
|
||||
const ImplParams &p = ImplParams());
|
||||
|
||||
|
||||
// DoubleStore impl dependent
|
||||
void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
|
||||
void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
|
||||
void ImportGauge(const GaugeField &_Uthin);
|
||||
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
|
||||
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
|
||||
void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U);
|
||||
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
|
||||
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||
DoubledGaugeField &GetUUU(void) { return UUUmu; };
|
||||
void CopyGaugeCheckerboards(void);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Data members require to support the functionality
|
||||
@ -132,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
||||
// protected:
|
||||
public:
|
||||
// any other parameters of action ???
|
||||
|
||||
virtual int isTrivialEE(void) { return 1; };
|
||||
virtual RealD Mass(void) { return mass; }
|
||||
RealD mass;
|
||||
RealD u0;
|
||||
RealD c1;
|
||||
|
@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3,
|
||||
|
||||
// 5d lattice for DWF.
|
||||
template<class Impl>
|
||||
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
|
||||
GridCartesian &FiveDimGrid,
|
||||
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
@ -121,16 +120,74 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
||||
assert(FiveDimGrid._simd_layout[0] ==1);
|
||||
|
||||
}
|
||||
int LLs = FiveDimGrid._rdimensions[0];
|
||||
int vol4= FourDimGrid.oSites();
|
||||
Stencil.BuildSurfaceList(LLs,vol4);
|
||||
|
||||
// Allocate the required comms buffer
|
||||
vol4=FourDimRedBlackGrid.oSites();
|
||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
|
||||
{
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
||||
}
|
||||
template<class Impl>
|
||||
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,
|
||||
RealD _c1,RealD _c2, RealD _u0,
|
||||
const ImplParams &p) :
|
||||
ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
|
||||
FourDimGrid,FourDimRedBlackGrid,
|
||||
_mass,_c1,_c2,_u0,p)
|
||||
{
|
||||
ImportGauge(_Uthin,_Ufat);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// For MILC use; pass three link U's and 1 link U
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)
|
||||
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||
{
|
||||
ImportGauge(_Uthin,_Uthin);
|
||||
};
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Trivial import; phases and fattening and such like preapplied
|
||||
/////////////////////////////////////////////////////////////////
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
|
||||
Impl::InsertGaugeField(UUUmu,U,mu);
|
||||
|
||||
U = adj( Cshift(U, mu, -3));
|
||||
Impl::InsertGaugeField(UUUmu,-U,mu+4);
|
||||
|
||||
U = PeekIndex<LorentzIndex>(_Ufat, mu);
|
||||
Impl::InsertGaugeField(Umu,U,mu);
|
||||
|
||||
U = adj( Cshift(U, mu, -1));
|
||||
Impl::InsertGaugeField(Umu,-U,mu+4);
|
||||
|
||||
}
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||
{
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Trivial import; phases and fattening and such like preapplied
|
||||
/////////////////////////////////////////////////////////////////
|
||||
Umu = _U;
|
||||
UUUmu = _UUU;
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
|
||||
{
|
||||
@ -159,10 +216,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
|
||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||
}
|
||||
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||
pickCheckerboard(Even, UUUmuEven, UUUmu);
|
||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
|
||||
@ -223,6 +277,162 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||
assert(0);
|
||||
}
|
||||
|
||||
/*CHANGE */
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
Compressor compressor;
|
||||
|
||||
int LLs = in._grid->_rdimensions[0];
|
||||
int len = U._grid->oSites();
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
double ctime=0;
|
||||
double ptime=0;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
double start = usecond();
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = U._grid->oSites(); // 4d vol
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
|
||||
// do the compute
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
// Interior = 1; Exterior = 0; must implement for staggered
|
||||
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
// Interior = 1; Exterior = 0;
|
||||
int sU = ss;
|
||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
|
||||
}
|
||||
}
|
||||
ptime = usecond() - start;
|
||||
} else {
|
||||
double start = usecond();
|
||||
st.CommunicateThreaded();
|
||||
ctime = usecond() - start;
|
||||
}
|
||||
}
|
||||
DhopCommTime += ctime;
|
||||
DhopComputeTime+=ptime;
|
||||
|
||||
// First to enter, last to leave timing
|
||||
st.CollateThreads();
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
DhopComputeTime2-=usecond();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
|
||||
}
|
||||
} else {
|
||||
int sz=st.surface_list.size();
|
||||
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
|
||||
}
|
||||
}
|
||||
DhopComputeTime2+=usecond();
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
Compressor compressor;
|
||||
int LLs = in._grid->_rdimensions[0];
|
||||
|
||||
|
||||
|
||||
//double t1=usecond();
|
||||
DhopTotalTime -= usecond();
|
||||
DhopCommTime -= usecond();
|
||||
st.HaloExchange(in,compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
DhopComputeTime -= usecond();
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
if (dag == DaggerYes) {
|
||||
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||
int sU=ss;
|
||||
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
|
||||
}
|
||||
} else {
|
||||
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||
int sU=ss;
|
||||
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||
}
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
//double t2=usecond();
|
||||
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
|
||||
|
||||
}
|
||||
/*CHANGE END*/
|
||||
|
||||
/* ORG
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
@ -254,6 +464,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
template<class Impl>
|
||||
@ -336,6 +547,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
||||
DhopTotalTime = 0;
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
DhopFaceTime = 0;
|
||||
|
||||
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
|
@ -64,6 +64,8 @@ namespace QCD {
|
||||
double DhopCalls;
|
||||
double DhopCommTime;
|
||||
double DhopComputeTime;
|
||||
double DhopComputeTime2;
|
||||
double DhopFaceTime;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
@ -119,7 +121,27 @@ namespace QCD {
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalSerialComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
|
||||
// Constructors
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Grid internal interface -- Thin link and fat link, with coefficients
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
||||
GaugeField &_Ufat,
|
||||
GridCartesian &FiveDimGrid,
|
||||
@ -127,18 +149,38 @@ namespace QCD {
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
double _mass,
|
||||
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
|
||||
RealD _c1, RealD _c2,RealD _u0,
|
||||
const ImplParams &p= ImplParams());
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
double _mass,
|
||||
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
|
||||
const ImplParams &p= ImplParams());
|
||||
|
||||
// DoubleStore
|
||||
void ImportGauge(const GaugeField &_U);
|
||||
void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
|
||||
// DoubleStore gauge field in operator
|
||||
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
|
||||
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
|
||||
void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
|
||||
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
|
||||
// Give a reference; can be used to do an assignment or copy back out after import
|
||||
// if Carleton wants to cache them and not use the ImportSimple
|
||||
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||
DoubledGaugeField &GetUUU(void) { return UUUmu; };
|
||||
void CopyGaugeCheckerboards(void);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Data members require to support the functionality
|
||||
///////////////////////////////////////////////////////////////
|
||||
public:
|
||||
|
||||
virtual int isTrivialEE(void) { return 1; };
|
||||
virtual RealD Mass(void) { return mass; }
|
||||
|
||||
GridBase *_FourDimGrid;
|
||||
GridBase *_FourDimRedBlackGrid;
|
||||
GridBase *_FiveDimGrid;
|
||||
|
@ -32,223 +32,241 @@ namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
||||
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
|
||||
|
||||
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
|
||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||
if (SE->_is_local ) { \
|
||||
if (SE->_permute) { \
|
||||
chi_p = χ \
|
||||
permute(chi, in._odata[SE->_offset], ptype); \
|
||||
} else { \
|
||||
chi_p = &in._odata[SE->_offset]; \
|
||||
} \
|
||||
} else { \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
} \
|
||||
multLink(Uchi, U._odata[sU], *chi_p, Dir);
|
||||
|
||||
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
|
||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||
if (SE->_is_local ) { \
|
||||
if (SE->_permute) { \
|
||||
chi_p = χ \
|
||||
permute(chi, in._odata[SE->_offset], ptype); \
|
||||
} else { \
|
||||
chi_p = &in._odata[SE->_offset]; \
|
||||
} \
|
||||
} else if ( st.same_node[Dir] ) { \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
} \
|
||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
|
||||
}
|
||||
|
||||
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
|
||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||
nmu++; \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
||||
|
||||
////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Generic implementation; move to different file?
|
||||
////////////////////////////////////////////
|
||||
|
||||
// Int, Ext, Int+Ext cases for comms overlap
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
|
||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out, int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
SiteSpinor chi;
|
||||
SiteSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int skew = 0;
|
||||
if (threeLink) skew=8;
|
||||
///////////////////////////
|
||||
// Xp
|
||||
///////////////////////////
|
||||
int skew;
|
||||
|
||||
SE = st.GetEntry(ptype, Xp+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
skew = 0;
|
||||
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
if ( dag ) {
|
||||
Uchi = - Uchi;
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
vstream(out._odata[sF], Uchi);
|
||||
}
|
||||
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
|
||||
|
||||
///////////////////////////
|
||||
// Yp
|
||||
///////////////////////////
|
||||
SE = st.GetEntry(ptype, Yp+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
}
|
||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
|
||||
|
||||
///////////////////////////
|
||||
// Zp
|
||||
///////////////////////////
|
||||
SE = st.GetEntry(ptype, Zp+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
}
|
||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
|
||||
|
||||
///////////////////////////
|
||||
// Tp
|
||||
///////////////////////////
|
||||
SE = st.GetEntry(ptype, Tp+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
}
|
||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
|
||||
|
||||
///////////////////////////
|
||||
// Xm
|
||||
///////////////////////////
|
||||
SE = st.GetEntry(ptype, Xm+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
}
|
||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
|
||||
|
||||
///////////////////////////
|
||||
// Ym
|
||||
///////////////////////////
|
||||
SE = st.GetEntry(ptype, Ym+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
}
|
||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
|
||||
|
||||
///////////////////////////
|
||||
// Zm
|
||||
///////////////////////////
|
||||
SE = st.GetEntry(ptype, Zm+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
}
|
||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
|
||||
|
||||
///////////////////////////
|
||||
// Tm
|
||||
///////////////////////////
|
||||
SE = st.GetEntry(ptype, Tm+skew, sF);
|
||||
if (SE->_is_local) {
|
||||
if (SE->_permute) {
|
||||
chi_p = χ
|
||||
permute(chi, in._odata[SE->_offset], ptype);
|
||||
} else {
|
||||
chi_p = &in._odata[SE->_offset];
|
||||
}
|
||||
} else {
|
||||
chi_p = &buf[SE->_offset];
|
||||
}
|
||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
|
||||
|
||||
vstream(out, Uchi);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Only contributions from interior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
SiteSpinor chi;
|
||||
SiteSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int skew ;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
skew = 0;
|
||||
Uchi=zero;
|
||||
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
if ( dag ) {
|
||||
Uchi = - Uchi;
|
||||
}
|
||||
vstream(out._odata[sF], Uchi);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Only contributions from exterior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
SiteSpinor chi;
|
||||
SiteSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
int skew ;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
skew = 0;
|
||||
Uchi=zero;
|
||||
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
|
||||
if ( nmu ) {
|
||||
if ( dag ) {
|
||||
out._odata[sF] = out._odata[sF] - Uchi;
|
||||
} else {
|
||||
out._odata[sF] = out._odata[sF] + Uchi;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Driving / wrapping routine to select right kernel
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out) {
|
||||
SiteSpinor naik;
|
||||
SiteSpinor naive;
|
||||
int oneLink =0;
|
||||
int threeLink=1;
|
||||
const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
int dag=1;
|
||||
switch(Opt) {
|
||||
#ifdef AVX512
|
||||
//FIXME; move the sign into the Asm routine
|
||||
case OptInlineAsm:
|
||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
||||
for(int s=0;s<LLs;s++) {
|
||||
int sF=s+LLs*sU;
|
||||
out._odata[sF]=-out._odata[sF];
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
case OptHandUnroll:
|
||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
break;
|
||||
case OptGeneric:
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
||||
out._odata[sF] =-naive-naik;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
int dag=0;
|
||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionField &in, FermionField &out)
|
||||
int sU, const FermionField &in, FermionField &out,
|
||||
int dag,int interior,int exterior)
|
||||
{
|
||||
int oneLink =0;
|
||||
int threeLink=1;
|
||||
SiteSpinor naik;
|
||||
SiteSpinor naive;
|
||||
int dag=0;
|
||||
switch(Opt) {
|
||||
#ifdef AVX512
|
||||
case OptInlineAsm:
|
||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else {
|
||||
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
|
||||
assert(0);
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
case OptHandUnroll:
|
||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( interior ) {
|
||||
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( exterior ) {
|
||||
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
}
|
||||
break;
|
||||
case OptGeneric:
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
// assert(sF<in._odata.size());
|
||||
// assert(sU< U._odata.size());
|
||||
// assert(sF>=0); assert(sU>=0);
|
||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
||||
out._odata[sF] =naive+naik;
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( interior ) {
|
||||
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( exterior ) {
|
||||
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
@ -38,8 +38,9 @@ namespace QCD {
|
||||
class StaggeredKernelsStatic {
|
||||
public:
|
||||
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
static int Opt; // these are a temporary hack
|
||||
enum { CommsAndCompute, CommsThenCompute };
|
||||
static int Opt;
|
||||
static int Comms;
|
||||
};
|
||||
|
||||
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
|
||||
@ -53,24 +54,62 @@ public:
|
||||
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
|
||||
|
||||
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Generic Nc kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag);
|
||||
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag);
|
||||
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Nc=3 specific kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag);
|
||||
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag);
|
||||
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Asm Nc=3 specific kernels
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag);
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Generic interface; fan out to right routine
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||
|
||||
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
|
||||
int LLs, int sU, const FermionField &in, FermionField &out, int dag);
|
||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||
|
||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
|
||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor * buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||
|
||||
public:
|
||||
|
||||
|
@ -560,6 +560,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
#define nREDUCE(out) \
|
||||
asm ( \
|
||||
VADD(UChi_00,UChi_10,UChi_00) \
|
||||
VADD(UChi_01,UChi_11,UChi_01) \
|
||||
VADD(UChi_02,UChi_12,UChi_02) \
|
||||
VADD(UChi_30,UChi_20,UChi_30) \
|
||||
VADD(UChi_31,UChi_21,UChi_31) \
|
||||
VADD(UChi_32,UChi_22,UChi_32) \
|
||||
VADD(UChi_00,UChi_30,UChi_00) \
|
||||
VADD(UChi_01,UChi_31,UChi_01) \
|
||||
VADD(UChi_02,UChi_32,UChi_02) ); \
|
||||
asm (VZERO(Chi_00) \
|
||||
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||
asm ( \
|
||||
VSTORE(0,%0,pUChi_00) \
|
||||
VSTORE(1,%0,pUChi_01) \
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
#define REDUCEa(out) \
|
||||
asm ( \
|
||||
VADD(UChi_00,UChi_10,UChi_00) \
|
||||
@ -571,6 +592,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
// FIXME is sign right in the VSUB ?
|
||||
#define nREDUCEa(out) \
|
||||
asm ( \
|
||||
VADD(UChi_00,UChi_10,UChi_00) \
|
||||
VADD(UChi_01,UChi_11,UChi_01) \
|
||||
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||
asm (VZERO(Chi_00) \
|
||||
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||
asm ( \
|
||||
VSTORE(0,%0,pUChi_00) \
|
||||
VSTORE(1,%0,pUChi_01) \
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_0,Chi_0);\
|
||||
permute##dir(Chi_1,Chi_1);\
|
||||
@ -581,10 +618,9 @@ namespace QCD {
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionField &in, FermionField &out)
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
assert(0);
|
||||
};
|
||||
@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
// This is the single precision 5th direction vectorised kernel
|
||||
#include <simd/Intel512single.h>
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionField &in, FermionField &out)
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
@ -685,8 +720,12 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out._odata[sF];
|
||||
if ( dag ) {
|
||||
nREDUCE(addr0);
|
||||
} else {
|
||||
REDUCE(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionField &in, FermionField &out)
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
@ -734,8 +772,12 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out._odata[sF];
|
||||
if ( dag ) {
|
||||
nREDUCE(addr0);
|
||||
} else {
|
||||
REDUCE(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
||||
|
||||
#include <simd/Intel512single.h>
|
||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionField &in, FermionField &out)
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
@ -832,8 +873,12 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out._odata[sF];
|
||||
if ( dag ) {
|
||||
nREDUCEa(addr0);
|
||||
} else {
|
||||
REDUCEa(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionField &in, FermionField &out)
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
@ -897,8 +941,12 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out._odata[sF];
|
||||
if ( dag ) {
|
||||
nREDUCEa(addr0);
|
||||
} else {
|
||||
REDUCEa(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
||||
DoubledGaugeField &U, \
|
||||
DoubledGaugeField &UUU, \
|
||||
SiteSpinor *buf, int LLs, \
|
||||
int sU, const FermionField &in, FermionField &out);
|
||||
int sU, const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
||||
|
@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
|
||||
#define REGISTER
|
||||
|
||||
#define LOAD_CHI(b) \
|
||||
const SiteSpinor & ref (b[offset]); \
|
||||
@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
UChi ## _1 += U_12*Chi_2;\
|
||||
UChi ## _2 += U_22*Chi_2;
|
||||
|
||||
#define MULT_ADD(A,UChi) \
|
||||
#define MULT_ADD(U,A,UChi) \
|
||||
auto & ref(U._odata[sU](A)); \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
@ -82,241 +81,319 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_0,Chi_0);\
|
||||
permute##dir(Chi_1,Chi_1);\
|
||||
permute##dir(Chi_0,Chi_0); \
|
||||
permute##dir(Chi_1,Chi_1); \
|
||||
permute##dir(Chi_2,Chi_2);
|
||||
|
||||
|
||||
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHI(in._odata); \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(Perm); \
|
||||
} \
|
||||
} else { \
|
||||
LOAD_CHI(buf); \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
||||
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||
{ \
|
||||
MULT(Dir,even); \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
|
||||
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||
{ \
|
||||
MULT_ADD(U,Dir,even); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHI(in._odata); \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(Perm); \
|
||||
} \
|
||||
} else if ( st.same_node[Dir] ) { \
|
||||
LOAD_CHI(buf); \
|
||||
} \
|
||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||
MULT_ADD(U,Dir,even); \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||
nmu++; \
|
||||
{ LOAD_CHI(buf); } \
|
||||
{ MULT_ADD(U,Dir,even); } \
|
||||
}
|
||||
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionField &in, FermionField &out, int dag)
|
||||
{
|
||||
SiteSpinor naik;
|
||||
SiteSpinor naive;
|
||||
int oneLink =0;
|
||||
int threeLink=1;
|
||||
int skew(0);
|
||||
Real scale(1.0);
|
||||
|
||||
if(dag) scale = -1.0;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
||||
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
||||
out._odata[sF] =scale*(naive+naik);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
SiteSpinor *buf, int sF,
|
||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink)
|
||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
REGISTER Simd even_0; // 12 regs on knc
|
||||
REGISTER Simd even_1;
|
||||
REGISTER Simd even_2;
|
||||
REGISTER Simd odd_0; // 12 regs on knc
|
||||
REGISTER Simd odd_1;
|
||||
REGISTER Simd odd_2;
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
|
||||
REGISTER Simd Chi_0; // two spinor; 6 regs
|
||||
REGISTER Simd Chi_1;
|
||||
REGISTER Simd Chi_2;
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
REGISTER Simd U_00; // two rows of U matrix
|
||||
REGISTER Simd U_10;
|
||||
REGISTER Simd U_20;
|
||||
REGISTER Simd U_01;
|
||||
REGISTER Simd U_11;
|
||||
REGISTER Simd U_21; // 2 reg left.
|
||||
REGISTER Simd U_02;
|
||||
REGISTER Simd U_12;
|
||||
REGISTER Simd U_22;
|
||||
|
||||
int skew = 0;
|
||||
if (threeLink) skew=8;
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
// Xp
|
||||
SE=st.GetEntry(ptype,Xp+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG (U,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG (U,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
result()()(1) = - even_1 - odd_1;
|
||||
result()()(2) = - even_2 - odd_2;
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
result()()(0) = even_0 + odd_0;
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
{
|
||||
MULT(Xp,even);
|
||||
vstream(out._odata[sF],result);
|
||||
}
|
||||
|
||||
// Yp
|
||||
SE=st.GetEntry(ptype,Yp+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
}
|
||||
{
|
||||
MULT(Yp,odd);
|
||||
}
|
||||
|
||||
|
||||
// Zp
|
||||
SE=st.GetEntry(ptype,Zp+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
}
|
||||
{
|
||||
MULT_ADD(Zp,even);
|
||||
}
|
||||
|
||||
// Tp
|
||||
SE=st.GetEntry(ptype,Tp+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
}
|
||||
{
|
||||
MULT_ADD(Tp,odd);
|
||||
}
|
||||
|
||||
// Xm
|
||||
SE=st.GetEntry(ptype,Xm+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
}
|
||||
{
|
||||
MULT_ADD(Xm,even);
|
||||
}
|
||||
|
||||
|
||||
// Ym
|
||||
SE=st.GetEntry(ptype,Ym+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
}
|
||||
{
|
||||
MULT_ADD(Ym,odd);
|
||||
}
|
||||
|
||||
// Zm
|
||||
SE=st.GetEntry(ptype,Zm+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
}
|
||||
{
|
||||
MULT_ADD(Zm,even);
|
||||
}
|
||||
|
||||
// Tm
|
||||
SE=st.GetEntry(ptype,Tm+skew,sF);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
LOAD_CHI(in._odata);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
LOAD_CHI(buf);
|
||||
}
|
||||
{
|
||||
MULT_ADD(Tm,odd);
|
||||
}
|
||||
|
||||
vstream(out()()(0),even_0+odd_0);
|
||||
vstream(out()()(1),even_1+odd_1);
|
||||
vstream(out()()(2),even_2+odd_2);
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
|
||||
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||
|
||||
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
result()()(1) = - even_1 - odd_1;
|
||||
result()()(2) = - even_2 - odd_2;
|
||||
} else {
|
||||
result()()(0) = even_0 + odd_0;
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
vstream(out._odata[sF],result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
|
||||
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||
int nmu=0;
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||
|
||||
// Add sum of all exterior connected stencil legs
|
||||
if ( nmu ) {
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
result()()(1) = - even_1 - odd_1;
|
||||
result()()(2) = - even_2 - odd_2;
|
||||
} else {
|
||||
result()()(0) = even_0 + odd_0;
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
out._odata[sF] = out._odata[sF] + result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||
SiteSpinor *buf, int LLs, \
|
||||
int sU, const FermionField &in, FermionField &out, int dag);
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionField &in, FermionField &out, int dag); \
|
||||
\
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionField &in, FermionField &out, int dag); \
|
||||
\
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionField &in, FermionField &out, int dag); \
|
||||
|
||||
#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
|
||||
template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
|
||||
SiteSpinor *buf, int sF, \
|
||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
||||
|
||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
|
||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
|
||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
||||
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -274,41 +274,16 @@ public:
|
||||
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
||||
}
|
||||
|
||||
std::vector<int> same_node;
|
||||
std::vector<int> surface_list;
|
||||
|
||||
WilsonStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
const std::vector<int> &directions,
|
||||
const std::vector<int> &distances)
|
||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
||||
same_node(npoints)
|
||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
|
||||
{
|
||||
ZeroCountersi();
|
||||
surface_list.resize(0);
|
||||
};
|
||||
|
||||
void BuildSurfaceList(int Ls,int vol4){
|
||||
|
||||
// find same node for SHM
|
||||
// Here we know the distance is 1 for WilsonStencil
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
same_node[point] = this->SameNode(point);
|
||||
}
|
||||
|
||||
for(int site = 0 ;site< vol4;site++){
|
||||
int local = 1;
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
||||
local = 0;
|
||||
}
|
||||
}
|
||||
if(local == 0) {
|
||||
surface_list.push_back(site);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template < class compressor>
|
||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||
@ -369,23 +344,23 @@ public:
|
||||
int dag = compress.dag;
|
||||
int face_idx=0;
|
||||
if ( dag ) {
|
||||
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||
assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
||||
assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
||||
assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
||||
assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
||||
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
||||
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
||||
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
||||
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
||||
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
||||
} else {
|
||||
assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
||||
assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
||||
assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
||||
assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
||||
assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
||||
assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
||||
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
||||
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
||||
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
||||
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
||||
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
||||
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
||||
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
||||
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
||||
}
|
||||
this->face_table_computed=1;
|
||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||
|
@ -348,15 +348,98 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
|
||||
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
/*Change starts*/
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
#ifdef GRID_OMP
|
||||
Compressor compressor;
|
||||
int len = U._grid->oSites();
|
||||
const int LLs = 1;
|
||||
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
st.CommsMergeSHM(compressor);
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = len;
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
// do the compute
|
||||
if (dag == DaggerYes) {
|
||||
|
||||
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||
}
|
||||
} else {
|
||||
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||
}
|
||||
} //else
|
||||
|
||||
} else {
|
||||
st.CommunicateThreaded();
|
||||
}
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
if (dag == DaggerYes) {
|
||||
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||
}
|
||||
} else {
|
||||
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||
}
|
||||
}
|
||||
|
||||
} //pragma
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
Compressor compressor(dag);
|
||||
st.HaloExchange(in, compressor);
|
||||
|
||||
@ -370,6 +453,7 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
}
|
||||
}
|
||||
};
|
||||
/*Change ends */
|
||||
|
||||
/*******************************************************************************
|
||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||
|
@ -130,6 +130,12 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
// Constructor
|
||||
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
@ -145,6 +151,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
||||
|
||||
// protected:
|
||||
public:
|
||||
virtual RealD Mass(void) { return mass; }
|
||||
virtual int isTrivialEE(void) { return 1; };
|
||||
RealD mass;
|
||||
RealD diag_mass;
|
||||
|
||||
|
@ -445,8 +445,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
}
|
||||
}
|
||||
ptime = usecond() - start;
|
||||
}
|
||||
{
|
||||
} else {
|
||||
double start = usecond();
|
||||
st.CommunicateThreaded();
|
||||
ctime = usecond() - start;
|
||||
|
@ -232,6 +232,7 @@ private:
|
||||
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionField &in, FermionField &out);
|
||||
|
||||
|
||||
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||
|
||||
|
@ -150,6 +150,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
std::vector<int> _distances;
|
||||
std::vector<int> _comm_buf_size;
|
||||
std::vector<int> _permute_type;
|
||||
std::vector<int> same_node;
|
||||
std::vector<int> surface_list;
|
||||
|
||||
Vector<StencilEntry> _entries;
|
||||
std::vector<Packet> Packets;
|
||||
@ -201,7 +203,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
|
||||
int dimension = _directions[point];
|
||||
int displacement = _distances[point];
|
||||
assert( (displacement==1) || (displacement==-1));
|
||||
|
||||
|
||||
int pd = _grid->_processors[dimension];
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
@ -216,9 +218,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
if ( ! comm_dim ) return 1;
|
||||
|
||||
int nbr_proc;
|
||||
if (displacement==1) nbr_proc = 1;
|
||||
if (displacement>0) nbr_proc = 1;
|
||||
else nbr_proc = pd-1;
|
||||
|
||||
// FIXME this logic needs to be sorted for three link term
|
||||
// assert( (displacement==1) || (displacement==-1));
|
||||
// Present hack only works for >= 4^4 subvol per node
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
||||
@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
}
|
||||
};
|
||||
|
||||
// Move interior/exterior split into the generic stencil
|
||||
// FIXME Explicit Ls in interface is a pain. Should just use a vol
|
||||
void BuildSurfaceList(int Ls,int vol4){
|
||||
|
||||
// find same node for SHM
|
||||
// Here we know the distance is 1 for WilsonStencil
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
same_node[point] = this->SameNode(point);
|
||||
}
|
||||
|
||||
for(int site = 0 ;site< vol4;site++){
|
||||
int local = 1;
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
||||
local = 0;
|
||||
}
|
||||
}
|
||||
if(local == 0) {
|
||||
surface_list.push_back(site);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CartesianStencil(GridBase *grid,
|
||||
int npoints,
|
||||
int checkerboard,
|
||||
@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
comm_bytes_thr(npoints),
|
||||
comm_enter_thr(npoints),
|
||||
comm_leave_thr(npoints),
|
||||
comm_time_thr(npoints)
|
||||
comm_time_thr(npoints),
|
||||
same_node(npoints)
|
||||
{
|
||||
face_table_computed=0;
|
||||
_npoints = npoints;
|
||||
@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
_directions = directions;
|
||||
_distances = distances;
|
||||
_unified_buffer_size=0;
|
||||
surface_list.resize(0);
|
||||
|
||||
int osites = _grid->oSites();
|
||||
|
||||
|
@ -368,8 +368,10 @@ void Grid_init(int *argc,char ***argv)
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
||||
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;
|
||||
} else {
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute;
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
||||
|
@ -141,6 +141,7 @@ int main (int argc, char ** argv)
|
||||
t1=usecond();
|
||||
|
||||
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
@ -160,7 +161,8 @@ int main (int argc, char ** argv)
|
||||
localConvert(sresult,tmp);
|
||||
|
||||
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
|
||||
@ -181,6 +183,7 @@ int main (int argc, char ** argv)
|
||||
localConvert(sresult,tmp);
|
||||
|
||||
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
||||
|
||||
|
196
tests/core/Test_staggered5DvecF.cc
Normal file
196
tests/core/Test_staggered5DvecF.cc
Normal file
@ -0,0 +1,196 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./benchmarks/Benchmark_wilson.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
std::vector<int> latt_size = GridDefaultLatt();
|
||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||
|
||||
const int Ls=16;
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||
|
||||
std::vector<int> seeds({1,2,3,4});
|
||||
|
||||
GridParallelRNG pRNG4(UGrid);
|
||||
GridParallelRNG pRNG5(FGrid);
|
||||
pRNG4.SeedFixedIntegers(seeds);
|
||||
pRNG5.SeedFixedIntegers(seeds);
|
||||
|
||||
typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField;
|
||||
typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField;
|
||||
typename ImprovedStaggeredFermion5DF::ImplParams params;
|
||||
|
||||
FermionField src (FGrid);
|
||||
random(pRNG5,src);
|
||||
/*
|
||||
std::vector<int> site({0,1,2,0,0});
|
||||
ColourVector cv = zero;
|
||||
cv()()(0)=1.0;
|
||||
src = zero;
|
||||
pokeSite(cv,src,site);
|
||||
*/
|
||||
FermionField result(FGrid); result=zero;
|
||||
FermionField tmp(FGrid); tmp=zero;
|
||||
FermionField err(FGrid); tmp=zero;
|
||||
FermionField phi (FGrid); random(pRNG5,phi);
|
||||
FermionField chi (FGrid); random(pRNG5,chi);
|
||||
|
||||
LatticeGaugeFieldF Umu(UGrid);
|
||||
SU3::HotConfiguration(pRNG4,Umu);
|
||||
|
||||
/*
|
||||
for(int mu=1;mu<4;mu++){
|
||||
auto tmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
tmp = zero;
|
||||
PokeIndex<LorentzIndex>(Umu,tmp,mu);
|
||||
}
|
||||
*/
|
||||
double volume=Ls;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
volume=volume*latt_size[mu];
|
||||
}
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
|
||||
ImprovedStaggeredFermion5DF Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params);
|
||||
ImprovedStaggeredFermionVec5dF sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params);
|
||||
|
||||
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
|
||||
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||
|
||||
int ncall=1000;
|
||||
int ncall1=1000;
|
||||
double t0(0),t1(0);
|
||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||
|
||||
std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall1;i++){
|
||||
Ds.Dhop(src,result,0);
|
||||
}
|
||||
t1=usecond();
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
|
||||
|
||||
#ifdef AVX512
|
||||
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
||||
#else
|
||||
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
|
||||
#endif
|
||||
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall1;i++){
|
||||
Ds.Dhop(src,tmp,0);
|
||||
}
|
||||
t1=usecond();
|
||||
|
||||
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
err = tmp-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
|
||||
|
||||
FermionField ssrc (sFGrid); localConvert(src,ssrc);
|
||||
FermionField sresult(sFGrid); sresult=zero;
|
||||
|
||||
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall1;i++){
|
||||
sDs.Dhop(ssrc,sresult,0);
|
||||
}
|
||||
t1=usecond();
|
||||
localConvert(sresult,tmp);
|
||||
|
||||
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
|
||||
#ifdef AVX512
|
||||
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
||||
#else
|
||||
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
|
||||
#endif
|
||||
|
||||
err = tmp-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
int extra=1;
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall1*extra;i++){
|
||||
sDs.Dhop(ssrc,sresult,0);
|
||||
}
|
||||
t1=usecond();
|
||||
localConvert(sresult,tmp);
|
||||
|
||||
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
||||
|
||||
err = tmp-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
|
||||
|
||||
|
||||
Grid_finalize();
|
||||
}
|
@ -74,8 +74,16 @@ int main (int argc, char ** argv)
|
||||
|
||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
||||
|
||||
double volume=1;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
volume=volume*latt_size[mu];
|
||||
}
|
||||
|
||||
RealD mass=0.003;
|
||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
||||
|
||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||
@ -87,14 +95,26 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
|
||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
||||
FermionField src4d(UGrid); random(pRNG,src4d);
|
||||
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
|
||||
FermionField result4d_o(UrbGrid);
|
||||
|
||||
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
|
||||
result4d_o=zero;
|
||||
{
|
||||
double t1=usecond();
|
||||
CG(HermOp4d,src4d_o,result4d_o);
|
||||
double t2=usecond();
|
||||
double ncall=CG.IterationsToComplete;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
HermOp4d.Report();
|
||||
}
|
||||
Ds4d.Report();
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
|
||||
@ -103,7 +123,17 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
Ds.ZeroCounters();
|
||||
result_o=zero;
|
||||
{
|
||||
double t1=usecond();
|
||||
CG(HermOp,src_o,result_o);
|
||||
double t2=usecond();
|
||||
double ncall=CG.IterationsToComplete*Ls;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
HermOp.Report();
|
||||
}
|
||||
Ds.Report();
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
@ -112,7 +142,18 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
Ds.ZeroCounters();
|
||||
result_o=zero;
|
||||
{
|
||||
double t1=usecond();
|
||||
mCG(HermOp,src_o,result_o);
|
||||
double t2=usecond();
|
||||
double ncall=mCG.IterationsToComplete*Ls;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
HermOp.Report();
|
||||
}
|
||||
|
||||
Ds.Report();
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
@ -121,7 +162,17 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
Ds.ZeroCounters();
|
||||
result_o=zero;
|
||||
{
|
||||
double t1=usecond();
|
||||
BCGrQ(HermOp,src_o,result_o);
|
||||
double t2=usecond();
|
||||
double ncall=BCGrQ.IterationsToComplete*Ls;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
HermOp.Report();
|
||||
}
|
||||
Ds.Report();
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
|
@ -74,7 +74,16 @@ int main (int argc, char ** argv)
|
||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
||||
|
||||
RealD mass=0.003;
|
||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
|
||||
double volume=1;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
volume=volume*latt_size[mu];
|
||||
}
|
||||
|
||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
||||
|
||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||
@ -86,11 +95,23 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
|
||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
||||
FermionField src4d(UGrid); random(pRNG,src4d);
|
||||
FermionField result4d(UGrid); result4d=zero;
|
||||
|
||||
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
|
||||
{
|
||||
double t1=usecond();
|
||||
CG(HermOp4d,src4d,result4d);
|
||||
double t2=usecond();
|
||||
double ncall=CG.IterationsToComplete;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
|
||||
@ -98,9 +119,18 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
result=zero;
|
||||
{
|
||||
Ds.ZeroCounters();
|
||||
double t1=usecond();
|
||||
CG(HermOp,src,result);
|
||||
double t2=usecond();
|
||||
double ncall=CG.IterationsToComplete;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
Ds.Report();
|
||||
}
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
@ -108,7 +138,16 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
result=zero;
|
||||
Ds.ZeroCounters();
|
||||
{
|
||||
double t1=usecond();
|
||||
mCG(HermOp,src,result);
|
||||
double t2=usecond();
|
||||
double ncall=CG.IterationsToComplete;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
}
|
||||
Ds.Report();
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
@ -117,7 +156,16 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
result=zero;
|
||||
Ds.ZeroCounters();
|
||||
{
|
||||
double t1=usecond();
|
||||
BCGrQ(HermOp,src,result);
|
||||
double t2=usecond();
|
||||
double ncall=CG.IterationsToComplete;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
}
|
||||
Ds.Report();
|
||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||
|
||||
|
@ -71,7 +71,10 @@ int main (int argc, char ** argv)
|
||||
}
|
||||
|
||||
RealD mass=0.003;
|
||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||
|
||||
FermionField res_o(&RBGrid);
|
||||
FermionField src_o(&RBGrid);
|
||||
@ -80,7 +83,19 @@ int main (int argc, char ** argv)
|
||||
|
||||
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||
double t1=usecond();
|
||||
CG(HermOpEO,src_o,res_o);
|
||||
double t2=usecond();
|
||||
|
||||
// Schur solver: uses DeoDoe => volume * 1146
|
||||
double ncall=CG.IterationsToComplete;
|
||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
|
||||
|
||||
|
||||
FermionField tmp(&RBGrid);
|
||||
|
||||
|
@ -65,7 +65,10 @@ int main (int argc, char ** argv)
|
||||
FermionField resid(&Grid);
|
||||
|
||||
RealD mass=0.1;
|
||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||
|
||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
|
||||
|
@ -73,7 +73,10 @@ int main (int argc, char ** argv)
|
||||
}
|
||||
|
||||
RealD mass=0.1;
|
||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||
|
||||
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
|
||||
ConjugateGradient<FermionField> CG(1.0e-6,10000);
|
||||
|
121
tests/solver/Test_staggered_multishift.cc
Normal file
121
tests/solver/Test_staggered_multishift.cc
Normal file
@ -0,0 +1,121 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_wilson_cg_unprec.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
|
||||
template<class d>
|
||||
struct scal {
|
||||
d internal;
|
||||
};
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
|
||||
typename ImprovedStaggeredFermionR::ImplParams params;
|
||||
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
std::vector<int> latt_size = GridDefaultLatt();
|
||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian RBGrid(&Grid);
|
||||
|
||||
std::vector<int> seeds({1,2,3,4});
|
||||
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
|
||||
|
||||
|
||||
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
|
||||
|
||||
double volume=1;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
volume=volume*latt_size[mu];
|
||||
}
|
||||
|
||||
////////////////////////////////////////
|
||||
// sqrt
|
||||
////////////////////////////////////////
|
||||
double lo=0.001;
|
||||
double hi=1.0;
|
||||
int precision=64;
|
||||
int degree=10;
|
||||
AlgRemez remez(lo,hi,precision);
|
||||
remez.generateApprox(degree,1,2);
|
||||
MultiShiftFunction Sqrt(remez,1.0e-6,false);
|
||||
std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
|
||||
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Setup staggered
|
||||
////////////////////////////////////////////
|
||||
RealD mass=0.003;
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
|
||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
||||
|
||||
FermionField src(&Grid); random(pRNG,src);
|
||||
FermionField src_o(&RBGrid);
|
||||
pickCheckerboard(Odd,src_o,src);
|
||||
|
||||
|
||||
/////////////////////////////////
|
||||
//Multishift CG
|
||||
/////////////////////////////////
|
||||
std::vector<FermionField> result(degree,&RBGrid);
|
||||
ConjugateGradientMultiShift<FermionField> MSCG(10000,Sqrt);
|
||||
|
||||
double deodoe_flops=(1205+15*degree)*volume; // == 66*16 + == 1146
|
||||
|
||||
double t1=usecond();
|
||||
MSCG(HermOpEO,src_o,result);
|
||||
double t2=usecond();
|
||||
double ncall=MSCG.IterationsToComplete;
|
||||
double flops = deodoe_flops * ncall;
|
||||
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||
// HermOpEO.Report();
|
||||
|
||||
Grid_finalize();
|
||||
}
|
Loading…
Reference in New Issue
Block a user