mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'feature/staggered-comms-compute' into develop
This commit is contained in:
commit
013ea4e8d1
@ -51,7 +51,7 @@ namespace Grid {
|
|||||||
|
|
||||||
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
|
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
|
||||||
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
|
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
|
||||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
|
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
|
||||||
virtual void HermOp(const Field &in, Field &out)=0;
|
virtual void HermOp(const Field &in, Field &out)=0;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -309,36 +309,59 @@ namespace Grid {
|
|||||||
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
|
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
|
||||||
protected:
|
protected:
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
|
Field tmp;
|
||||||
|
RealD mass;
|
||||||
|
double tMpc;
|
||||||
|
double tIP;
|
||||||
|
double tMeo;
|
||||||
|
double taxpby_norm;
|
||||||
|
uint64_t ncall;
|
||||||
public:
|
public:
|
||||||
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
|
void Report(void)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
|
||||||
|
}
|
||||||
|
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
|
||||||
|
{
|
||||||
|
assert( _Mat.isTrivialEE() );
|
||||||
|
mass = _Mat.Mass();
|
||||||
|
tMpc=0;
|
||||||
|
tIP =0;
|
||||||
|
tMeo=0;
|
||||||
|
taxpby_norm=0;
|
||||||
|
ncall=0;
|
||||||
|
}
|
||||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
GridLogIterative.TimingMode(1);
|
ncall++;
|
||||||
std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
|
tMpc-=usecond();
|
||||||
n2 = Mpc(in,out);
|
n2 = Mpc(in,out);
|
||||||
std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
|
tMpc+=usecond();
|
||||||
|
tIP-=usecond();
|
||||||
ComplexD dot= innerProduct(in,out);
|
ComplexD dot= innerProduct(in,out);
|
||||||
std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
|
tIP+=usecond();
|
||||||
n1 = real(dot);
|
n1 = real(dot);
|
||||||
}
|
}
|
||||||
virtual void HermOp(const Field &in, Field &out){
|
virtual void HermOp(const Field &in, Field &out){
|
||||||
std::cout << GridLogIterative << " HermOp "<<std::endl;
|
ncall++;
|
||||||
Mpc(in,out);
|
tMpc-=usecond();
|
||||||
|
_Mat.Meooe(in,out);
|
||||||
|
_Mat.Meooe(out,tmp);
|
||||||
|
tMpc+=usecond();
|
||||||
|
taxpby_norm-=usecond();
|
||||||
|
axpby(out,-1.0,mass*mass,tmp,in);
|
||||||
|
taxpby_norm+=usecond();
|
||||||
}
|
}
|
||||||
virtual RealD Mpc (const Field &in, Field &out) {
|
virtual RealD Mpc (const Field &in, Field &out) {
|
||||||
Field tmp(in._grid);
|
tMeo-=usecond();
|
||||||
Field tmp2(in._grid);
|
|
||||||
|
|
||||||
std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
|
|
||||||
_Mat.Mooee(in,out);
|
|
||||||
_Mat.Mooee(out,tmp);
|
|
||||||
std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
|
|
||||||
|
|
||||||
_Mat.Meooe(in,out);
|
_Mat.Meooe(in,out);
|
||||||
_Mat.Meooe(out,tmp2);
|
_Mat.Meooe(out,tmp);
|
||||||
std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
|
tMeo+=usecond();
|
||||||
|
taxpby_norm-=usecond();
|
||||||
RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
|
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
|
||||||
std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
|
taxpby_norm+=usecond();
|
||||||
return nn;
|
return nn;
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag (const Field &in, Field &out){
|
virtual RealD MpcDag (const Field &in, Field &out){
|
||||||
|
@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
|
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
||||||
|
|
||||||
|
|
||||||
psi.checkerboard = src.checkerboard;
|
psi.checkerboard = src.checkerboard;
|
||||||
conformable(psi, src);
|
conformable(psi, src);
|
||||||
|
|
||||||
@ -70,7 +71,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
|
|
||||||
Linop.HermOpAndNorm(psi, mmp, d, b);
|
Linop.HermOpAndNorm(psi, mmp, d, b);
|
||||||
|
|
||||||
|
|
||||||
r = src - mmp;
|
r = src - mmp;
|
||||||
p = r;
|
p = r;
|
||||||
|
|
||||||
@ -96,38 +96,44 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
||||||
|
|
||||||
GridStopWatch LinalgTimer;
|
GridStopWatch LinalgTimer;
|
||||||
|
GridStopWatch InnerTimer;
|
||||||
|
GridStopWatch AxpyNormTimer;
|
||||||
|
GridStopWatch LinearCombTimer;
|
||||||
GridStopWatch MatrixTimer;
|
GridStopWatch MatrixTimer;
|
||||||
GridStopWatch SolverTimer;
|
GridStopWatch SolverTimer;
|
||||||
|
|
||||||
SolverTimer.Start();
|
SolverTimer.Start();
|
||||||
int k;
|
int k;
|
||||||
for (k = 1; k <= MaxIterations; k++) {
|
for (k = 1; k <= MaxIterations*1000; k++) {
|
||||||
c = cp;
|
c = cp;
|
||||||
|
|
||||||
MatrixTimer.Start();
|
MatrixTimer.Start();
|
||||||
Linop.HermOpAndNorm(p, mmp, d, qq);
|
Linop.HermOp(p, mmp);
|
||||||
MatrixTimer.Stop();
|
MatrixTimer.Stop();
|
||||||
|
|
||||||
LinalgTimer.Start();
|
LinalgTimer.Start();
|
||||||
// RealD qqck = norm2(mmp);
|
|
||||||
// ComplexD dck = innerProduct(p,mmp);
|
|
||||||
|
|
||||||
|
InnerTimer.Start();
|
||||||
|
ComplexD dc = innerProduct(p,mmp);
|
||||||
|
InnerTimer.Stop();
|
||||||
|
d = dc.real();
|
||||||
a = c / d;
|
a = c / d;
|
||||||
b_pred = a * (a * qq - d) / c;
|
|
||||||
|
|
||||||
|
AxpyNormTimer.Start();
|
||||||
cp = axpy_norm(r, -a, mmp, r);
|
cp = axpy_norm(r, -a, mmp, r);
|
||||||
|
AxpyNormTimer.Stop();
|
||||||
b = cp / c;
|
b = cp / c;
|
||||||
|
|
||||||
// Fuse these loops ; should be really easy
|
LinearCombTimer.Start();
|
||||||
psi = a * p + psi;
|
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
|
||||||
p = p * b + r;
|
vstream(psi[ss], a * p[ss] + psi[ss]);
|
||||||
|
vstream(p [ss], b * p[ss] + r[ss]);
|
||||||
|
}
|
||||||
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
||||||
<< " residual " << cp << " target " << rsq << std::endl;
|
<< " residual " << cp << " target " << rsq << std::endl;
|
||||||
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
|
|
||||||
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
|
|
||||||
|
|
||||||
// Stopping condition
|
// Stopping condition
|
||||||
if (cp <= rsq) {
|
if (cp <= rsq) {
|
||||||
@ -148,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
||||||
|
|
||||||
|
@ -43,6 +43,7 @@ namespace Grid {
|
|||||||
public:
|
public:
|
||||||
RealD Tolerance;
|
RealD Tolerance;
|
||||||
Integer MaxIterations;
|
Integer MaxIterations;
|
||||||
|
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
||||||
int verbose;
|
int verbose;
|
||||||
MultiShiftFunction shifts;
|
MultiShiftFunction shifts;
|
||||||
|
|
||||||
@ -164,6 +165,15 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
|
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////
|
||||||
|
// Timers
|
||||||
|
///////////////////////////////////////
|
||||||
|
GridStopWatch AXPYTimer;
|
||||||
|
GridStopWatch ShiftTimer;
|
||||||
|
GridStopWatch QRTimer;
|
||||||
|
GridStopWatch MatrixTimer;
|
||||||
|
GridStopWatch SolverTimer;
|
||||||
|
SolverTimer.Start();
|
||||||
|
|
||||||
// Iteration loop
|
// Iteration loop
|
||||||
int k;
|
int k;
|
||||||
@ -171,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
for (k=1;k<=MaxIterations;k++){
|
for (k=1;k<=MaxIterations;k++){
|
||||||
|
|
||||||
a = c /cp;
|
a = c /cp;
|
||||||
|
AXPYTimer.Start();
|
||||||
axpy(p,a,p,r);
|
axpy(p,a,p,r);
|
||||||
|
AXPYTimer.Stop();
|
||||||
|
|
||||||
// Note to self - direction ps is iterated seperately
|
// Note to self - direction ps is iterated seperately
|
||||||
// for each shift. Does not appear to have any scope
|
// for each shift. Does not appear to have any scope
|
||||||
@ -180,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
// However SAME r is used. Could load "r" and update
|
// However SAME r is used. Could load "r" and update
|
||||||
// ALL ps[s]. 2/3 Bandwidth saving
|
// ALL ps[s]. 2/3 Bandwidth saving
|
||||||
// New Kernel: Load r, vector of coeffs, vector of pointers ps
|
// New Kernel: Load r, vector of coeffs, vector of pointers ps
|
||||||
|
AXPYTimer.Start();
|
||||||
for(int s=0;s<nshift;s++){
|
for(int s=0;s<nshift;s++){
|
||||||
if ( ! converged[s] ) {
|
if ( ! converged[s] ) {
|
||||||
if (s==0){
|
if (s==0){
|
||||||
@ -190,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
AXPYTimer.Stop();
|
||||||
|
|
||||||
cp=c;
|
cp=c;
|
||||||
|
MatrixTimer.Start();
|
||||||
|
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
|
||||||
|
// The below is faster on KNL
|
||||||
|
Linop.HermOp(p,mmp);
|
||||||
|
d=real(innerProduct(p,mmp));
|
||||||
|
|
||||||
Linop.HermOpAndNorm(p,mmp,d,qq);
|
MatrixTimer.Stop();
|
||||||
|
|
||||||
|
AXPYTimer.Start();
|
||||||
axpy(mmp,mass[0],p,mmp);
|
axpy(mmp,mass[0],p,mmp);
|
||||||
|
AXPYTimer.Stop();
|
||||||
RealD rn = norm2(p);
|
RealD rn = norm2(p);
|
||||||
d += rn*mass[0];
|
d += rn*mass[0];
|
||||||
|
|
||||||
bp=b;
|
bp=b;
|
||||||
b=-cp/d;
|
b=-cp/d;
|
||||||
|
|
||||||
|
AXPYTimer.Start();
|
||||||
c=axpy_norm(r,b,mmp,r);
|
c=axpy_norm(r,b,mmp,r);
|
||||||
|
AXPYTimer.Stop();
|
||||||
|
|
||||||
// Toggle the recurrence history
|
// Toggle the recurrence history
|
||||||
bs[0] = b;
|
bs[0] = b;
|
||||||
iz = 1-iz;
|
iz = 1-iz;
|
||||||
|
ShiftTimer.Start();
|
||||||
for(int s=1;s<nshift;s++){
|
for(int s=1;s<nshift;s++){
|
||||||
if((!converged[s])){
|
if((!converged[s])){
|
||||||
RealD z0 = z[s][1-iz];
|
RealD z0 = z[s][1-iz];
|
||||||
@ -215,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
|
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ShiftTimer.Stop();
|
||||||
|
|
||||||
for(int s=0;s<nshift;s++){
|
for(int s=0;s<nshift;s++){
|
||||||
int ss = s;
|
int ss = s;
|
||||||
@ -257,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
|
|
||||||
if ( all_converged ){
|
if ( all_converged ){
|
||||||
|
|
||||||
|
SolverTimer.Stop();
|
||||||
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
|
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
|
||||||
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
|
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
|
||||||
|
|
||||||
@ -269,8 +298,19 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
RealD cn = norm2(src);
|
RealD cn = norm2(src);
|
||||||
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
|
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
|
IterationsToComplete = k;
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
// ugly hack
|
// ugly hack
|
||||||
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
|
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
|
||||||
|
@ -244,19 +244,11 @@ namespace Grid {
|
|||||||
|
|
||||||
template<class sobj,class vobj> strong_inline
|
template<class sobj,class vobj> strong_inline
|
||||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||||
ret.checkerboard = x.checkerboard;
|
return axpy_norm_fast(ret,a,x,y);
|
||||||
conformable(ret,x);
|
|
||||||
conformable(x,y);
|
|
||||||
axpy(ret,a,x,y);
|
|
||||||
return norm2(ret);
|
|
||||||
}
|
}
|
||||||
template<class sobj,class vobj> strong_inline
|
template<class sobj,class vobj> strong_inline
|
||||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||||
ret.checkerboard = x.checkerboard;
|
return axpby_norm_fast(ret,a,b,x,y);
|
||||||
conformable(ret,x);
|
|
||||||
conformable(x,y);
|
|
||||||
axpby(ret,a,b,x,y);
|
|
||||||
return norm2(ret); // FIXME implement parallel norm in ss loop
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -33,7 +33,7 @@ namespace Grid {
|
|||||||
// Deterministic Reduction operations
|
// Deterministic Reduction operations
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
||||||
ComplexD nrm = innerProduct(arg,arg);
|
auto nrm = innerProduct(arg,arg);
|
||||||
return std::real(nrm);
|
return std::real(nrm);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,32 +43,85 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
{
|
{
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_typeD vector_type;
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
scalar_type nrm;
|
|
||||||
|
|
||||||
GridBase *grid = left._grid;
|
GridBase *grid = left._grid;
|
||||||
|
const int pad = 8;
|
||||||
|
|
||||||
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
|
ComplexD inner;
|
||||||
|
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
|
||||||
|
|
||||||
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||||
int nwork, mywork, myoff;
|
int nwork, mywork, myoff;
|
||||||
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
|
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
|
||||||
|
|
||||||
decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
|
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
|
||||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
|
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
|
||||||
}
|
}
|
||||||
sumarray[thr]=TensorRemove(vnrm) ;
|
// All threads sum across SIMD; reduce serial work at end
|
||||||
|
// one write per cacheline with streaming store
|
||||||
|
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
|
||||||
|
vstream(sumarray[thr*pad],tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
vector_type vvnrm; vvnrm=zero; // sum across threads
|
inner=0.0;
|
||||||
for(int i=0;i<grid->SumArraySize();i++){
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
vvnrm = vvnrm+sumarray[i];
|
inner = inner+sumarray[i*pad];
|
||||||
}
|
}
|
||||||
nrm = Reduce(vvnrm);// sum across simd
|
right._grid->GlobalSum(inner);
|
||||||
right._grid->GlobalSum(nrm);
|
return inner;
|
||||||
|
}
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
// Fast axpby_norm
|
||||||
|
// z = a x + b y
|
||||||
|
// return norm z
|
||||||
|
/////////////////////////
|
||||||
|
template<class sobj,class vobj> strong_inline RealD
|
||||||
|
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||||
|
{
|
||||||
|
sobj one(1.0);
|
||||||
|
return axpby_norm_fast(z,a,one,x,y);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class sobj,class vobj> strong_inline RealD
|
||||||
|
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||||
|
{
|
||||||
|
const int pad = 8;
|
||||||
|
z.checkerboard = x.checkerboard;
|
||||||
|
conformable(z,x);
|
||||||
|
conformable(x,y);
|
||||||
|
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
|
RealD nrm;
|
||||||
|
|
||||||
|
GridBase *grid = x._grid;
|
||||||
|
|
||||||
|
Vector<RealD> sumarray(grid->SumArraySize()*pad);
|
||||||
|
|
||||||
|
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||||
|
int nwork, mywork, myoff;
|
||||||
|
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
|
||||||
|
|
||||||
|
// private to thread; sub summation
|
||||||
|
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
|
||||||
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
|
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
|
||||||
|
vnrm = vnrm + innerProductD(tmp,tmp);
|
||||||
|
vstream(z._odata[ss],tmp);
|
||||||
|
}
|
||||||
|
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
nrm = 0.0; // sum across threads; linear in thread count but fast
|
||||||
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
|
nrm = nrm+sumarray[i*pad];
|
||||||
|
}
|
||||||
|
z._grid->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Op,class T1>
|
template<class Op,class T1>
|
||||||
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
||||||
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
|
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
|
||||||
|
@ -49,7 +49,8 @@ inline double usecond(void) {
|
|||||||
|
|
||||||
typedef std::chrono::system_clock GridClock;
|
typedef std::chrono::system_clock GridClock;
|
||||||
typedef std::chrono::time_point<GridClock> GridTimePoint;
|
typedef std::chrono::time_point<GridClock> GridTimePoint;
|
||||||
typedef std::chrono::milliseconds GridTime;
|
typedef std::chrono::milliseconds GridMillisecs;
|
||||||
|
typedef std::chrono::microseconds GridTime;
|
||||||
typedef std::chrono::microseconds GridUsecs;
|
typedef std::chrono::microseconds GridUsecs;
|
||||||
|
|
||||||
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
|
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
|
||||||
@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli
|
|||||||
stream << time.count()<<" ms";
|
stream << time.count()<<" ms";
|
||||||
return stream;
|
return stream;
|
||||||
}
|
}
|
||||||
|
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
|
||||||
|
{
|
||||||
|
stream << time.count()<<" usec";
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
|
||||||
class GridStopWatch {
|
class GridStopWatch {
|
||||||
private:
|
private:
|
||||||
|
@ -63,9 +63,12 @@ namespace Grid {
|
|||||||
virtual RealD M (const FermionField &in, FermionField &out)=0;
|
virtual RealD M (const FermionField &in, FermionField &out)=0;
|
||||||
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
|
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
|
||||||
|
|
||||||
// half checkerboard operaions
|
// Query the even even properties to make algorithmic decisions
|
||||||
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
|
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
|
||||||
|
virtual int isTrivialEE(void) { return 0; };
|
||||||
|
virtual RealD Mass(void) {return 0.0;};
|
||||||
|
|
||||||
|
// half checkerboard operaions
|
||||||
virtual void Meooe (const FermionField &in, FermionField &out)=0;
|
virtual void Meooe (const FermionField &in, FermionField &out)=0;
|
||||||
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
|
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
|
||||||
virtual void Mooee (const FermionField &in, FermionField &out)=0;
|
virtual void Mooee (const FermionField &in, FermionField &out)=0;
|
||||||
|
@ -765,6 +765,11 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
reg = memory;
|
reg = memory;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void InsertGaugeField(DoubledGaugeField &U_ds,
|
||||||
|
const GaugeLinkField &U,int mu)
|
||||||
|
{
|
||||||
|
PokeIndex<LorentzIndex>(U_ds, U, mu);
|
||||||
|
}
|
||||||
inline void DoubleStore(GridBase *GaugeGrid,
|
inline void DoubleStore(GridBase *GaugeGrid,
|
||||||
DoubledGaugeField &UUUds, // for Naik term
|
DoubledGaugeField &UUUds, // for Naik term
|
||||||
DoubledGaugeField &Uds,
|
DoubledGaugeField &Uds,
|
||||||
@ -803,8 +808,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
U = U *phases;
|
U = U *phases;
|
||||||
Udag = Udag *phases;
|
Udag = Udag *phases;
|
||||||
|
|
||||||
PokeIndex<LorentzIndex>(Uds, U, mu);
|
InsertGaugeField(Uds,U,mu);
|
||||||
PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
InsertGaugeField(Uds,Udag,mu+4);
|
||||||
|
// PokeIndex<LorentzIndex>(Uds, U, mu);
|
||||||
|
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
||||||
|
|
||||||
// 3 hop based on thin links. Crazy huh ?
|
// 3 hop based on thin links. Crazy huh ?
|
||||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||||
@ -816,8 +823,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
UUU = UUU *phases;
|
UUU = UUU *phases;
|
||||||
UUUdag = UUUdag *phases;
|
UUUdag = UUUdag *phases;
|
||||||
|
|
||||||
PokeIndex<LorentzIndex>(UUUds, UUU, mu);
|
InsertGaugeField(UUUds,UUU,mu);
|
||||||
PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4);
|
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -910,6 +917,23 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
mac(&phi(), &UU(), &chi());
|
mac(&phi(), &UU(), &chi());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
||||||
|
{
|
||||||
|
GridBase *GaugeGrid = U_ds._grid;
|
||||||
|
parallel_for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
||||||
|
|
||||||
|
SiteScalarGaugeLink ScalarU;
|
||||||
|
SiteDoubledGaugeField ScalarUds;
|
||||||
|
|
||||||
|
std::vector<int> lcoor;
|
||||||
|
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
peekLocalSite(ScalarUds, U_ds, lcoor);
|
||||||
|
|
||||||
|
peekLocalSite(ScalarU, U, lcoor);
|
||||||
|
ScalarUds(mu) = ScalarU();
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
inline void DoubleStore(GridBase *GaugeGrid,
|
inline void DoubleStore(GridBase *GaugeGrid,
|
||||||
DoubledGaugeField &UUUds, // for Naik term
|
DoubledGaugeField &UUUds, // for Naik term
|
||||||
DoubledGaugeField &Uds,
|
DoubledGaugeField &Uds,
|
||||||
@ -951,23 +975,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
U = U *phases;
|
U = U *phases;
|
||||||
Udag = Udag *phases;
|
Udag = Udag *phases;
|
||||||
|
|
||||||
|
InsertGaugeField(Uds,U,mu);
|
||||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
InsertGaugeField(Uds,Udag,mu+4);
|
||||||
SiteScalarGaugeLink ScalarU;
|
|
||||||
SiteDoubledGaugeField ScalarUds;
|
|
||||||
|
|
||||||
std::vector<int> lcoor;
|
|
||||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
|
||||||
peekLocalSite(ScalarUds, Uds, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, U, lcoor);
|
|
||||||
ScalarUds(mu) = ScalarU();
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, Udag, lcoor);
|
|
||||||
ScalarUds(mu + 4) = ScalarU();
|
|
||||||
|
|
||||||
pokeLocalSite(ScalarUds, Uds, lcoor);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3 hop based on thin links. Crazy huh ?
|
// 3 hop based on thin links. Crazy huh ?
|
||||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||||
@ -979,24 +988,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
UUU = UUU *phases;
|
UUU = UUU *phases;
|
||||||
UUUdag = UUUdag *phases;
|
UUUdag = UUUdag *phases;
|
||||||
|
|
||||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
InsertGaugeField(UUUds,UUU,mu);
|
||||||
|
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||||
SiteScalarGaugeLink ScalarU;
|
|
||||||
SiteDoubledGaugeField ScalarUds;
|
|
||||||
|
|
||||||
std::vector<int> lcoor;
|
|
||||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(ScalarUds, UUUds, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, UUU, lcoor);
|
|
||||||
ScalarUds(mu) = ScalarU();
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, UUUdag, lcoor);
|
|
||||||
ScalarUds(mu + 4) = ScalarU();
|
|
||||||
|
|
||||||
pokeLocalSite(ScalarUds, UUUds, lcoor);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
||||||
RealD _mass,
|
RealD _mass,
|
||||||
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
const ImplParams &p)
|
const ImplParams &p)
|
||||||
: Kernels(p),
|
: Kernels(p),
|
||||||
_grid(&Fgrid),
|
_grid(&Fgrid),
|
||||||
@ -62,6 +63,16 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
|
|||||||
UUUmuOdd(&Hgrid) ,
|
UUUmuOdd(&Hgrid) ,
|
||||||
_tmp(&Hgrid)
|
_tmp(&Hgrid)
|
||||||
{
|
{
|
||||||
|
int vol4;
|
||||||
|
int LLs=1;
|
||||||
|
c1=_c1;
|
||||||
|
c2=_c2;
|
||||||
|
u0=_u0;
|
||||||
|
vol4= _grid->oSites();
|
||||||
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
vol4= _cbgrid->oSites();
|
||||||
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -69,22 +80,10 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
|
|||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
RealD _c1, RealD _c2,RealD _u0,
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
const ImplParams &p)
|
const ImplParams &p)
|
||||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
|
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
|
||||||
{
|
{
|
||||||
c1=_c1;
|
|
||||||
c2=_c2;
|
|
||||||
u0=_u0;
|
|
||||||
ImportGauge(_Uthin,_Ufat);
|
ImportGauge(_Uthin,_Ufat);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
|
||||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
|
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
|
||||||
const ImplParams &p)
|
|
||||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
|
|
||||||
{
|
|
||||||
ImportGaugeSimple(_Utriple,_Ufat);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// Momentum space propagator should be
|
// Momentum space propagator should be
|
||||||
@ -98,11 +97,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,Gaug
|
|||||||
// of above link to implmement fourier based solver.
|
// of above link to implmement fourier based solver.
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)
|
|
||||||
{
|
|
||||||
ImportGauge(_Uthin,_Uthin);
|
|
||||||
};
|
|
||||||
template <class Impl>
|
|
||||||
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||||
{
|
{
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
@ -125,6 +119,20 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl
|
|||||||
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
|
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||||
|
{
|
||||||
|
|
||||||
|
Umu = _U;
|
||||||
|
UUUmu = _UUU;
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
|
||||||
|
{
|
||||||
pickCheckerboard(Even, UmuEven, Umu);
|
pickCheckerboard(Even, UmuEven, Umu);
|
||||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||||
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||||
@ -160,10 +168,7 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
|
|||||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||||
}
|
}
|
||||||
|
|
||||||
pickCheckerboard(Even, UmuEven, Umu);
|
CopyGaugeCheckerboards();
|
||||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
|
||||||
pickCheckerboard(Even, UUUmuEven, UUUmu);
|
|
||||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@ -322,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
|
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
|
||||||
|
DhopCalls+=2;
|
||||||
conformable(in._grid, _grid); // verifies full grid
|
conformable(in._grid, _grid); // verifies full grid
|
||||||
conformable(in._grid, out._grid);
|
conformable(in._grid, out._grid);
|
||||||
|
|
||||||
@ -332,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
|
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid, _cbgrid); // verifies half grid
|
conformable(in._grid, _cbgrid); // verifies half grid
|
||||||
conformable(in._grid, out._grid); // drops the cb check
|
conformable(in._grid, out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -343,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
|
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid, _cbgrid); // verifies half grid
|
conformable(in._grid, _cbgrid); // verifies half grid
|
||||||
conformable(in._grid, out._grid); // drops the cb check
|
conformable(in._grid, out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -374,25 +382,193 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
|
|||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag) {
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
Compressor compressor;
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
const int LLs = 1;
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
st.CommsMergeSHM(compressor);
|
||||||
|
DhopFaceTime += usecond();
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = len;
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
int sU = ss;
|
||||||
|
// Interior = 1; Exterior = 0; must implement for staggered
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
// Interior = 1; Exterior = 0;
|
||||||
|
int sU = ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
st.CommsMerge(compressor);
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
|
||||||
|
DhopComputeTime2 -= usecond();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime2 += usecond();
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
|
||||||
|
DhopCommTime -= usecond();
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
|
||||||
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
|
||||||
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Reporting
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::Report(void)
|
||||||
|
{
|
||||||
|
std::vector<int> latt = GridDefaultLatt();
|
||||||
|
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
RealD NP = _grid->_Nprocessors;
|
||||||
|
RealD NN = _grid->NodeCount();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
|
||||||
|
<< DhopCalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
|
||||||
|
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
|
||||||
|
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
|
||||||
|
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Average the compute time
|
||||||
|
_grid->GlobalSum(DhopComputeTime);
|
||||||
|
DhopComputeTime/=NP;
|
||||||
|
|
||||||
|
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||||
|
|
||||||
|
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||||
|
}
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
|
||||||
|
{
|
||||||
|
DhopCalls = 0;
|
||||||
|
DhopTotalTime = 0;
|
||||||
|
DhopCommTime = 0;
|
||||||
|
DhopComputeTime = 0;
|
||||||
|
DhopFaceTime = 0;
|
||||||
|
|
||||||
|
Stencil.ZeroCounters();
|
||||||
|
StencilEven.ZeroCounters();
|
||||||
|
StencilOdd.ZeroCounters();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
// Conserved current - not yet implemented.
|
// Conserved current - not yet implemented.
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
|
@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
|||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Performance monitoring
|
||||||
|
////////////////////////////////////////
|
||||||
|
void Report(void);
|
||||||
|
void ZeroCounters(void);
|
||||||
|
double DhopTotalTime;
|
||||||
|
double DhopCalls;
|
||||||
|
double DhopCommTime;
|
||||||
|
double DhopComputeTime;
|
||||||
|
double DhopComputeTime2;
|
||||||
|
double DhopFaceTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
@ -105,25 +117,34 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
|||||||
|
|
||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
// Constructor
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// Grid own interface Constructor
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
|
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
const ImplParams &p = ImplParams());
|
|
||||||
|
|
||||||
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
|
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
|
||||||
const ImplParams &p = ImplParams());
|
const ImplParams &p = ImplParams());
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// MILC constructor no gauge fields
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
|
ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
|
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
|
||||||
const ImplParams &p = ImplParams());
|
const ImplParams &p = ImplParams());
|
||||||
|
|
||||||
|
|
||||||
// DoubleStore impl dependent
|
// DoubleStore impl dependent
|
||||||
void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
|
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
|
||||||
void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
|
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
|
||||||
void ImportGauge(const GaugeField &_Uthin);
|
void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U);
|
||||||
|
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
|
||||||
|
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||||
|
DoubledGaugeField &GetUUU(void) { return UUUmu; };
|
||||||
|
void CopyGaugeCheckerboards(void);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Data members require to support the functionality
|
// Data members require to support the functionality
|
||||||
@ -132,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
|||||||
// protected:
|
// protected:
|
||||||
public:
|
public:
|
||||||
// any other parameters of action ???
|
// any other parameters of action ???
|
||||||
|
virtual int isTrivialEE(void) { return 1; };
|
||||||
|
virtual RealD Mass(void) { return mass; }
|
||||||
RealD mass;
|
RealD mass;
|
||||||
RealD u0;
|
RealD u0;
|
||||||
RealD c1;
|
RealD c1;
|
||||||
|
@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3,
|
|||||||
|
|
||||||
// 5d lattice for DWF.
|
// 5d lattice for DWF.
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
|
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
|
||||||
GridCartesian &FiveDimGrid,
|
|
||||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
GridCartesian &FourDimGrid,
|
GridCartesian &FourDimGrid,
|
||||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
@ -121,16 +120,74 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
|||||||
assert(FiveDimGrid._simd_layout[0] ==1);
|
assert(FiveDimGrid._simd_layout[0] ==1);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
int LLs = FiveDimGrid._rdimensions[0];
|
||||||
|
int vol4= FourDimGrid.oSites();
|
||||||
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
|
||||||
// Allocate the required comms buffer
|
vol4=FourDimRedBlackGrid.oSites();
|
||||||
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
|
||||||
|
{
|
||||||
|
pickCheckerboard(Even, UmuEven, Umu);
|
||||||
|
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||||
|
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||||
|
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
||||||
|
}
|
||||||
|
template<class Impl>
|
||||||
|
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
RealD _mass,
|
||||||
|
RealD _c1,RealD _c2, RealD _u0,
|
||||||
|
const ImplParams &p) :
|
||||||
|
ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
|
||||||
|
FourDimGrid,FourDimRedBlackGrid,
|
||||||
|
_mass,_c1,_c2,_u0,p)
|
||||||
|
{
|
||||||
ImportGauge(_Uthin,_Ufat);
|
ImportGauge(_Uthin,_Ufat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// For MILC use; pass three link U's and 1 link U
|
||||||
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)
|
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||||
{
|
{
|
||||||
ImportGauge(_Uthin,_Uthin);
|
/////////////////////////////////////////////////////////////////
|
||||||
};
|
// Trivial import; phases and fattening and such like preapplied
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
|
|
||||||
|
auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
|
||||||
|
Impl::InsertGaugeField(UUUmu,U,mu);
|
||||||
|
|
||||||
|
U = adj( Cshift(U, mu, -3));
|
||||||
|
Impl::InsertGaugeField(UUUmu,-U,mu+4);
|
||||||
|
|
||||||
|
U = PeekIndex<LorentzIndex>(_Ufat, mu);
|
||||||
|
Impl::InsertGaugeField(Umu,U,mu);
|
||||||
|
|
||||||
|
U = adj( Cshift(U, mu, -1));
|
||||||
|
Impl::InsertGaugeField(Umu,-U,mu+4);
|
||||||
|
|
||||||
|
}
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||||
|
{
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Trivial import; phases and fattening and such like preapplied
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
Umu = _U;
|
||||||
|
UUUmu = _UUU;
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
|
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
|
||||||
{
|
{
|
||||||
@ -159,10 +216,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
|
|||||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||||
}
|
}
|
||||||
|
|
||||||
pickCheckerboard(Even, UmuEven, Umu);
|
CopyGaugeCheckerboards();
|
||||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
|
||||||
pickCheckerboard(Even, UUUmuEven, UUUmu);
|
|
||||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
|
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
|
||||||
@ -223,6 +277,162 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*CHANGE */
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
|
||||||
|
Compressor compressor;
|
||||||
|
|
||||||
|
int LLs = in._grid->_rdimensions[0];
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||||
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
double ctime=0;
|
||||||
|
double ptime=0;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||||
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
double start = usecond();
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = U._grid->oSites(); // 4d vol
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
int sU = ss;
|
||||||
|
// Interior = 1; Exterior = 0; must implement for staggered
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
// Interior = 1; Exterior = 0;
|
||||||
|
int sU = ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ptime = usecond() - start;
|
||||||
|
} else {
|
||||||
|
double start = usecond();
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
ctime = usecond() - start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopCommTime += ctime;
|
||||||
|
DhopComputeTime+=ptime;
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
st.CollateThreads();
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
|
st.CommsMerge(compressor);
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
DhopComputeTime2-=usecond();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime2+=usecond();
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
Compressor compressor;
|
||||||
|
int LLs = in._grid->_rdimensions[0];
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//double t1=usecond();
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
DhopCommTime -= usecond();
|
||||||
|
st.HaloExchange(in,compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
|
//double t2=usecond();
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
/*CHANGE END*/
|
||||||
|
|
||||||
|
/* ORG
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
@ -254,6 +464,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -336,6 +547,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
|||||||
DhopTotalTime = 0;
|
DhopTotalTime = 0;
|
||||||
DhopCommTime = 0;
|
DhopCommTime = 0;
|
||||||
DhopComputeTime = 0;
|
DhopComputeTime = 0;
|
||||||
|
DhopFaceTime = 0;
|
||||||
|
|
||||||
|
|
||||||
Stencil.ZeroCounters();
|
Stencil.ZeroCounters();
|
||||||
StencilEven.ZeroCounters();
|
StencilEven.ZeroCounters();
|
||||||
StencilOdd.ZeroCounters();
|
StencilOdd.ZeroCounters();
|
||||||
|
@ -64,6 +64,8 @@ namespace QCD {
|
|||||||
double DhopCalls;
|
double DhopCalls;
|
||||||
double DhopCommTime;
|
double DhopCommTime;
|
||||||
double DhopComputeTime;
|
double DhopComputeTime;
|
||||||
|
double DhopComputeTime2;
|
||||||
|
double DhopFaceTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
@ -119,7 +121,27 @@ namespace QCD {
|
|||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
|
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out,
|
||||||
|
int dag);
|
||||||
|
|
||||||
|
void DhopInternalSerialComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out,
|
||||||
|
int dag);
|
||||||
|
|
||||||
|
|
||||||
// Constructors
|
// Constructors
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Grid internal interface -- Thin link and fat link, with coefficients
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
||||||
GaugeField &_Ufat,
|
GaugeField &_Ufat,
|
||||||
GridCartesian &FiveDimGrid,
|
GridCartesian &FiveDimGrid,
|
||||||
@ -127,18 +149,38 @@ namespace QCD {
|
|||||||
GridCartesian &FourDimGrid,
|
GridCartesian &FourDimGrid,
|
||||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
double _mass,
|
double _mass,
|
||||||
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
|
const ImplParams &p= ImplParams());
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
double _mass,
|
||||||
|
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
|
||||||
const ImplParams &p= ImplParams());
|
const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
// DoubleStore
|
// DoubleStore gauge field in operator
|
||||||
void ImportGauge(const GaugeField &_U);
|
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
|
||||||
void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
|
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
|
||||||
|
void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
|
||||||
|
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
|
||||||
|
// Give a reference; can be used to do an assignment or copy back out after import
|
||||||
|
// if Carleton wants to cache them and not use the ImportSimple
|
||||||
|
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||||
|
DoubledGaugeField &GetUUU(void) { return UUUmu; };
|
||||||
|
void CopyGaugeCheckerboards(void);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Data members require to support the functionality
|
// Data members require to support the functionality
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
virtual int isTrivialEE(void) { return 1; };
|
||||||
|
virtual RealD Mass(void) { return mass; }
|
||||||
|
|
||||||
GridBase *_FourDimGrid;
|
GridBase *_FourDimGrid;
|
||||||
GridBase *_FourDimRedBlackGrid;
|
GridBase *_FourDimRedBlackGrid;
|
||||||
GridBase *_FiveDimGrid;
|
GridBase *_FiveDimGrid;
|
||||||
|
@ -32,223 +32,241 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
||||||
|
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
|
||||||
|
|
||||||
|
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
|
||||||
|
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||||
|
if (SE->_is_local ) { \
|
||||||
|
if (SE->_permute) { \
|
||||||
|
chi_p = χ \
|
||||||
|
permute(chi, in._odata[SE->_offset], ptype); \
|
||||||
|
} else { \
|
||||||
|
chi_p = &in._odata[SE->_offset]; \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
chi_p = &buf[SE->_offset]; \
|
||||||
|
} \
|
||||||
|
multLink(Uchi, U._odata[sU], *chi_p, Dir);
|
||||||
|
|
||||||
|
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
|
||||||
|
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||||
|
if (SE->_is_local ) { \
|
||||||
|
if (SE->_permute) { \
|
||||||
|
chi_p = χ \
|
||||||
|
permute(chi, in._odata[SE->_offset], ptype); \
|
||||||
|
} else { \
|
||||||
|
chi_p = &in._odata[SE->_offset]; \
|
||||||
|
} \
|
||||||
|
} else if ( st.same_node[Dir] ) { \
|
||||||
|
chi_p = &buf[SE->_offset]; \
|
||||||
|
} \
|
||||||
|
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||||
|
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
|
||||||
|
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||||
|
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||||
|
nmu++; \
|
||||||
|
chi_p = &buf[SE->_offset]; \
|
||||||
|
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
|
||||||
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic implementation; move to different file?
|
// Generic implementation; move to different file?
|
||||||
////////////////////////////////////////////
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
SiteSpinor *buf, int sF,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int dag) {
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
SiteSpinor chi;
|
SiteSpinor chi;
|
||||||
SiteSpinor Uchi;
|
SiteSpinor Uchi;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
int skew = 0;
|
int skew;
|
||||||
if (threeLink) skew=8;
|
|
||||||
///////////////////////////
|
|
||||||
// Xp
|
|
||||||
///////////////////////////
|
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Xp+skew, sF);
|
for(int s=0;s<LLs;s++){
|
||||||
if (SE->_is_local) {
|
int sF=LLs*sU+s;
|
||||||
if (SE->_permute) {
|
skew = 0;
|
||||||
chi_p = χ
|
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||||
} else {
|
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
|
||||||
chi_p = &in._odata[SE->_offset];
|
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( dag ) {
|
||||||
|
Uchi = - Uchi;
|
||||||
}
|
}
|
||||||
} else {
|
vstream(out._odata[sF], Uchi);
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
}
|
||||||
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Yp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Yp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Zp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Zp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Tp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Tp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Xm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Xm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Ym
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Ym+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Zm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Zm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Tm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Tm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = χ
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
|
|
||||||
|
|
||||||
vstream(out, Uchi);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// Only contributions from interior of our node
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
|
const SiteSpinor *chi_p;
|
||||||
|
SiteSpinor chi;
|
||||||
|
SiteSpinor Uchi;
|
||||||
|
StencilEntry *SE;
|
||||||
|
int ptype;
|
||||||
|
int skew ;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=LLs*sU+s;
|
||||||
|
skew = 0;
|
||||||
|
Uchi=zero;
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( dag ) {
|
||||||
|
Uchi = - Uchi;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF], Uchi);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// Only contributions from exterior of our node
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
|
const SiteSpinor *chi_p;
|
||||||
|
SiteSpinor chi;
|
||||||
|
SiteSpinor Uchi;
|
||||||
|
StencilEntry *SE;
|
||||||
|
int ptype;
|
||||||
|
int nmu=0;
|
||||||
|
int skew ;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=LLs*sU+s;
|
||||||
|
skew = 0;
|
||||||
|
Uchi=zero;
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
|
||||||
|
if ( nmu ) {
|
||||||
|
if ( dag ) {
|
||||||
|
out._odata[sF] = out._odata[sF] - Uchi;
|
||||||
|
} else {
|
||||||
|
out._odata[sF] = out._odata[sF] + Uchi;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Driving / wrapping routine to select right kernel
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionField &in, FermionField &out) {
|
const FermionField &in, FermionField &out,
|
||||||
SiteSpinor naik;
|
int interior,int exterior)
|
||||||
SiteSpinor naive;
|
{
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
int dag=1;
|
int dag=1;
|
||||||
switch(Opt) {
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
#ifdef AVX512
|
};
|
||||||
//FIXME; move the sign into the Asm routine
|
|
||||||
case OptInlineAsm:
|
template <class Impl>
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
for(int s=0;s<LLs;s++) {
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
int sF=s+LLs*sU;
|
const FermionField &in, FermionField &out,
|
||||||
out._odata[sF]=-out._odata[sF];
|
int interior,int exterior)
|
||||||
}
|
{
|
||||||
break;
|
int dag=0;
|
||||||
#endif
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
case OptHandUnroll:
|
|
||||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
break;
|
|
||||||
case OptGeneric:
|
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sF=s+LLs*sU;
|
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =-naive-naik;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
|
||||||
assert(0);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, FermionField &out)
|
int sU, const FermionField &in, FermionField &out,
|
||||||
|
int dag,int interior,int exterior)
|
||||||
{
|
{
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
SiteSpinor naik;
|
|
||||||
SiteSpinor naive;
|
|
||||||
int dag=0;
|
|
||||||
switch(Opt) {
|
switch(Opt) {
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
case OptInlineAsm:
|
case OptInlineAsm:
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else {
|
||||||
|
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OptHandUnroll:
|
case OptHandUnroll:
|
||||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( interior ) {
|
||||||
|
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( exterior ) {
|
||||||
|
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case OptGeneric:
|
case OptGeneric:
|
||||||
for(int s=0;s<LLs;s++){
|
if ( interior && exterior ) {
|
||||||
int sF=LLs*sU+s;
|
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
// assert(sF<in._odata.size());
|
} else if ( interior ) {
|
||||||
// assert(sU< U._odata.size());
|
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
// assert(sF>=0); assert(sU>=0);
|
} else if ( exterior ) {
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =naive+naik;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -38,8 +38,9 @@ namespace QCD {
|
|||||||
class StaggeredKernelsStatic {
|
class StaggeredKernelsStatic {
|
||||||
public:
|
public:
|
||||||
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
enum { CommsAndCompute, CommsThenCompute };
|
||||||
static int Opt; // these are a temporary hack
|
static int Opt;
|
||||||
|
static int Comms;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
|
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
|
||||||
@ -53,24 +54,62 @@ public:
|
|||||||
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
|
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
|
||||||
|
|
||||||
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
|
// Generic Nc kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Nc=3 specific kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
|
// Asm Nc=3 specific kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Generic interface; fan out to right routine
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
|
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out, int dag);
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
int sF, int sU, const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
|
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -560,16 +560,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSTORE(2,%0,pUChi_02) \
|
VSTORE(2,%0,pUChi_02) \
|
||||||
: : "r" (out) : "memory" );
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
|
#define nREDUCE(out) \
|
||||||
|
asm ( \
|
||||||
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
|
VADD(UChi_02,UChi_12,UChi_02) \
|
||||||
|
VADD(UChi_30,UChi_20,UChi_30) \
|
||||||
|
VADD(UChi_31,UChi_21,UChi_31) \
|
||||||
|
VADD(UChi_32,UChi_22,UChi_32) \
|
||||||
|
VADD(UChi_00,UChi_30,UChi_00) \
|
||||||
|
VADD(UChi_01,UChi_31,UChi_01) \
|
||||||
|
VADD(UChi_02,UChi_32,UChi_02) ); \
|
||||||
|
asm (VZERO(Chi_00) \
|
||||||
|
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||||
|
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||||
|
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
#define REDUCEa(out) \
|
#define REDUCEa(out) \
|
||||||
asm ( \
|
asm ( \
|
||||||
VADD(UChi_00,UChi_10,UChi_00) \
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
VADD(UChi_01,UChi_11,UChi_01) \
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
VADD(UChi_02,UChi_12,UChi_02) ); \
|
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
|
// FIXME is sign right in the VSUB ?
|
||||||
|
#define nREDUCEa(out) \
|
||||||
asm ( \
|
asm ( \
|
||||||
VSTORE(0,%0,pUChi_00) \
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
VSTORE(1,%0,pUChi_01) \
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
VSTORE(2,%0,pUChi_02) \
|
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||||
: : "r" (out) : "memory" );
|
asm (VZERO(Chi_00) \
|
||||||
|
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||||
|
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||||
|
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_0,Chi_0);\
|
permute##dir(Chi_0,Chi_0);\
|
||||||
@ -581,10 +618,9 @@ namespace QCD {
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
};
|
};
|
||||||
@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -685,7 +720,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCE(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCE(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -734,7 +772,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCE(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCE(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
|
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -832,7 +873,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
MULT_ADD_XYZT(gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCEa(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCEa(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCEa(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -897,7 +941,11 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
MULT_ADD_XYZT(gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCEa(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCEa(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCEa(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
DoubledGaugeField &U, \
|
DoubledGaugeField &U, \
|
||||||
DoubledGaugeField &UUU, \
|
DoubledGaugeField &UUU, \
|
||||||
SiteSpinor *buf, int LLs, \
|
SiteSpinor *buf, int LLs, \
|
||||||
int sU, const FermionField &in, FermionField &out);
|
int sU, const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
||||||
|
@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
|
|
||||||
#define REGISTER
|
|
||||||
|
|
||||||
#define LOAD_CHI(b) \
|
#define LOAD_CHI(b) \
|
||||||
const SiteSpinor & ref (b[offset]); \
|
const SiteSpinor & ref (b[offset]); \
|
||||||
@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi ## _1 += U_12*Chi_2;\
|
UChi ## _1 += U_12*Chi_2;\
|
||||||
UChi ## _2 += U_22*Chi_2;
|
UChi ## _2 += U_22*Chi_2;
|
||||||
|
|
||||||
#define MULT_ADD(A,UChi) \
|
#define MULT_ADD(U,A,UChi) \
|
||||||
auto & ref(U._odata[sU](A)); \
|
auto & ref(U._odata[sU](A)); \
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
@ -82,241 +81,319 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_0,Chi_0);\
|
permute##dir(Chi_0,Chi_0); \
|
||||||
permute##dir(Chi_1,Chi_1);\
|
permute##dir(Chi_1,Chi_1); \
|
||||||
permute##dir(Chi_2,Chi_2);
|
permute##dir(Chi_2,Chi_2);
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHI(in._odata); \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(Perm); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
LOAD_CHI(buf); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
||||||
|
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
{ \
|
||||||
|
MULT(Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
|
||||||
|
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
{ \
|
||||||
|
MULT_ADD(U,Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHI(in._odata); \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(Perm); \
|
||||||
|
} \
|
||||||
|
} else if ( st.same_node[Dir] ) { \
|
||||||
|
LOAD_CHI(buf); \
|
||||||
|
} \
|
||||||
|
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||||
|
MULT_ADD(U,Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||||
|
nmu++; \
|
||||||
|
{ LOAD_CHI(buf); } \
|
||||||
|
{ MULT_ADD(U,Dir,even); } \
|
||||||
|
}
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
SiteSpinor *buf, int LLs,
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
int sU, const FermionField &in, FermionField &out, int dag)
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
{
|
const FermionField &in, FermionField &out,int dag)
|
||||||
SiteSpinor naik;
|
|
||||||
SiteSpinor naive;
|
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
int skew(0);
|
|
||||||
Real scale(1.0);
|
|
||||||
|
|
||||||
if(dag) scale = -1.0;
|
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sF=s+LLs*sU;
|
|
||||||
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
|
||||||
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =scale*(naive+naik);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
|
||||||
SiteSpinor *buf, int sF,
|
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink)
|
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
REGISTER Simd even_0; // 12 regs on knc
|
Simd even_0; // 12 regs on knc
|
||||||
REGISTER Simd even_1;
|
Simd even_1;
|
||||||
REGISTER Simd even_2;
|
Simd even_2;
|
||||||
REGISTER Simd odd_0; // 12 regs on knc
|
Simd odd_0; // 12 regs on knc
|
||||||
REGISTER Simd odd_1;
|
Simd odd_1;
|
||||||
REGISTER Simd odd_2;
|
Simd odd_2;
|
||||||
|
|
||||||
REGISTER Simd Chi_0; // two spinor; 6 regs
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
REGISTER Simd Chi_1;
|
Simd Chi_1;
|
||||||
REGISTER Simd Chi_2;
|
Simd Chi_2;
|
||||||
|
|
||||||
REGISTER Simd U_00; // two rows of U matrix
|
Simd U_00; // two rows of U matrix
|
||||||
REGISTER Simd U_10;
|
Simd U_10;
|
||||||
REGISTER Simd U_20;
|
Simd U_20;
|
||||||
REGISTER Simd U_01;
|
Simd U_01;
|
||||||
REGISTER Simd U_11;
|
Simd U_11;
|
||||||
REGISTER Simd U_21; // 2 reg left.
|
Simd U_21; // 2 reg left.
|
||||||
REGISTER Simd U_02;
|
Simd U_02;
|
||||||
REGISTER Simd U_12;
|
Simd U_12;
|
||||||
REGISTER Simd U_22;
|
Simd U_22;
|
||||||
|
|
||||||
int skew = 0;
|
|
||||||
if (threeLink) skew=8;
|
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
// Xp
|
for(int s=0;s<LLs;s++){
|
||||||
SE=st.GetEntry(ptype,Xp+skew,sF);
|
int sF=s+LLs*sU;
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
skew = 0;
|
||||||
LOAD_CHI(in._odata);
|
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||||
if ( perm) {
|
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
HAND_STENCIL_LEG (U,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG (U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||||
|
skew = 8;
|
||||||
|
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
}
|
}
|
||||||
} else {
|
vstream(out._odata[sF],result);
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
}
|
||||||
{
|
|
||||||
MULT(Xp,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Yp
|
|
||||||
SE=st.GetEntry(ptype,Yp+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT(Yp,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
SE=st.GetEntry(ptype,Zp+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Zp,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
SE=st.GetEntry(ptype,Tp+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Tp,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
SE=st.GetEntry(ptype,Xm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Xm,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
SE=st.GetEntry(ptype,Ym+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Ym,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
SE=st.GetEntry(ptype,Zm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Zm,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
SE=st.GetEntry(ptype,Tm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Tm,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
vstream(out()()(0),even_0+odd_0);
|
|
||||||
vstream(out()()(1),even_1+odd_1);
|
|
||||||
vstream(out()()(2),even_2+odd_2);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
Simd even_0; // 12 regs on knc
|
||||||
|
Simd even_1;
|
||||||
|
Simd even_2;
|
||||||
|
Simd odd_0; // 12 regs on knc
|
||||||
|
Simd odd_1;
|
||||||
|
Simd odd_2;
|
||||||
|
|
||||||
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
|
Simd Chi_1;
|
||||||
|
Simd Chi_2;
|
||||||
|
|
||||||
|
Simd U_00; // two rows of U matrix
|
||||||
|
Simd U_10;
|
||||||
|
Simd U_20;
|
||||||
|
Simd U_01;
|
||||||
|
Simd U_11;
|
||||||
|
Simd U_21; // 2 reg left.
|
||||||
|
Simd U_02;
|
||||||
|
Simd U_12;
|
||||||
|
Simd U_22;
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
|
||||||
|
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||||
|
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||||
|
|
||||||
|
skew = 0;
|
||||||
|
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||||
|
skew = 8;
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF],result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
Simd even_0; // 12 regs on knc
|
||||||
|
Simd even_1;
|
||||||
|
Simd even_2;
|
||||||
|
Simd odd_0; // 12 regs on knc
|
||||||
|
Simd odd_1;
|
||||||
|
Simd odd_2;
|
||||||
|
|
||||||
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
|
Simd Chi_1;
|
||||||
|
Simd Chi_2;
|
||||||
|
|
||||||
|
Simd U_00; // two rows of U matrix
|
||||||
|
Simd U_10;
|
||||||
|
Simd U_20;
|
||||||
|
Simd U_01;
|
||||||
|
Simd U_11;
|
||||||
|
Simd U_21; // 2 reg left.
|
||||||
|
Simd U_02;
|
||||||
|
Simd U_12;
|
||||||
|
Simd U_22;
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
|
||||||
|
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||||
|
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||||
|
int nmu=0;
|
||||||
|
skew = 0;
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||||
|
skew = 8;
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
// Add sum of all exterior connected stencil legs
|
||||||
|
if ( nmu ) {
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
|
}
|
||||||
|
out._odata[sF] = out._odata[sF] + result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
SiteSpinor *buf, int LLs, \
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
int sU, const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
|
||||||
#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
|
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
|
|
||||||
SiteSpinor *buf, int sF, \
|
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
|
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
||||||
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
|
||||||
|
|
||||||
}}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -274,41 +274,16 @@ public:
|
|||||||
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> same_node;
|
|
||||||
std::vector<int> surface_list;
|
|
||||||
|
|
||||||
WilsonStencil(GridBase *grid,
|
WilsonStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
|
||||||
same_node(npoints)
|
|
||||||
{
|
{
|
||||||
ZeroCountersi();
|
ZeroCountersi();
|
||||||
surface_list.resize(0);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void BuildSurfaceList(int Ls,int vol4){
|
|
||||||
|
|
||||||
// find same node for SHM
|
|
||||||
// Here we know the distance is 1 for WilsonStencil
|
|
||||||
for(int point=0;point<this->_npoints;point++){
|
|
||||||
same_node[point] = this->SameNode(point);
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int site = 0 ;site< vol4;site++){
|
|
||||||
int local = 1;
|
|
||||||
for(int point=0;point<this->_npoints;point++){
|
|
||||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
|
||||||
local = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(local == 0) {
|
|
||||||
surface_list.push_back(site);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template < class compressor>
|
template < class compressor>
|
||||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||||
@ -369,23 +344,23 @@ public:
|
|||||||
int dag = compress.dag;
|
int dag = compress.dag;
|
||||||
int face_idx=0;
|
int face_idx=0;
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||||
assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
||||||
assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
||||||
assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
||||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
||||||
assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
||||||
} else {
|
} else {
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
||||||
assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
||||||
assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
||||||
assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
||||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
||||||
assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
||||||
}
|
}
|
||||||
this->face_table_computed=1;
|
this->face_table_computed=1;
|
||||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||||
|
@ -348,15 +348,98 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
|
|||||||
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
|
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
/*Change starts*/
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag) {
|
FermionField &out, int dag) {
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
#ifdef GRID_OMP
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag) {
|
||||||
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
Compressor compressor;
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
const int LLs = 1;
|
||||||
|
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
st.CommsMergeSHM(compressor);
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = len;
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
// do the compute
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
|
||||||
|
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||||
|
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||||
|
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
} //else
|
||||||
|
|
||||||
|
} else {
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
}
|
||||||
|
|
||||||
|
Compressor compressor(dag);
|
||||||
|
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
|
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
|
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} //pragma
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag) {
|
||||||
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
|
|
||||||
@ -370,6 +453,7 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
/*Change ends */
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||||
|
@ -130,6 +130,12 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
|||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
|
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
@ -145,6 +151,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
|||||||
|
|
||||||
// protected:
|
// protected:
|
||||||
public:
|
public:
|
||||||
|
virtual RealD Mass(void) { return mass; }
|
||||||
|
virtual int isTrivialEE(void) { return 1; };
|
||||||
RealD mass;
|
RealD mass;
|
||||||
RealD diag_mass;
|
RealD diag_mass;
|
||||||
|
|
||||||
|
@ -445,8 +445,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
ptime = usecond() - start;
|
ptime = usecond() - start;
|
||||||
}
|
} else {
|
||||||
{
|
|
||||||
double start = usecond();
|
double start = usecond();
|
||||||
st.CommunicateThreaded();
|
st.CommunicateThreaded();
|
||||||
ctime = usecond() - start;
|
ctime = usecond() - start;
|
||||||
|
@ -70,27 +70,27 @@ public:
|
|||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OptHandUnroll:
|
case OptHandUnroll:
|
||||||
for (int site = 0; site < Ns; site++) {
|
for (int site = 0; site < Ns; site++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case OptGeneric:
|
case OptGeneric:
|
||||||
for (int site = 0; site < Ns; site++) {
|
for (int site = 0; site < Ns; site++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
||||||
else assert(0);
|
else assert(0);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -232,6 +232,7 @@ private:
|
|||||||
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, FermionField &out);
|
int sF, int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
|
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
|
@ -150,6 +150,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
std::vector<int> _distances;
|
std::vector<int> _distances;
|
||||||
std::vector<int> _comm_buf_size;
|
std::vector<int> _comm_buf_size;
|
||||||
std::vector<int> _permute_type;
|
std::vector<int> _permute_type;
|
||||||
|
std::vector<int> same_node;
|
||||||
|
std::vector<int> surface_list;
|
||||||
|
|
||||||
Vector<StencilEntry> _entries;
|
Vector<StencilEntry> _entries;
|
||||||
std::vector<Packet> Packets;
|
std::vector<Packet> Packets;
|
||||||
@ -201,7 +203,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
|
|
||||||
int dimension = _directions[point];
|
int dimension = _directions[point];
|
||||||
int displacement = _distances[point];
|
int displacement = _distances[point];
|
||||||
assert( (displacement==1) || (displacement==-1));
|
|
||||||
|
|
||||||
int pd = _grid->_processors[dimension];
|
int pd = _grid->_processors[dimension];
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
@ -216,9 +218,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
if ( ! comm_dim ) return 1;
|
if ( ! comm_dim ) return 1;
|
||||||
|
|
||||||
int nbr_proc;
|
int nbr_proc;
|
||||||
if (displacement==1) nbr_proc = 1;
|
if (displacement>0) nbr_proc = 1;
|
||||||
else nbr_proc = pd-1;
|
else nbr_proc = pd-1;
|
||||||
|
|
||||||
|
// FIXME this logic needs to be sorted for three link term
|
||||||
|
// assert( (displacement==1) || (displacement==-1));
|
||||||
|
// Present hack only works for >= 4^4 subvol per node
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
||||||
@ -539,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Move interior/exterior split into the generic stencil
|
||||||
|
// FIXME Explicit Ls in interface is a pain. Should just use a vol
|
||||||
|
void BuildSurfaceList(int Ls,int vol4){
|
||||||
|
|
||||||
|
// find same node for SHM
|
||||||
|
// Here we know the distance is 1 for WilsonStencil
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
same_node[point] = this->SameNode(point);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int site = 0 ;site< vol4;site++){
|
||||||
|
int local = 1;
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
||||||
|
local = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(local == 0) {
|
||||||
|
surface_list.push_back(site);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
CartesianStencil(GridBase *grid,
|
CartesianStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
int checkerboard,
|
int checkerboard,
|
||||||
@ -549,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
comm_bytes_thr(npoints),
|
comm_bytes_thr(npoints),
|
||||||
comm_enter_thr(npoints),
|
comm_enter_thr(npoints),
|
||||||
comm_leave_thr(npoints),
|
comm_leave_thr(npoints),
|
||||||
comm_time_thr(npoints)
|
comm_time_thr(npoints),
|
||||||
|
same_node(npoints)
|
||||||
{
|
{
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_npoints = npoints;
|
_npoints = npoints;
|
||||||
@ -557,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
_directions = directions;
|
_directions = directions;
|
||||||
_distances = distances;
|
_distances = distances;
|
||||||
_unified_buffer_size=0;
|
_unified_buffer_size=0;
|
||||||
|
surface_list.resize(0);
|
||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
|
@ -368,8 +368,10 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
||||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
||||||
|
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;
|
||||||
} else {
|
} else {
|
||||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||||
|
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute;
|
||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
||||||
|
@ -141,6 +141,7 @@ int main (int argc, char ** argv)
|
|||||||
t1=usecond();
|
t1=usecond();
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
@ -160,7 +161,8 @@ int main (int argc, char ** argv)
|
|||||||
localConvert(sresult,tmp);
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
@ -181,6 +183,7 @@ int main (int argc, char ** argv)
|
|||||||
localConvert(sresult,tmp);
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
||||||
|
|
||||||
|
196
tests/core/Test_staggered5DvecF.cc
Normal file
196
tests/core/Test_staggered5DvecF.cc
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./benchmarks/Benchmark_wilson.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
std::vector<int> latt_size = GridDefaultLatt();
|
||||||
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
|
const int Ls=16;
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||||
|
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||||
|
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
|
std::vector<int> seeds({1,2,3,4});
|
||||||
|
|
||||||
|
GridParallelRNG pRNG4(UGrid);
|
||||||
|
GridParallelRNG pRNG5(FGrid);
|
||||||
|
pRNG4.SeedFixedIntegers(seeds);
|
||||||
|
pRNG5.SeedFixedIntegers(seeds);
|
||||||
|
|
||||||
|
typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField;
|
||||||
|
typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField;
|
||||||
|
typename ImprovedStaggeredFermion5DF::ImplParams params;
|
||||||
|
|
||||||
|
FermionField src (FGrid);
|
||||||
|
random(pRNG5,src);
|
||||||
|
/*
|
||||||
|
std::vector<int> site({0,1,2,0,0});
|
||||||
|
ColourVector cv = zero;
|
||||||
|
cv()()(0)=1.0;
|
||||||
|
src = zero;
|
||||||
|
pokeSite(cv,src,site);
|
||||||
|
*/
|
||||||
|
FermionField result(FGrid); result=zero;
|
||||||
|
FermionField tmp(FGrid); tmp=zero;
|
||||||
|
FermionField err(FGrid); tmp=zero;
|
||||||
|
FermionField phi (FGrid); random(pRNG5,phi);
|
||||||
|
FermionField chi (FGrid); random(pRNG5,chi);
|
||||||
|
|
||||||
|
LatticeGaugeFieldF Umu(UGrid);
|
||||||
|
SU3::HotConfiguration(pRNG4,Umu);
|
||||||
|
|
||||||
|
/*
|
||||||
|
for(int mu=1;mu<4;mu++){
|
||||||
|
auto tmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||||
|
tmp = zero;
|
||||||
|
PokeIndex<LorentzIndex>(Umu,tmp,mu);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
double volume=Ls;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
ImprovedStaggeredFermion5DF Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params);
|
||||||
|
ImprovedStaggeredFermionVec5dF sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||||
|
|
||||||
|
int ncall=1000;
|
||||||
|
int ncall1=1000;
|
||||||
|
double t0(0),t1(0);
|
||||||
|
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1;i++){
|
||||||
|
Ds.Dhop(src,result,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
|
||||||
|
|
||||||
|
#ifdef AVX512
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
||||||
|
#else
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1;i++){
|
||||||
|
Ds.Dhop(src,tmp,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
err = tmp-result;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
FermionField ssrc (sFGrid); localConvert(src,ssrc);
|
||||||
|
FermionField sresult(sFGrid); sresult=zero;
|
||||||
|
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1;i++){
|
||||||
|
sDs.Dhop(ssrc,sresult,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef AVX512
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
||||||
|
#else
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
err = tmp-result;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
int extra=1;
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1*extra;i++){
|
||||||
|
sDs.Dhop(ssrc,sresult,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
||||||
|
|
||||||
|
err = tmp-result;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
@ -74,8 +74,16 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
||||||
|
|
||||||
|
double volume=1;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
RealD mass=0.003;
|
RealD mass=0.003;
|
||||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
||||||
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
@ -87,14 +95,26 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
|
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
||||||
FermionField src4d(UGrid); random(pRNG,src4d);
|
FermionField src4d(UGrid); random(pRNG,src4d);
|
||||||
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
|
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
|
||||||
FermionField result4d_o(UrbGrid);
|
FermionField result4d_o(UrbGrid);
|
||||||
|
|
||||||
|
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
|
||||||
result4d_o=zero;
|
result4d_o=zero;
|
||||||
CG(HermOp4d,src4d_o,result4d_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
CG(HermOp4d,src4d_o,result4d_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp4d.Report();
|
||||||
|
}
|
||||||
|
Ds4d.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
@ -103,7 +123,17 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
result_o=zero;
|
result_o=zero;
|
||||||
CG(HermOp,src_o,result_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
CG(HermOp,src_o,result_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete*Ls;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp.Report();
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
@ -112,7 +142,18 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
result_o=zero;
|
result_o=zero;
|
||||||
mCG(HermOp,src_o,result_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
mCG(HermOp,src_o,result_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=mCG.IterationsToComplete*Ls;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp.Report();
|
||||||
|
}
|
||||||
|
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
@ -121,7 +162,17 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
result_o=zero;
|
result_o=zero;
|
||||||
BCGrQ(HermOp,src_o,result_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
BCGrQ(HermOp,src_o,result_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=BCGrQ.IterationsToComplete*Ls;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp.Report();
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
@ -74,7 +74,16 @@ int main (int argc, char ** argv)
|
|||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
||||||
|
|
||||||
RealD mass=0.003;
|
RealD mass=0.003;
|
||||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
double volume=1;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
|
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
||||||
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
@ -86,11 +95,23 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
|
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
||||||
FermionField src4d(UGrid); random(pRNG,src4d);
|
FermionField src4d(UGrid); random(pRNG,src4d);
|
||||||
FermionField result4d(UGrid); result4d=zero;
|
FermionField result4d(UGrid); result4d=zero;
|
||||||
CG(HermOp4d,src4d,result4d);
|
|
||||||
|
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
|
||||||
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
CG(HermOp4d,src4d,result4d);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
@ -98,9 +119,18 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
|
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
result=zero;
|
result=zero;
|
||||||
|
{
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
|
double t1=usecond();
|
||||||
CG(HermOp,src,result);
|
CG(HermOp,src,result);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
|
}
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
@ -108,7 +138,16 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
result=zero;
|
result=zero;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
|
{
|
||||||
|
double t1=usecond();
|
||||||
mCG(HermOp,src,result);
|
mCG(HermOp,src,result);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
@ -117,7 +156,16 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
result=zero;
|
result=zero;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
|
{
|
||||||
|
double t1=usecond();
|
||||||
BCGrQ(HermOp,src,result);
|
BCGrQ(HermOp,src,result);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
@ -71,7 +71,10 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
RealD mass=0.003;
|
RealD mass=0.003;
|
||||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
|
||||||
FermionField res_o(&RBGrid);
|
FermionField res_o(&RBGrid);
|
||||||
FermionField src_o(&RBGrid);
|
FermionField src_o(&RBGrid);
|
||||||
@ -80,7 +83,19 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
|
double t1=usecond();
|
||||||
CG(HermOpEO,src_o,res_o);
|
CG(HermOpEO,src_o,res_o);
|
||||||
|
double t2=usecond();
|
||||||
|
|
||||||
|
// Schur solver: uses DeoDoe => volume * 1146
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
FermionField tmp(&RBGrid);
|
FermionField tmp(&RBGrid);
|
||||||
|
|
||||||
|
@ -65,7 +65,10 @@ int main (int argc, char ** argv)
|
|||||||
FermionField resid(&Grid);
|
FermionField resid(&Grid);
|
||||||
|
|
||||||
RealD mass=0.1;
|
RealD mass=0.1;
|
||||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
|
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
|
||||||
|
@ -73,7 +73,10 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
RealD mass=0.1;
|
RealD mass=0.1;
|
||||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
|
||||||
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
|
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
|
||||||
ConjugateGradient<FermionField> CG(1.0e-6,10000);
|
ConjugateGradient<FermionField> CG(1.0e-6,10000);
|
||||||
|
121
tests/solver/Test_staggered_multishift.cc
Normal file
121
tests/solver/Test_staggered_multishift.cc
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./tests/Test_wilson_cg_unprec.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
template<class d>
|
||||||
|
struct scal {
|
||||||
|
d internal;
|
||||||
|
};
|
||||||
|
|
||||||
|
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT
|
||||||
|
};
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
|
||||||
|
typename ImprovedStaggeredFermionR::ImplParams params;
|
||||||
|
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
std::vector<int> latt_size = GridDefaultLatt();
|
||||||
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
GridRedBlackCartesian RBGrid(&Grid);
|
||||||
|
|
||||||
|
std::vector<int> seeds({1,2,3,4});
|
||||||
|
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
|
||||||
|
|
||||||
|
|
||||||
|
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
|
||||||
|
|
||||||
|
double volume=1;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// sqrt
|
||||||
|
////////////////////////////////////////
|
||||||
|
double lo=0.001;
|
||||||
|
double hi=1.0;
|
||||||
|
int precision=64;
|
||||||
|
int degree=10;
|
||||||
|
AlgRemez remez(lo,hi,precision);
|
||||||
|
remez.generateApprox(degree,1,2);
|
||||||
|
MultiShiftFunction Sqrt(remez,1.0e-6,false);
|
||||||
|
std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////
|
||||||
|
// Setup staggered
|
||||||
|
////////////////////////////////////////////
|
||||||
|
RealD mass=0.003;
|
||||||
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
||||||
|
|
||||||
|
FermionField src(&Grid); random(pRNG,src);
|
||||||
|
FermionField src_o(&RBGrid);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////
|
||||||
|
//Multishift CG
|
||||||
|
/////////////////////////////////
|
||||||
|
std::vector<FermionField> result(degree,&RBGrid);
|
||||||
|
ConjugateGradientMultiShift<FermionField> MSCG(10000,Sqrt);
|
||||||
|
|
||||||
|
double deodoe_flops=(1205+15*degree)*volume; // == 66*16 + == 1146
|
||||||
|
|
||||||
|
double t1=usecond();
|
||||||
|
MSCG(HermOpEO,src_o,result);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=MSCG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
// HermOpEO.Report();
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user