1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge branch 'develop' into sycl

This commit is contained in:
Peter Boyle 2020-06-09 04:00:12 -04:00
commit cdf0a04fc5
85 changed files with 2632 additions and 1334 deletions

View File

@ -252,19 +252,14 @@ public:
/////////////////////// ///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
RealD M (const CoarseVector &in, CoarseVector &out) void M (const CoarseVector &in, CoarseVector &out)
{ {
conformable(_grid,in.Grid()); conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid()); conformable(in.Grid(),out.Grid());
// RealD Nin = norm2(in);
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
double comms_usec = -usecond();
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
comms_usec += usecond();
autoView( in_v , in, AcceleratorRead); autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite); autoView( out_v , out, AcceleratorWrite);
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
@ -278,12 +273,7 @@ public:
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex; typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
GridStopWatch ArithmeticTimer;
int osites=Grid()->oSites(); int osites=Grid()->oSites();
// double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
// double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
double usecs =-usecond();
// assert(geom.npoint==9);
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis; int ss = sss/nbasis;
@ -310,18 +300,11 @@ public:
} }
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
usecs +=usecond();
double nrm_usec=-usecond();
RealD Nout= norm2(out);
nrm_usec+=usecond();
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
return Nout;
}; };
RealD Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
{ {
if(hermitian) { if(hermitian) {
// corresponds to Petrov-Galerkin coarsening // corresponds to Petrov-Galerkin coarsening
@ -332,7 +315,6 @@ public:
G5C(tmp, in); G5C(tmp, in);
M(tmp, out); M(tmp, out);
G5C(out, out); G5C(out, out);
return norm2(out);
} }
}; };
void MdirComms(const CoarseVector &in) void MdirComms(const CoarseVector &in)
@ -553,8 +535,6 @@ public:
autoView( A_self , A[self_stencil], AcceleratorWrite); autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
// if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
// accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
} }
} }

View File

@ -43,7 +43,6 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase { template<class Field> class LinearOperatorBase {
public: public:
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
@ -94,7 +93,10 @@ public:
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); _Mat.MdagM(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
_Mat.MdagM(in,out); _Mat.MdagM(in,out);
@ -131,17 +133,14 @@ public:
assert(0); assert(0);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); HermOp(in,out);
out = out + _shift*in; ComplexD dot = innerProduct(in,out);
ComplexD dot;
dot= innerProduct(in,out);
n1=real(dot); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
RealD n1,n2; _Mat.MdagM(in,out);
HermOpAndNorm(in,out,n1,n2); out = out + _shift*in;
} }
}; };
@ -170,7 +169,7 @@ public:
_Mat.M(in,out); _Mat.M(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out); HermOp(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot); ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
@ -216,21 +215,24 @@ public:
template<class Field> template<class Field>
class SchurOperatorBase : public LinearOperatorBase<Field> { class SchurOperatorBase : public LinearOperatorBase<Field> {
public: public:
virtual RealD Mpc (const Field &in, Field &out) =0; virtual void Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0; virtual void MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard(); tmp.Checkerboard() = in.Checkerboard();
ni=Mpc(in,tmp); Mpc(in,tmp);
no=MpcDag(tmp,out); MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out,n1,n2); MpcDagMpc(in,out);
ComplexD dot= innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
RealD n1,n2; out.Checkerboard() = in.Checkerboard();
HermOpAndNorm(in,out,n1,n2); MpcDagMpc(in,out);
} }
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
Mpc(in,out); Mpc(in,out);
@ -254,28 +256,24 @@ public:
public: public:
Matrix &_Mat; Matrix &_Mat;
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
tmp.Checkerboard() = !in.Checkerboard(); tmp.Checkerboard() = !in.Checkerboard();
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp); _Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
_Mat.Mooee(in,out); _Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out); axpy(out,-1.0,tmp,out);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MeooeDag(in,tmp); _Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out); _Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
_Mat.MooeeDag(in,out); _Mat.MooeeDag(in,out);
return axpy_norm(out,-1.0,tmp,out); axpy(out,-1.0,tmp,out);
} }
}; };
template<class Matrix,class Field> template<class Matrix,class Field>
@ -285,25 +283,23 @@ public:
public: public:
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp); _Mat.MooeeInv(out,tmp);
_Mat.Meooe(tmp,out); _Mat.Meooe(tmp,out);
_Mat.MooeeInv(out,tmp); _Mat.MooeeInv(out,tmp);
axpy(out,-1.0,tmp,in);
return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MooeeInvDag(in,out); _Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
_Mat.MooeeInvDag(tmp,out); _Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
axpy(out,-1.0,tmp,in);
return axpy_norm(out,-1.0,tmp,in);
} }
}; };
template<class Matrix,class Field> template<class Matrix,class Field>
@ -313,7 +309,7 @@ public:
public: public:
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MooeeInv(in,out); _Mat.MooeeInv(in,out);
@ -321,9 +317,9 @@ public:
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
return axpy_norm(out,-1.0,tmp,in); axpy(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MeooeDag(in,out); _Mat.MeooeDag(in,out);
@ -331,7 +327,7 @@ public:
_Mat.MeooeDag(tmp,out); _Mat.MeooeDag(tmp,out);
_Mat.MooeeInvDag(out,tmp); _Mat.MooeeInvDag(out,tmp);
return axpy_norm(out,-1.0,tmp,in); axpy(out,-1.0,tmp,in);
} }
}; };
@ -339,13 +335,13 @@ public:
class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field> class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
{ {
public: public:
virtual RealD Mpc (const Field& in, Field& out) = 0; virtual void Mpc (const Field& in, Field& out) = 0;
virtual RealD MpcDag (const Field& in, Field& out) = 0; virtual void MpcDag (const Field& in, Field& out) = 0;
virtual void MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) { virtual void MpcDagMpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard(); tmp.Checkerboard() = in.Checkerboard();
ni = Mpc(in,tmp); Mpc(in,tmp);
no = MpcDag(tmp,out); MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) { virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
assert(0); assert(0);
@ -366,6 +362,9 @@ public:
void OpDir(const Field& in, Field& out, int dir, int disp) { void OpDir(const Field& in, Field& out, int dir, int disp) {
assert(0); assert(0);
} }
void OpDirAll(const Field& in, std::vector<Field>& out){
assert(0);
};
}; };
template<class Matrix, class Field> template<class Matrix, class Field>
@ -374,7 +373,7 @@ public:
public: public:
Matrix& _Mat; Matrix& _Mat;
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
virtual RealD Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
tmp.Checkerboard() = !in.Checkerboard(); tmp.Checkerboard() = !in.Checkerboard();
@ -384,9 +383,9 @@ public:
_Mat.Mooee(in, out); _Mat.Mooee(in, out);
return axpy_norm(out, -1.0, tmp, out); axpy(out, -1.0, tmp, out);
} }
virtual RealD MpcDag(const Field& in, Field& out) { virtual void MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MeooeDag(in, tmp); _Mat.MeooeDag(in, tmp);
@ -395,7 +394,7 @@ public:
_Mat.MooeeDag(in, out); _Mat.MooeeDag(in, out);
return axpy_norm(out, -1.0, tmp, out); axpy(out, -1.0, tmp, out);
} }
}; };
@ -407,7 +406,7 @@ public:
public: public:
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
virtual RealD Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.Meooe(in, out); _Mat.Meooe(in, out);
@ -415,9 +414,9 @@ public:
_Mat.Meooe(tmp, out); _Mat.Meooe(tmp, out);
_Mat.MooeeInv(out, tmp); _Mat.MooeeInv(out, tmp);
return axpy_norm(out, -1.0, tmp, in); axpy(out, -1.0, tmp, in);
} }
virtual RealD MpcDag(const Field& in, Field& out) { virtual void MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MooeeInvDag(in, out); _Mat.MooeeInvDag(in, out);
@ -425,7 +424,7 @@ public:
_Mat.MooeeInvDag(tmp, out); _Mat.MooeeInvDag(tmp, out);
_Mat.MeooeDag(out, tmp); _Mat.MeooeDag(out, tmp);
return axpy_norm(out, -1.0, tmp, in); axpy(out, -1.0, tmp, in);
} }
}; };
@ -438,7 +437,7 @@ public:
public: public:
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
virtual RealD Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MooeeInv(in, out); _Mat.MooeeInv(in, out);
@ -446,9 +445,9 @@ public:
_Mat.MooeeInv(tmp, out); _Mat.MooeeInv(tmp, out);
_Mat.Meooe(out, tmp); _Mat.Meooe(out, tmp);
return axpy_norm(out, -1.0, tmp, in); axpy(out, -1.0, tmp, in);
} }
virtual RealD MpcDag(const Field& in, Field& out) { virtual void MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MeooeDag(in, out); _Mat.MeooeDag(in, out);
@ -456,7 +455,7 @@ public:
_Mat.MeooeDag(tmp, out); _Mat.MeooeDag(tmp, out);
_Mat.MooeeInvDag(out, tmp); _Mat.MooeeInvDag(out, tmp);
return axpy_norm(out, -1.0, tmp, in); axpy(out, -1.0, tmp, in);
} }
}; };
@ -475,71 +474,38 @@ public:
Matrix &_Mat; Matrix &_Mat;
Field tmp; Field tmp;
RealD mass; RealD mass;
double tMpc;
double tIP;
double tMeo;
double taxpby_norm;
uint64_t ncall;
public: public:
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
}
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{ {
assert( _Mat.isTrivialEE() ); assert( _Mat.isTrivialEE() );
mass = _Mat.Mass(); mass = _Mat.Mass();
tMpc=0;
tIP =0;
tMeo=0;
taxpby_norm=0;
ncall=0;
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ncall++; Mpc(in,out);
tMpc-=usecond();
n2 = Mpc(in,out);
tMpc+=usecond();
tIP-=usecond();
ComplexD dot= innerProduct(in,out); ComplexD dot= innerProduct(in,out);
tIP+=usecond();
n1 = real(dot); n1 = real(dot);
n2 =0.0;
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
ncall++; Mpc(in,out);
tMpc-=usecond(); // _Mat.Meooe(in,out);
_Mat.Meooe(in,out); // _Mat.Meooe(out,tmp);
_Mat.Meooe(out,tmp); // axpby(out,-1.0,mass*mass,tmp,in);
tMpc+=usecond();
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
} }
virtual RealD Mpc (const Field &in, Field &out) virtual void Mpc (const Field &in, Field &out)
{ {
Field tmp(in.Grid()); Field tmp(in.Grid());
Field tmp2(in.Grid()); Field tmp2(in.Grid());
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl; // _Mat.Mooee(in,out);
_Mat.Mooee(in,out); // _Mat.Mooee(out,tmp);
_Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond();
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
tMeo+=usecond(); axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
return Mpc(in,out); Mpc(in,out);
} }
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered assert(0);// Never need with staggered
@ -547,7 +513,6 @@ public:
}; };
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>; template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for functions of operators // Base classes for functions of operators
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////

View File

@ -38,16 +38,12 @@ template<class Field> class SparseMatrixBase {
public: public:
virtual GridBase *Grid(void) =0; virtual GridBase *Grid(void) =0;
// Full checkerboar operations // Full checkerboar operations
virtual RealD M (const Field &in, Field &out)=0; virtual void M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0; virtual void Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in.Grid());
ni=M(in,tmp);
no=Mdag(tmp,out);
}
virtual void MdagM(const Field &in, Field &out) { virtual void MdagM(const Field &in, Field &out) {
RealD ni, no; Field tmp (in.Grid());
MdagM(in,out,ni,no); M(in,tmp);
Mdag(tmp,out);
} }
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;

View File

@ -234,10 +234,8 @@ public:
GridBase *grid=in.Grid(); GridBase *grid=in.Grid();
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites(); int vol=grid->gSites();
typedef typename Field::vector_type vector_type;
Field T0(grid); T0 = in; Field T0(grid); T0 = in;
Field T1(grid); Field T1(grid);
@ -260,12 +258,26 @@ public:
for(int n=2;n<order;n++){ for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y); Linop.HermOp(*Tn,y);
// y=xscale*y+mscale*(*Tn); #if 0
// *Tnp=2.0*y-(*Tnm); auto y_v = y.View();
// out=out+Coeffs[n]* (*Tnp); auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#else
axpby(y,xscale,mscale,y,(*Tn)); axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm)); axpby(*Tnp,2.0,-1.0,y,(*Tnm));
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out); axpy(out,Coeffs[n],*Tnp,out);
}
#endif
// Cycle pointers to avoid copies // Cycle pointers to avoid copies
Field *swizzle = Tnm; Field *swizzle = Tnm;
Tnm =Tn; Tnm =Tn;

View File

@ -37,218 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
// If assume basis[j] are already orthonormal,
// can take all inner products in parallel saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
GridBase* grid = basis[0].Grid();
typedef typename Field::vector_object vobj;
typedef decltype(basis[0].View(CpuWrite)) View;
Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++) basis_v.push_back(basis[k].View(CpuWrite));
View *basis_vp = &basis_v[0];
#if 1
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region
{
vobj* B = Bt.data() + Nm * thread_num();
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
int nrot = j1-j0;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of Eigen matrix
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
for(int k=0;k<Nm;++k){
for(int j=0;j<Nm;++j){
Qt_p[j*Nm+k]=Qt(j,k);
}
}
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
auto z=coalescedRead(Bp[ss]);
z=Zero();
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
}
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
#endif
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
GridBase* grid = basis[0].Grid();
typedef typename Field::vector_object vobj;
typedef decltype(basis[0].View(AcceleratorWrite)) View;
result.Checkerboard() = basis[0].Checkerboard();
autoView(result_v,result, AcceleratorWrite);
Vector<View> basis_v; basis_v.reserve(basis.size());
View * basis_vp = &basis_v[0];
for(int k=0;k<basis.size();k++) basis_v.push_back(basis[k].View(AcceleratorRead));
Vector<double> Qt_jv(Nm); double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(basis_vp[k0][ss]);
B=Zero();
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted lanczos // Implicitly restarted lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////

View File

@ -6,72 +6,6 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr; MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false; bool MemoryProfiler::debug = false;
#ifdef GRID_CUDA
#define SMALL_LIMIT (0)
#else
#define SMALL_LIMIT (4096)
#endif
#ifdef ALLOCATION_CACHE
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < SMALL_LIMIT ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < SMALL_LIMIT ) return NULL;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
#endif
void check_huge_pages(void *Buf,uint64_t BYTES) void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
#ifdef __linux__ #ifdef __linux__

View File

@ -114,6 +114,7 @@ public:
void GlobalSumVector(RealD *,int N); void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &); void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &); void GlobalSum(uint64_t &);
void GlobalSumVector(uint64_t*,int N);
void GlobalSum(ComplexF &c); void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N); void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c); void GlobalSum(ComplexD &c);

View File

@ -255,6 +255,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){ void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0); assert(ierr==0);

View File

@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){} void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::GlobalSum(uint32_t &){} void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSum(uint64_t &){} void CartesianCommunicator::GlobalSum(uint64_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){} void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){}

View File

@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
if (heap_bytes >= heap_size) { if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl; std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl; std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl; //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;

View File

@ -49,4 +49,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif #endif
NAMESPACE_BEGIN(Grid);
template<typename Op, typename T1>
auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
return Cshift(closure(expr),dim,shift);
}
template <class Op, class T1, class T2>
auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
{
return Cshift(closure(expr),dim,shift);
}
template <class Op, class T1, class T2, class T3>
auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, expr.arg2),
eval(0, expr.arg3)))>
{
return Cshift(closure(expr),dim,shift);
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_local.h> #include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h> #include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h> #include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h> //#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h> #include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h> #include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h> #include <Grid/lattice/Lattice_coordinate.h>
@ -44,4 +44,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_rng.h> #include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h> #include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h> #include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>

View File

@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp> Author: neo <cossu@post.kek.jp>
Author: Christoph Lehner <christoph@lhnr.de
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -99,7 +100,7 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{ {
auto view = arg.View(); auto view = arg.View(AcceleratorRead);
return view[ss]; return view[ss];
} }
#endif #endif

View File

@ -7,6 +7,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by

View File

@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -91,6 +92,7 @@ public:
// The view is trivially copy constructible and may be copied to an accelerator device // The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas // in device lambdas
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (ViewMode mode) const LatticeView<vobj> View (ViewMode mode) const
{ {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode); LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);

View File

@ -0,0 +1,226 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_basis.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
// If assume basis[j] are already orthonormal,
// can take all inner products in parallel saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View;
Vector<View> basis_v; basis_v.reserve(basis.size());
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
View *basis_vp = &basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
return;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
Qt_p[i]=Qt(j,k);
});
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
decltype(coalescedRead(Bp[ss])) z;
z=Zero();
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
}
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View(AcceleratorRead)) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorRead));
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
NAMESPACE_END(Grid);

View File

@ -40,8 +40,11 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead); autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite); autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss))); coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}); });
@ -50,8 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead); autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite); autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
}); });

View File

@ -5,6 +5,7 @@
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or the Free Software Foundation; either version 2 of the License, or
@ -64,6 +65,37 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
return ssum; return ssum;
} }
template<class vobj>
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
@ -74,6 +106,15 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
return sum_cpu(arg,osites); return sum_cpu(arg,osites);
#endif #endif
} }
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
@ -101,7 +142,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
// Double inner product // Double inner product
template<class vobj> template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
@ -113,32 +154,37 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
// Might make all code paths go this way. // Might make all code paths go this way.
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead); autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead); autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance... // GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss); auto x_l = left_v(ss);
auto y_l = right_v(ss); auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
}) })
}
// This is in single precision and fails some tests // This is in single precision and fails some tests
// Need a sumD that sums in double // Need a sumD that sums in double
#if defined(GRID_CUDA)||defined(GRID_HIP) nrm = TensorRemove(sumD(inner_tmp_v,sites));
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
nrm = TensorRemove(sum_cpu(inner_tmp_v,sites));
#endif
grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
ComplexD nrm = rankInnerProduct(left,right);
grid->GlobalSum(nrm);
return nrm;
}
///////////////////////// /////////////////////////
// Fast axpby_norm // Fast axpby_norm
// z = a x + b y // z = a x + b y
@ -181,16 +227,50 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
#if defined(GRID_CUDA)||defined(GRID_HIP) nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class vobj> strong_inline void
innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
{
conformable(left,right);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
// GPU
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
typedef decltype(innerProduct(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
autoView(left_v,left, AcceleratorRead);
autoView(right_v,right,AcceleratorRead);
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
});
}
tmp[0] = TensorRemove(sumD(inner_tmp_v,sites));
tmp[1] = TensorRemove(sumD(norm_tmp_v,sites));
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0];
nrm = real(tmp[1]);
}
template<class Op,class T1> template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)

View File

@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace // Trace
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
/*
template<class vobj> template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{ {
@ -48,6 +49,7 @@ inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
}); });
return ret; return ret;
}; };
*/
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation // Trace Index level dependent operation

View File

@ -6,6 +6,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -84,30 +85,136 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
}); });
} }
template<class vobj,class CComplex,int nbasis> ////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful
// treatment of scalar-compatible types
////////////////////////////////////////////////////////////////////////////////////////////
accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
out = in;
}
accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
out = in;
}
#ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
}
#endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
}
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
convertType(out,in._internal);
}
template<typename T1,typename T2>
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
for (int i=0;i<N;i++)
for (int j=0;j<N;j++)
convertType(out._internal[i][j],in._internal[i][j]);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
for (int i=0;i<N;i++)
convertType(out._internal[i],in._internal[i]);
}
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
accelerator_inline void convertType(T & out, const T & in) {
out = in;
}
template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
autoView( out_v , out,AcceleratorWrite);
autoView( in_v , in ,AcceleratorRead);
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
convertType(out_v[ss],in_v(ss));
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// precision-promoted local inner product
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
{
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid());
{
autoView(ret_v, ret,AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
});
}
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////
// block routines
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData, const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis) const VLattice &Basis)
{ {
GridBase * fine = fineData.Grid(); GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid(); GridBase * coarse= coarseData.Grid();
Lattice<CComplex> ip(coarse); Lattice<iScalar<CComplex>> ip(coarse);
Lattice<vobj> fineDataRed = fineData;
autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite); autoView( ip_ , ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
blockInnerProduct(ip,Basis[v],fineData); blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
coalescedWrite(coarseData_[sc](v),ip_(sc)); convertType(coarseData_[sc](v),ip_[sc]);
}); });
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
} }
} }
template<class vobj,class CComplex>
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
const Lattice<CComplex> &coarseA, const Lattice<CComplex> &coarseA,
const Lattice<vobj> &fineX, const Lattice<vobj2> &fineX,
const Lattice<vobj> &fineY) const Lattice<vobj> &fineY)
{ {
GridBase * fine = fineZ.Grid(); GridBase * fine = fineZ.Grid();
@ -145,13 +252,50 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y // z = A x + y
coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); #ifdef GRID_SIMT
typename vobj2::tensor_reduced::scalar_object cA;
typename vobj::scalar_object cAx;
#else
typename vobj2::tensor_reduced cA;
vobj cAx;
#endif
convertType(cA,TensorRemove(coarseA_(sc)));
auto prod = cA*fineX_(sf);
convertType(cAx,prod);
coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
}); });
return; return;
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY)
{
typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
GridBase *coarse(CoarseInner.Grid());
GridBase *fine (fineX.Grid());
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse);
// Precision promotion
fine_inner = localInnerProductD(fineX,fineY);
blockSum(coarse_inner,fine_inner);
{
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
});
}
}
template<class vobj,class CComplex> // deprecate
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner, inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX, const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY) const Lattice<vobj> &fineY)
@ -167,12 +311,15 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
// Precision promotion? // Precision promotion?
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
{
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite); autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner, AcceleratorRead); autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, { accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss]; CoarseInner_[ss] = coarse_inner_[ss];
}); });
} }
}
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX) inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
{ {
@ -229,6 +376,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
return; return;
} }
template<class vobj> template<class vobj>
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor) inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
{ {
@ -250,8 +398,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
} }
} }
template<class vobj,class CComplex> template<class CComplex,class VLattice>
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
{ {
GridBase *coarse = ip.Grid(); GridBase *coarse = ip.Grid();
GridBase *fine = Basis[0].Grid(); GridBase *fine = Basis[0].Grid();
@ -267,15 +415,22 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
for(int u=0;u<v;u++) { for(int u=0;u<v;u++) {
//Inner product & remove component //Inner product & remove component
blockInnerProduct(ip,Basis[u],Basis[v]); blockInnerProductD(ip,Basis[u],Basis[v]);
ip = -ip; ip = -ip;
blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]); blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
} }
blockNormalise(ip,Basis[v]); blockNormalise(ip,Basis[v]);
} }
} }
template<class vobj,class CComplex>
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
{
blockOrthonormalize(ip,Basis);
}
#if 0 #if 0
// TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,
@ -320,17 +475,17 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
#else #else
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis) const VLattice &Basis)
{ {
GridBase * fine = fineData.Grid(); GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid(); GridBase * coarse= coarseData.Grid();
fineData=Zero(); fineData=Zero();
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i); Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
Lattice<CComplex> cip(coarse); Lattice<CComplex> cip(coarse);
autoView( cip_ , cip, AcceleratorWrite); autoView( cip_ , cip, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorRead); autoView( ip_ , ip, AcceleratorRead);
@ -407,6 +562,7 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions; Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride; Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride; Coordinate ost = Tg->_ostride;
autoView( t_v , To, AcceleratorWrite); autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead); autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{ accelerator_for(idx,Fg->lSites(),1,{

View File

@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose // Transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
/*
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
@ -48,6 +49,7 @@ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
}); });
return ret; return ret;
}; };
*/
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose // Index level dependent transpose

View File

@ -341,7 +341,7 @@ class BinaryIO {
int ieee32big = (format == std::string("IEEE32BIG")); int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32")); int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG")); int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64")); int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
assert(ieee64||ieee32|ieee64big||ieee32big); assert(ieee64||ieee32|ieee64big||ieee32big);
assert((ieee64+ieee32+ieee64big+ieee32big)==1); assert((ieee64+ieee32+ieee64big+ieee32big)==1);
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////

View File

@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
}; };
}; };
template<class fobj,class sobj>
struct GaugeDoubleStoredMunger{
void operator()(fobj &in, sobj &out) {
for (int mu = 0; mu < Nds; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template <class fobj, class sobj>
struct GaugeDoubleStoredUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nds; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template<class fobj,class sobj> template<class fobj,class sobj>
struct Gauge3x2munger{ struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){ void operator() (fobj &in,sobj &out){

View File

@ -146,7 +146,7 @@ public:
int ieee32big = (format == std::string("IEEE32BIG")); int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32")); int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG")); int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64")); int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger; // depending on datatype, set up munger;

224
Grid/parallelIO/OpenQcdIO.h Normal file
View File

@ -0,0 +1,224 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIO.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
struct OpenQcdHeader : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
int, Nt,
int, Nx,
int, Ny,
int, Nz,
double, plaq);
};
class OpenQcdIO : public BinaryIO {
public:
static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
OpenQcdHeader header;
{
std::ifstream fin(file, std::ios::in | std::ios::binary);
fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
assert(!fin.fail());
field.data_start = fin.tellg();
fin.close();
}
header.plaq /= normalisationFactor;
// sanity check (should trigger on endian issues)
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
field.dimension[2] = header.Nz;
field.dimension[3] = header.Nt;
std::cout << GridLogDebug << "header: " << header << std::endl;
std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
assert(grid->_ndimension == Nd);
for(int d = 0; d < Nd; d++)
assert(grid->_fdimensions[d] == field.dimension[d]);
field.plaquette = header.plaq;
return field.data_start;
}
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
FieldMetaData& header,
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
assert(grid != nullptr); assert(grid->_ndimension == Nd);
uint64_t offset = readHeader(file, Umu.Grid(), header);
FieldMetaData clone(header);
std::string format("IEEE64"); // they always store little endian double precsision
uint32_t nersc_csum, scidac_csuma, scidac_csumb;
GridCartesian* grid_openqcd = createOpenQcdGrid(grid);
GridRedBlackCartesian* grid_rb = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
typedef DoubleStoredColourMatrixD fobj;
typedef typename DoubleStoredGaugeField::vector_object::scalar_object sobj;
typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
word w = 0;
std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
std::vector<sobj> scalardata(grid->lSites());
IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
nersc_csum, scidac_csuma, scidac_csumb);
GridStopWatch timer;
timer.Start();
DoubleStoredGaugeField Umu_ds(grid);
auto munge = GaugeDoubleStoredMunger<DoubleStoredColourMatrixD, DoubleStoredColourMatrix>();
Coordinate ldim = grid->LocalDimensions();
thread_for(idx_g, grid->lSites(), {
Coordinate coor;
grid->LocalIndexToLocalCoor(idx_g, coor);
bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
if(!isOdd) continue;
int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
+ coor[Xdir] * ldim[Ydir] * ldim[Zdir]
+ coor[Ydir] * ldim[Zdir]
+ coor[Zdir])/2;
munge(iodata[idx_o], scalardata[idx_g]);
});
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
timer.Reset(); timer.Start();
vectorizeFromLexOrdArray(scalardata, Umu_ds);
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
timer.Reset(); timer.Start();
undoDoubleStore(Umu, Umu_ds);
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
GaugeStatistics(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
// clang-format off
std::cout << GridLogMessage << "OpenQcd Configuration " << file
<< " plaquette " << clone.plaquette
<< " header " << header.plaquette
<< " difference " << plaq_diff
<< std::endl;
// clang-format on
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
std::string file) {
std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
exit(EXIT_FAILURE);
}
private:
static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
// exploit GridCartesian to be able to still use IOobject
Coordinate gdim = grid->GlobalDimensions();
Coordinate ldim = grid->LocalDimensions();
Coordinate pcoor = grid->ThisProcessorCoor();
// openqcd does rb on the z direction
gdim[Zdir] /= 2;
ldim[Zdir] /= 2;
// and has the order T X Y Z (from slowest to fastest)
std::swap(gdim[Xdir], gdim[Zdir]);
std::swap(ldim[Xdir], ldim[Zdir]);
std::swap(pcoor[Xdir], pcoor[Zdir]);
GridCartesian* ret = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
ret->_ldimensions = ldim;
ret->_processor_coor = pcoor;
return ret;
}
template<class vsimd>
static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
conformable(Umu.Grid(), Umu_ds.Grid());
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
// they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
for(int mu_g = 0; mu_g < Nd; ++mu_g) {
int mu_o = (mu_g + 1) % Nd;
U = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o)
+ Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1);
PokeIndex<LorentzIndex>(Umu, U, mu_g);
}
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,281 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <ios>
#include <iostream>
#include <limits>
#include <iomanip>
#include <mpi.h>
#include <ostream>
#include <string>
#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;}
#define CHECK_VAR(a) { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; }
// #undef CHECK
// #define CHECK
NAMESPACE_BEGIN(Grid);
class ParRdr {
private:
bool const swap;
MPI_Status status;
MPI_File fp;
int err;
MPI_Datatype oddSiteType;
MPI_Datatype fileViewType;
GridBase* grid;
public:
ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
: swap(false)
, grid(gridPtr) {
err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
assert(err == MPI_SUCCESS);
}
virtual ~ParRdr() { MPI_File_close(&fp); }
inline void errInfo(int const err, std::string const& func) {
static char estring[MPI_MAX_ERROR_STRING];
int eclass = -1, len = 0;
MPI_Error_class(err, &eclass);
MPI_Error_string(err, estring, &len);
std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
}
int readHeader(FieldMetaData& field) {
assert((grid->_ndimension == Nd) && (Nd == 4));
assert(Nc == 3);
OpenQcdHeader header;
readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
header.plaq /= 3.; // TODO change this into normalizationfactor
// sanity check (should trigger on endian issues) TODO remove?
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
field.dimension[2] = header.Nz;
field.dimension[3] = header.Nt;
for(int d = 0; d < Nd; d++)
assert(grid->FullDimensions()[d] == field.dimension[d]);
field.plaquette = header.plaq;
field.data_start = sizeof(OpenQcdHeader);
return field.data_start;
}
void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
errInfo(err, "MPI_File_read_at_all");
// CHECK_VAR(err)
int read = -1;
MPI_Get_count(&status, datatype, &read);
// CHECK_VAR(read)
assert(nbytes == (uint64_t)read);
assert(err == MPI_SUCCESS);
}
void createTypes() {
constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
Coordinate const L = grid->GlobalDimensions();
Coordinate const l = grid->LocalDimensions();
Coordinate const i = grid->ThisProcessorCoor();
Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
}
void freeTypes() {
err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
}
bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
auto hdr_offset = readHeader(meta);
CHECK
createTypes();
err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
CHECK
int const domainSites = grid->lSites();
domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
// the actual READ
constexpr uint64_t cm_size = 2 * Nc * Nc * sizeof(double); // 2_complex
constexpr uint64_t os_size = Nd * 2 * cm_size; // 2_fwdbwd
constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int adressable elems: floor is fine
uint64_t const n_os = domainSites / 2;
for(uint64_t os_idx = 0; os_idx < n_os;) {
uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
uint64_t const cm = os_idx * Nd * 2;
readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
os_idx += read_os;
}
CHECK
err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
errInfo(err, "MPI_File_set_view1");
assert(err == MPI_SUCCESS);
freeTypes();
std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
return true;
}
};
class OpenQcdIOChromaReference : public BinaryIO {
public:
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Grid::FieldMetaData& header,
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = Umu.Grid();
typedef ColourMatrixD fobj;
std::vector<fobj> iodata(
Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
{
ParRdr rdr(MPI_COMM_WORLD, file, grid);
rdr.readGauge(iodata, header);
} // equivalent to using binaryio
std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
DoubledGaugeField Umu_ds(grid);
vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
FieldMetaData clone(header);
GaugeStatistics(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
// clang-format off
std::cout << GridLogMessage << "OpenQcd Configuration " << file
<< " plaquette " << clone.plaquette
<< " header " << header.plaquette
<< " difference " << plaq_diff
<< std::endl;
// clang-format on
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
private:
template<class vsimd>
static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
Grid::conformable(Umu.Grid(), Umu_ds.Grid());
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
}
static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
std::vector<ColourMatrixD> const& node_buff,
GridBase* grid) {
assert(node_buff.size() == Nd * grid->lSites());
Coordinate const& l = grid->LocalDimensions();
Coordinate coord(Nd);
int& x = coord[0];
int& y = coord[1];
int& z = coord[2];
int& t = coord[3];
int buff_idx = 0;
for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
for(x = 0; x < l[0]; ++x)
for(y = 0; y < l[1]; ++y)
for(z = 0; z < l[2]; ++z) {
if((t + z + y + x) % 2 == 0) continue;
int local_idx;
Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
for(int mu = 0; mu < 2 * Nd; ++mu)
for(int c1 = 0; c1 < Nc; ++c1) {
for(int c2 = 0; c2 < Nc; ++c2) {
u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
}
}
buff_idx += 2 * Nd;
}
assert(node_buff.size() == buff_idx);
}
};
NAMESPACE_END(Grid);

View File

@ -110,15 +110,15 @@ public:
#endif #endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start); accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
} }
GridTime Elapsed(void) { GridTime Elapsed(void) const {
assert(running == false); assert(running == false);
return std::chrono::duration_cast<GridTime>( accumulator ); return std::chrono::duration_cast<GridTime>( accumulator );
} }
uint64_t useconds(void){ uint64_t useconds(void) const {
assert(running == false); assert(running == false);
return (uint64_t) accumulator.count(); return (uint64_t) accumulator.count();
} }
bool isRunning(void){ bool isRunning(void) const {
return running; return running;
} }
}; };

View File

@ -40,8 +40,8 @@ public:
public: public:
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operations // half checkerboard operations
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);

View File

@ -41,8 +41,8 @@ public:
public: public:
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);

View File

@ -53,8 +53,8 @@ public:
virtual void DtildeInv (const FermionField& in, FermionField& out); virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply // override multiply
virtual RealD M (const FermionField& in, FermionField& out); virtual void M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out); virtual void Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations // half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out); virtual void Mooee (const FermionField& in, FermionField& out);

View File

@ -58,8 +58,8 @@ public:
virtual GridBase *GaugeRedBlackGrid(void) =0; virtual GridBase *GaugeRedBlackGrid(void) =0;
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out)=0; virtual void M (const FermionField &in, FermionField &out)=0;
virtual RealD Mdag (const FermionField &in, FermionField &out)=0; virtual void Mdag (const FermionField &in, FermionField &out)=0;
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out)=0; virtual void Meooe (const FermionField &in, FermionField &out)=0;
@ -86,7 +86,6 @@ public:
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0; virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0; virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

View File

@ -71,8 +71,8 @@ public:
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out); void M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out); void Mdag(const FermionField &in, FermionField &out);
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// half checkerboard operations // half checkerboard operations

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -74,8 +73,8 @@ public:
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now // full checkerboard operations; leave unimplemented as abstract for now
RealD M (const FermionField &in, FermionField &out); void M (const FermionField &in, FermionField &out);
RealD Mdag (const FermionField &in, FermionField &out); void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operations // half checkerboard operations
void Meooe (const FermionField &in, FermionField &out); void Meooe (const FermionField &in, FermionField &out);

View File

@ -56,8 +56,8 @@ public:
virtual void DtildeInv (const FermionField& in, FermionField& out); virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply // override multiply
virtual RealD M (const FermionField& in, FermionField& out); virtual void M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out); virtual void Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations // half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out); virtual void Mooee (const FermionField& in, FermionField& out);

View File

@ -59,7 +59,7 @@ public:
{ {
RealD eps = 1.0; RealD eps = 1.0;
std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl; // std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
assert(zdata->n==this->Ls); assert(zdata->n==this->Ls);

View File

@ -71,8 +71,8 @@ public:
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out); void M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out); void Mdag(const FermionField &in, FermionField &out);
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// half checkerboard operations // half checkerboard operations

View File

@ -47,8 +47,8 @@ public:
void M_internal(const FermionField &in, FermionField &out,int dag); void M_internal(const FermionField &in, FermionField &out,int dag);
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);

View File

@ -109,9 +109,8 @@ public:
ImportGauge(_Umu); ImportGauge(_Umu);
} }
virtual RealD M(const FermionField &in, FermionField &out); virtual void M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out); virtual void Mdag(const FermionField &in, FermionField &out);
virtual void Mooee(const FermionField &in, FermionField &out); virtual void Mooee(const FermionField &in, FermionField &out);
virtual void MooeeDag(const FermionField &in, FermionField &out); virtual void MooeeDag(const FermionField &in, FermionField &out);
virtual void MooeeInv(const FermionField &in, FermionField &out); virtual void MooeeInv(const FermionField &in, FermionField &out);

View File

@ -78,8 +78,8 @@ public:
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
virtual RealD M(const FermionField &in, FermionField &out); virtual void M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out); virtual void Mdag(const FermionField &in, FermionField &out);
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// half checkerboard operations // half checkerboard operations

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -99,8 +98,8 @@ public:
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now // full checkerboard operations; leave unimplemented as abstract for now
virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; virtual void M (const FermionField &in, FermionField &out){assert(0);};
virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now // half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};

View File

@ -120,7 +120,8 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
} }
} }
virtual RealD M(const FermionField &in, FermionField &out) { virtual void M(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerNo); this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid()); FermionField tmp(out.Grid());
@ -129,11 +130,12 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
ComplexD b(0.0,this->mu[s]); ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(tmp,a,in,b,in,s,s); axpbg5y_ssp(tmp,a,in,b,in,s,s);
} }
return axpy_norm(out, 1.0, tmp, out); axpy(out, 1.0, tmp, out);
} }
// needed for fast PV // needed for fast PV
void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) { void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu)
{
assert(_mass.size() == _mu.size()); assert(_mass.size() == _mu.size());
assert(_mass.size() == this->FermionGrid()->_fdimensions[0]); assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
this->mass = _mass; this->mass = _mass;

View File

@ -323,7 +323,7 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
} }
template<class Impl> template<class Impl>
RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -335,11 +335,10 @@ RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
axpby(chi,1.0,1.0,chi,psi); axpby(chi,1.0,1.0,chi,psi);
M5D(psi,chi); M5D(psi,chi);
return(norm2(chi));
} }
template<class Impl> template<class Impl>
RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
{ {
// Under adjoint // Under adjoint
//D1+ D1- P- -> D1+^dag P+ D2-^dag //D1+ D1- P- -> D1+^dag P+ D2-^dag
@ -354,7 +353,6 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
M5Ddag(psi,chi); M5Ddag(psi,chi);
// ((b D_W + D_w hop terms +1) on s-diag // ((b D_W + D_w hop terms +1) on s-diag
axpby (chi,1.0,1.0,chi,psi); axpby (chi,1.0,1.0,chi,psi);
return norm2(chi);
} }
// half checkerboard operations // half checkerboard operations

View File

@ -94,7 +94,7 @@ void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Ap
template<class Impl> template<class Impl>
RealD ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, FermionField &chi) void ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{ {
int Ls = this->Ls; int Ls = this->Ls;
@ -116,15 +116,14 @@ RealD ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, F
} }
sign=-sign; sign=-sign;
} }
return norm2(chi);
} }
template<class Impl> template<class Impl>
RealD ContinuedFractionFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi) void ContinuedFractionFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
{ {
// This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
// The rest of matrix is symmetric. // The rest of matrix is symmetric.
// Can ignore "dag" // Can ignore "dag"
return M(psi,chi); M(psi,chi);
} }
template<class Impl> template<class Impl>
void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){

View File

@ -89,7 +89,7 @@ void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionFiel
/*****************************************************************************************************/ /*****************************************************************************************************/
template<class Impl> template<class Impl>
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi) void DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -97,11 +97,10 @@ RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
this->DW(Din, chi, DaggerNo); this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi); this->M5D(psi, chi);
return(norm2(chi));
} }
template<class Impl> template<class Impl>
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi) void DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -109,7 +108,6 @@ RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& c
this->MeooeDag5D(Din, chi); this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi); this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
} }
/******************************************************************** /********************************************************************

View File

@ -470,21 +470,24 @@ void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vect
assert(0); assert(0);
} }
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo); DhopEO(in, out, DaggerNo);
} else { } else {
@ -492,7 +495,8 @@ void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionFiel
} }
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes); DhopEO(in, out, DaggerYes);
} else { } else {
@ -501,27 +505,30 @@ void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionF
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass); typename FermionField::scalar_type scal(mass);
out = scal * in; out = scal * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Mooee(in, out); Mooee(in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in; out = (1.0 / (mass)) * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in, void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,FermionField &out)
FermionField &out) { {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out); MooeeInv(in, out);
} }

View File

@ -171,21 +171,24 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
///////////////////////////// /////////////////////////////
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo); DhopEO(in, out, DaggerNo);
} else { } else {
@ -193,7 +196,8 @@ void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField
} }
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes); DhopEO(in, out, DaggerYes);
} else { } else {
@ -202,27 +206,30 @@ void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionFie
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass); typename FermionField::scalar_type scal(mass);
out = scal * in; out = scal * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Mooee(in, out); Mooee(in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in; out = (1.0 / (mass)) * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,FermionField &out)
FermionField &out) { {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out); MooeeInv(in, out);
} }
@ -234,7 +241,8 @@ void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,
GaugeField & mat, GaugeField & mat,
const FermionField &A, const FermionField &B, int dag) { const FermionField &A, const FermionField &B, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor; Compressor compressor;
@ -284,8 +292,8 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _grid); conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid()); conformable(U.Grid(), mat.Grid());
@ -296,8 +304,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionFie
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _cbgrid); conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid()); conformable(U.Grid(), mat.Grid());
@ -310,8 +318,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _cbgrid); conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid()); conformable(U.Grid(), mat.Grid());

View File

@ -166,7 +166,7 @@ void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& c
/*****************************************************************************************************/ /*****************************************************************************************************/
template<class Impl> template<class Impl>
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi) void MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -174,11 +174,10 @@ RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
this->DW(Din, chi, DaggerNo); this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi); this->M5D(psi, chi);
return(norm2(chi));
} }
template<class Impl> template<class Impl>
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi) void MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -186,7 +185,6 @@ RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
this->MeooeDag5D(Din, chi); this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi); this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
} }
/******************************************************************** /********************************************************************

View File

@ -128,17 +128,17 @@ void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
///////////////////////////// /////////////////////////////
template <class Impl> template <class Impl>
RealD NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) { void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
RealD NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>

View File

@ -269,16 +269,14 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
} }
template<class Impl> template<class Impl>
RealD PartialFractionFermion5D<Impl>::M (const FermionField &in, FermionField &out) void PartialFractionFermion5D<Impl>::M (const FermionField &in, FermionField &out)
{ {
M_internal(in,out,DaggerNo); M_internal(in,out,DaggerNo);
return norm2(out);
} }
template<class Impl> template<class Impl>
RealD PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out) void PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
{ {
M_internal(in,out,DaggerYes); M_internal(in,out,DaggerYes);
return norm2(out);
} }
template<class Impl> template<class Impl>

View File

@ -35,7 +35,7 @@ NAMESPACE_BEGIN(Grid);
// *NOT* EO // *NOT* EO
template <class Impl> template <class Impl>
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out) void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
{ {
FermionField temp(out.Grid()); FermionField temp(out.Grid());
@ -47,11 +47,10 @@ RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
Mooee(in, temp); Mooee(in, temp);
out += temp; out += temp;
return norm2(out);
} }
template <class Impl> template <class Impl>
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out) void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{ {
FermionField temp(out.Grid()); FermionField temp(out.Grid());
@ -63,7 +62,6 @@ RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
MooeeDag(in, temp); MooeeDag(in, temp);
out += temp; out += temp;
return norm2(out);
} }
template <class Impl> template <class Impl>
@ -132,14 +130,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
} }
template <class Impl> template <class Impl>

View File

@ -102,21 +102,24 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
///////////////////////////// /////////////////////////////
template <class Impl> template <class Impl>
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, diag_mass, in, out); axpy(out, diag_mass, in, out);
} }
template <class Impl> template <class Impl>
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, diag_mass, in, out); axpy(out, diag_mass, in, out);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo); DhopEO(in, out, DaggerNo);
} else { } else {
@ -125,7 +128,8 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes); DhopEO(in, out, DaggerYes);
} else { } else {
@ -134,26 +138,30 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(diag_mass); typename FermionField::scalar_type scal(diag_mass);
out = scal * in; out = scal * in;
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Mooee(in, out); Mooee(in, out);
} }
template<class Impl> template<class Impl>
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
out = (1.0/(diag_mass))*in; out = (1.0/(diag_mass))*in;
} }
template<class Impl> template<class Impl>
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out); MooeeInv(in,out);
} }
@ -249,7 +257,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _grid); conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid()); conformable(U.Grid(), mat.Grid());
@ -260,7 +269,8 @@ void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, cons
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _cbgrid); conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
//conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido) //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
@ -274,7 +284,8 @@ void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, co
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _cbgrid); conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
//conformable(U.Grid(), mat.Grid()); //conformable(U.Grid(), mat.Grid());
@ -287,7 +298,8 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) { void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid()); conformable(in.Grid(), out.Grid());
@ -297,7 +309,8 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) { void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@ -308,7 +321,8 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) { void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@ -386,7 +400,8 @@ template <class Impl>
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) { FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -436,7 +451,8 @@ template <class Impl>
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) { FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);

View File

@ -447,19 +447,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif #endif
} else if( interior ) { } else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif #endif
} else if( exterior ) { } else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif #endif
} }
assert(0 && " Kernel optimisation case not covered "); assert(0 && " Kernel optimisation case not covered ");

View File

@ -59,7 +59,7 @@ public:
} }
static inline GaugeLinkField static inline GaugeLinkField
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
return Cshift(adj(Link), mu, -1); return Cshift(closure(adj(Link)), mu, -1);
} }
static inline GaugeLinkField static inline GaugeLinkField
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {

View File

@ -39,6 +39,10 @@ directory
#include <Grid/parallelIO/IldgIOtypes.h> #include <Grid/parallelIO/IldgIOtypes.h>
#include <Grid/parallelIO/IldgIO.h> #include <Grid/parallelIO/IldgIO.h>
#include <Grid/parallelIO/NerscIO.h> #include <Grid/parallelIO/NerscIO.h>
#include <Grid/parallelIO/OpenQcdIO.h>
#if !defined(GRID_COMMS_NONE)
#include <Grid/parallelIO/OpenQcdIOChromaReference.h>
#endif
NAMESPACE_CHECK(Ildg); NAMESPACE_CHECK(Ildg);
#include <Grid/qcd/hmc/checkpointers/CheckPointers.h> #include <Grid/qcd/hmc/checkpointers/CheckPointers.h>

View File

@ -80,6 +80,7 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>, static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>, static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB"); HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB");
static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>, static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,

View File

@ -46,7 +46,7 @@ public:
typedef typename SpinMatrixField::vector_object sobj; typedef typename SpinMatrixField::vector_object sobj;
static const int epsilon[6][3] ; static const int epsilon[6][3] ;
static const Complex epsilon_sgn[6]; static const Real epsilon_sgn[6];
private: private:
template <class mobj, class robj> template <class mobj, class robj>
@ -151,14 +151,18 @@ public:
template <class FImpl> template <class FImpl>
const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
template <class FImpl> /*template <class FImpl>
const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1), const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
Complex(1), Complex(1),
Complex(1), Complex(1),
Complex(-1), Complex(-1),
Complex(-1), Complex(-1),
Complex(-1)}; Complex(-1)};
*/
template <class FImpl>
const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
//This is the old version
template <class FImpl> template <class FImpl>
template <class mobj, class robj> template <class mobj, class robj>
void BaryonUtils<FImpl>::baryon_site(const mobj &D1, void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
@ -174,11 +178,13 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
{ {
Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
auto gD1a = GammaA_left * GammaA_right * D1; auto gD1a = GammaA_left * GammaA_right * D1;
auto gD1b = GammaA_left * g4 * GammaA_right * D1; auto gD1b = GammaA_left * g4 * GammaA_right * D1;
auto pD1 = 0.5* (gD1a + (double)parity * gD1b); auto pD1 = 0.5* (gD1a + (Real)parity * gD1b);
auto gD3 = GammaB_right * D3; auto gD3 = GammaB_right * D3;
auto D2g = D2 * GammaB_left;
auto pD1g = pD1 * GammaB_left;
auto gD3g = gD3 * GammaB_left;
for (int ie_left=0; ie_left < 6 ; ie_left++){ for (int ie_left=0; ie_left < 6 ; ie_left++){
int a_left = epsilon[ie_left][0]; //a int a_left = epsilon[ie_left][0]; //a
@ -188,59 +194,78 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
int a_right = epsilon[ie_right][0]; //a' int a_right = epsilon[ie_right][0]; //a'
int b_right = epsilon[ie_right][1]; //b' int b_right = epsilon[ie_right][1]; //b'
int c_right = epsilon[ie_right][2]; //c' int c_right = epsilon[ie_right][2]; //c'
Real ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
//This is the \delta_{456}^{123} part //This is the \delta_{456}^{123} part
if (wick_contraction[0]){ if (wick_contraction[0]){
auto D2g = D2 * GammaB_left; for (int gamma_left=0; gamma_left<Ns; gamma_left++){
auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
for (int alpha_right=0; alpha_right<Ns; alpha_right++){ for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){ for (int beta_left=0; beta_left<Ns; beta_left++){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){ auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left); auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
}}} result()()() += eepD1*D2g_ab*gD3_ab;
}}
}
} }
//This is the \delta_{456}^{231} part //This is the \delta_{456}^{231} part
if (wick_contraction[1]){ if (wick_contraction[1]){
auto pD1g = pD1 * GammaB_left;
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){ for (int gamma_left=0; gamma_left<Ns; gamma_left++){
result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left); for (int alpha_right=0; alpha_right<Ns; alpha_right++){
}}} auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
result()()() += eepD1g_gb*D2_ab*gD3_ag;
}
}}
} }
//This is the \delta_{456}^{312} part //This is the \delta_{456}^{312} part
if (wick_contraction[2]){ if (wick_contraction[2]){
auto gD3g = gD3 * GammaB_left;
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){ for (int gamma_left=0; gamma_left<Ns; gamma_left++){
result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left); for (int alpha_right=0; alpha_right<Ns; alpha_right++){
}}} auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
result()()() += eepD1_gb*D2_ag*gD3g_ab;
}
}}
} }
//This is the \delta_{456}^{132} part //This is the \delta_{456}^{132} part
if (wick_contraction[3]){ if (wick_contraction[3]){
auto gD3g = gD3 * GammaB_left; for (int gamma_left=0; gamma_left<Ns; gamma_left++){
auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
for (int alpha_right=0; alpha_right<Ns; alpha_right++){ for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){ for (int beta_left=0; beta_left<Ns; beta_left++){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){ auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left); auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
}}} result()()() -= eepD1*D2_ab*gD3g_ab;
}}
}
} }
//This is the \delta_{456}^{321} part //This is the \delta_{456}^{321} part
if (wick_contraction[4]){ if (wick_contraction[4]){
auto D2g = D2 * GammaB_left;
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){ for (int gamma_left=0; gamma_left<Ns; gamma_left++){
result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left); for (int alpha_right=0; alpha_right<Ns; alpha_right++){
}}} auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
result()()() -= eepD1_gb*D2g_ab*gD3_ag;
}
}}
} }
//This is the \delta_{456}^{213} part //This is the \delta_{456}^{213} part
if (wick_contraction[5]){ if (wick_contraction[5]){
auto pD1g = pD1 * GammaB_left;
for (int alpha_right=0; alpha_right<Ns; alpha_right++){
for (int beta_left=0; beta_left<Ns; beta_left++){
for (int gamma_left=0; gamma_left<Ns; gamma_left++){ for (int gamma_left=0; gamma_left<Ns; gamma_left++){
result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left); for (int alpha_right=0; alpha_right<Ns; alpha_right++){
}}} auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
for (int beta_left=0; beta_left<Ns; beta_left++){
auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
result()()() -= eepD1g_gb*D2_ag*gD3_ab;
}
}}
} }
} }
} }
@ -259,6 +284,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
const int parity, const int parity,
ComplexField &baryon_corr) ComplexField &baryon_corr)
{ {
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
@ -278,18 +307,32 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
autoView( v2 , q2_left, CpuRead); autoView( v2 , q2_left, CpuRead);
autoView( v3 , q3_left, CpuRead); autoView( v3 , q3_left, CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), { Real bytes =0.;
thread_for(ss,grid->oSites(),{ bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
//for(int ss=0; ss < grid->oSites(); ss++){ for (int ie=0; ie < 6 ; ie++){
if(ie==0 or ie==3){
bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contraction[ie];
}
else{
bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contraction[ie];
}
}
Real t=0.;
t =-usecond();
accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto D1 = v1[ss]; auto D1 = v1[ss];
auto D2 = v2[ss]; auto D2 = v2[ss];
auto D3 = v3[ss]; auto D3 = v3[ss];
vobj result=Zero(); vobj result=Zero();
baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
vbaryon_corr[ss] = result; vbaryon_corr[ss] = result;
} );//end loop over lattice sites } );//end loop over lattice sites
t += usecond();
std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
} }
template <class FImpl> template <class FImpl>
template <class mobj, class robj> template <class mobj, class robj>
@ -305,6 +348,10 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
const int parity, const int parity,
robj &result) robj &result)
{ {
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl;
std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl;
@ -318,7 +365,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
result=Zero(); result=Zero();
baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
} }
/*********************************************************************** /***********************************************************************
@ -558,6 +605,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
const std::string op, const std::string op,
SpinMatrixField &stn_corr) SpinMatrixField &stn_corr)
{ {
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
GridBase *grid = qs_ti.Grid(); GridBase *grid = qs_ti.Grid();
autoView( vcorr, stn_corr, CpuWrite); autoView( vcorr, stn_corr, CpuWrite);
@ -565,8 +616,7 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
autoView( vd_tf , qd_tf, CpuRead); autoView( vd_tf , qd_tf, CpuRead);
autoView( vs_ti , qs_ti, CpuRead); autoView( vs_ti , qs_ti, CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), { accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{
auto Dq_loop = vq_loop[ss]; auto Dq_loop = vq_loop[ss];
auto Dd_tf = vd_tf[ss]; auto Dd_tf = vd_tf[ss];
auto Ds_ti = vs_ti[ss]; auto Ds_ti = vs_ti[ss];
@ -595,6 +645,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
const std::string op, const std::string op,
SpinMatrixField &stn_corr) SpinMatrixField &stn_corr)
{ {
assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
GridBase *grid = qs_ti.Grid(); GridBase *grid = qs_ti.Grid();
autoView( vcorr , stn_corr, CpuWrite); autoView( vcorr , stn_corr, CpuWrite);

View File

@ -52,6 +52,26 @@ namespace PeriodicBC {
tmp = adj(Link)*field; tmp = adj(Link)*field;
return Cshift(tmp,mu,-1);// moves towards positive mu return Cshift(tmp,mu,-1);// moves towards positive mu
} }
template<class gauge,typename Op, typename T1> auto
CovShiftForward(const Lattice<gauge> &Link,
int mu,
const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftForward(Link,mu,arg);
}
template<class gauge,typename Op, typename T1> auto
CovShiftBackward(const Lattice<gauge> &Link,
int mu,
const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftForward(Link,mu,arg);
}
} }
@ -122,6 +142,26 @@ namespace ConjugateBC {
return Cshift(tmp,mu,-1);// moves towards positive mu return Cshift(tmp,mu,-1);// moves towards positive mu
} }
template<class gauge,typename Op, typename T1> auto
CovShiftForward(const Lattice<gauge> &Link,
int mu,
const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftForward(Link,mu,arg);
}
template<class gauge,typename Op, typename T1> auto
CovShiftBackward(const Lattice<gauge> &Link,
int mu,
const LatticeUnaryExpression<Op,T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> arg(expr);
return CovShiftForward(Link,mu,arg);
}
} }

View File

@ -485,7 +485,7 @@ public:
// Up staple ___ ___ // Up staple ___ ___
// | | // | |
tmp = Cshift(adj(U[nu]), nu, -1); tmp = Cshift(closure(adj(U[nu])), nu, -1);
tmp = adj(U2[mu]) * tmp; tmp = adj(U2[mu]) * tmp;
tmp = Cshift(tmp, mu, -2); tmp = Cshift(tmp, mu, -2);
@ -519,7 +519,7 @@ public:
// //
// | | // | |
tmp = Cshift(adj(U2[nu]), nu, -2); tmp = Cshift(closure(adj(U2[nu])), nu, -2);
tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
tmp = U2[nu] * Cshift(tmp, nu, 2); tmp = U2[nu] * Cshift(tmp, nu, 2);
Stap += Cshift(tmp, mu, 1); Stap += Cshift(tmp, mu, 1);

View File

@ -87,11 +87,7 @@ namespace Grid {
template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType> template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
struct is_tensor_fixed<Eigen::TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType>> struct is_tensor_fixed<Eigen::TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType>>
: public std::true_type {}; : public std::true_type {};
template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType, template<typename T> struct is_tensor_fixed<Eigen::TensorMap<T>> : public std::true_type {};
int MapOptions_, template <class> class MapPointer_>
struct is_tensor_fixed<Eigen::TensorMap<Eigen::TensorFixedSize<Scalar_, Dimensions_,
Options_, IndexType>, MapOptions_, MapPointer_>>
: public std::true_type {};
// Is this a variable-size Eigen tensor // Is this a variable-size Eigen tensor
template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {}; template<typename T, typename V = void> struct is_tensor_variable : public std::false_type {};

View File

@ -114,7 +114,8 @@ THE SOFTWARE.
#define GRID_MACRO_WRITE_MEMBER(A,B) ::Grid::write(WR,#B,obj. B); #define GRID_MACRO_WRITE_MEMBER(A,B) ::Grid::write(WR,#B,obj. B);
#define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\ #define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\
std::string SerialisableClassName(void) const {return std::string(#cname);} \ static inline std::string SerialisableClassName(void) {return std::string(#cname);} \
static constexpr bool isEnum = false; \
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
template <typename T>\ template <typename T>\
static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \ static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
@ -162,6 +163,8 @@ public:\
public:\ public:\
accelerator name(void) : value_(undefname) {}; \ accelerator name(void) : value_(undefname) {}; \
accelerator name(int value): value_(value) {}; \ accelerator name(int value): value_(value) {}; \
static inline std::string SerialisableClassName(void) {return std::string(#name);}\
static constexpr bool isEnum = true; \
template <typename T>\ template <typename T>\
static inline void write(::Grid::Writer<T> &WR,const std::string &s, const name &obj) \ static inline void write(::Grid::Writer<T> &WR,const std::string &s, const name &obj) \
{\ {\

View File

@ -432,12 +432,10 @@ namespace Grid {
std::vector<T> strToVec(const std::string s) std::vector<T> strToVec(const std::string s)
{ {
std::istringstream sstr(s); std::istringstream sstr(s);
T buf;
std::vector<T> v; std::vector<T> v;
while(!sstr.eof()) for(T buf; sstr >> buf;)
{ {
sstr >> buf;
v.push_back(buf); v.push_back(buf);
} }

View File

@ -6,6 +6,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Michael Marshall <michael.marshall@ed.ac.au> Author: Michael Marshall <michael.marshall@ed.ac.au>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -55,6 +56,7 @@ class GridTensorBase {};
using Complexified = typename Traits::Complexified; \ using Complexified = typename Traits::Complexified; \
using Realified = typename Traits::Realified; \ using Realified = typename Traits::Realified; \
using DoublePrecision = typename Traits::DoublePrecision; \ using DoublePrecision = typename Traits::DoublePrecision; \
using DoublePrecision2= typename Traits::DoublePrecision2; \
static constexpr int TensorLevel = Traits::TensorLevel static constexpr int TensorLevel = Traits::TensorLevel
template <class vtype> template <class vtype>

View File

@ -8,6 +8,7 @@
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -194,6 +195,79 @@ auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decl
ret._internal = innerProductD(lhs._internal,rhs._internal); ret._internal = innerProductD(lhs._internal,rhs._internal);
return ret; return ret;
} }
//////////////////////////////////////
// innerProductD2: precision promotion without inner sum
//////////////////////////////////////
accelerator_inline vComplexD2 TensorRemove(const vComplexD2 & x) { return x; };
accelerator_inline vRealD2 TensorRemove(const vRealD2 & x) { return x; };
accelerator_inline ComplexD innerProductD2(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); }
accelerator_inline ComplexD innerProductD2(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD2(const RealD &l,const RealD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD2(const RealF &l,const RealF &r){ return innerProduct(l,r); }
accelerator_inline vComplexD innerProductD2(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); }
accelerator_inline vRealD innerProductD2(const vRealD &l,const vRealD &r){ return innerProduct(l,r); }
accelerator_inline vComplexD2 innerProductD2(const vComplexF &l,const vComplexF &r)
{
vComplexD la,lb;
vComplexD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
vComplexD2 ret;
ret._internal[0] = innerProduct(la,ra);
ret._internal[1] = innerProduct(lb,rb);
return ret;
}
accelerator_inline vRealD2 innerProductD2(const vRealF &l,const vRealF &r)
{
vRealD la,lb;
vRealD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
vRealD2 ret;
ret._internal[0]=innerProduct(la,ra);
ret._internal[1]=innerProduct(lb,rb);
return ret;
}
// Now do it for vector, matrix, scalar
template<class l,class r,int N> accelerator_inline
auto innerProductD2 (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(innerProductD2(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret;
zeroit(ret);
for(int c1=0;c1<N;c1++){
ret._internal += innerProductD2(lhs._internal[c1],rhs._internal[c1]);
}
return ret;
}
template<class l,class r,int N> accelerator_inline
auto innerProductD2 (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret;
ret=Zero();
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=innerProductD2(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}
template<class l,class r> accelerator_inline
auto innerProductD2 (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal,rhs._internal))>
{
typedef decltype(innerProductD2(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
ret._internal = innerProductD2(lhs._internal,rhs._internal);
return ret;
}
////////////////////// //////////////////////
// Keep same precison // Keep same precison
////////////////////// //////////////////////

View File

@ -6,6 +6,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@phys.columbia.edu> Author: Christopher Kelly <ckelly@phys.columbia.edu>
Author: Michael Marshall <michael.marshall@ed.ac.au> Author: Michael Marshall <michael.marshall@ed.ac.au>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or the Free Software Foundation; either version 2 of the License, or
@ -37,6 +38,60 @@ NAMESPACE_BEGIN(Grid);
template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type { static constexpr bool notvalue = false; }; template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type { static constexpr bool notvalue = false; };
template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type { static constexpr bool notvalue = false; }; template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type { static constexpr bool notvalue = false; };
// Traits to identify scalars
template<typename T> struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; };
template<class T> struct isGridScalar<iScalar<T>> : public std::true_type { static constexpr bool notvalue = false; };
// Store double-precision data in single-precision grids for precision promoted localInnerProductD
template<typename T>
class TypePair {
public:
T _internal[2];
TypePair<T>& operator=(const Grid::Zero& o) {
_internal[0] = Zero();
_internal[1] = Zero();
return *this;
}
TypePair<T> operator+(const TypePair<T>& o) const {
TypePair<T> r;
r._internal[0] = _internal[0] + o._internal[0];
r._internal[1] = _internal[1] + o._internal[1];
return r;
}
TypePair<T>& operator+=(const TypePair<T>& o) {
_internal[0] += o._internal[0];
_internal[1] += o._internal[1];
return *this;
}
friend accelerator_inline void add(TypePair<T>* ret, const TypePair<T>* a, const TypePair<T>* b) {
add(&ret->_internal[0],&a->_internal[0],&b->_internal[0]);
add(&ret->_internal[1],&a->_internal[1],&b->_internal[1]);
}
};
typedef TypePair<ComplexD> ComplexD2;
typedef TypePair<RealD> RealD2;
typedef TypePair<vComplexD> vComplexD2;
typedef TypePair<vRealD> vRealD2;
// Traits to identify fundamental data types
template<typename T> struct isGridFundamental : public std::false_type { static constexpr bool notvalue = true; };
template<> struct isGridFundamental<vComplexF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vComplexD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vRealF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vRealD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<ComplexF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<ComplexD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<RealF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<RealD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vComplexD2> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vRealD2> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<ComplexD2> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<RealD2> : public std::true_type { static constexpr bool notvalue = false; };
////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD. // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
// Use of a helper class like this allows us to template specialise and "dress" // Use of a helper class like this allows us to template specialise and "dress"
@ -81,6 +136,7 @@ NAMESPACE_BEGIN(Grid);
typedef ComplexF Complexified; typedef ComplexF Complexified;
typedef RealF Realified; typedef RealF Realified;
typedef RealD DoublePrecision; typedef RealD DoublePrecision;
typedef RealD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base { template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base {
typedef RealD scalar_type; typedef RealD scalar_type;
@ -93,6 +149,20 @@ NAMESPACE_BEGIN(Grid);
typedef ComplexD Complexified; typedef ComplexD Complexified;
typedef RealD Realified; typedef RealD Realified;
typedef RealD DoublePrecision; typedef RealD DoublePrecision;
typedef RealD DoublePrecision2;
};
template<> struct GridTypeMapper<RealD2> : public GridTypeMapper_Base {
typedef RealD2 scalar_type;
typedef RealD2 scalar_typeD;
typedef RealD2 vector_type;
typedef RealD2 vector_typeD;
typedef RealD2 tensor_reduced;
typedef RealD2 scalar_object;
typedef RealD2 scalar_objectD;
typedef ComplexD2 Complexified;
typedef RealD2 Realified;
typedef RealD2 DoublePrecision;
typedef RealD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base { template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base {
typedef ComplexF scalar_type; typedef ComplexF scalar_type;
@ -105,6 +175,7 @@ NAMESPACE_BEGIN(Grid);
typedef ComplexF Complexified; typedef ComplexF Complexified;
typedef RealF Realified; typedef RealF Realified;
typedef ComplexD DoublePrecision; typedef ComplexD DoublePrecision;
typedef ComplexD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base { template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base {
typedef ComplexD scalar_type; typedef ComplexD scalar_type;
@ -117,6 +188,20 @@ NAMESPACE_BEGIN(Grid);
typedef ComplexD Complexified; typedef ComplexD Complexified;
typedef RealD Realified; typedef RealD Realified;
typedef ComplexD DoublePrecision; typedef ComplexD DoublePrecision;
typedef ComplexD DoublePrecision2;
};
template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base {
typedef ComplexD2 scalar_type;
typedef ComplexD2 scalar_typeD;
typedef ComplexD2 vector_type;
typedef ComplexD2 vector_typeD;
typedef ComplexD2 tensor_reduced;
typedef ComplexD2 scalar_object;
typedef ComplexD2 scalar_objectD;
typedef ComplexD2 Complexified;
typedef RealD2 Realified;
typedef ComplexD2 DoublePrecision;
typedef ComplexD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base { template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base {
typedef Integer scalar_type; typedef Integer scalar_type;
@ -129,6 +214,7 @@ NAMESPACE_BEGIN(Grid);
typedef void Complexified; typedef void Complexified;
typedef void Realified; typedef void Realified;
typedef void DoublePrecision; typedef void DoublePrecision;
typedef void DoublePrecision2;
}; };
template<> struct GridTypeMapper<vRealF> : public GridTypeMapper_Base { template<> struct GridTypeMapper<vRealF> : public GridTypeMapper_Base {
@ -142,6 +228,7 @@ NAMESPACE_BEGIN(Grid);
typedef vComplexF Complexified; typedef vComplexF Complexified;
typedef vRealF Realified; typedef vRealF Realified;
typedef vRealD DoublePrecision; typedef vRealD DoublePrecision;
typedef vRealD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<vRealD> : public GridTypeMapper_Base { template<> struct GridTypeMapper<vRealD> : public GridTypeMapper_Base {
typedef RealD scalar_type; typedef RealD scalar_type;
@ -154,6 +241,20 @@ NAMESPACE_BEGIN(Grid);
typedef vComplexD Complexified; typedef vComplexD Complexified;
typedef vRealD Realified; typedef vRealD Realified;
typedef vRealD DoublePrecision; typedef vRealD DoublePrecision;
typedef vRealD DoublePrecision2;
};
template<> struct GridTypeMapper<vRealD2> : public GridTypeMapper_Base {
typedef RealD2 scalar_type;
typedef RealD2 scalar_typeD;
typedef vRealD2 vector_type;
typedef vRealD2 vector_typeD;
typedef vRealD2 tensor_reduced;
typedef RealD2 scalar_object;
typedef RealD2 scalar_objectD;
typedef vComplexD2 Complexified;
typedef vRealD2 Realified;
typedef vRealD2 DoublePrecision;
typedef vRealD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<vRealH> : public GridTypeMapper_Base { template<> struct GridTypeMapper<vRealH> : public GridTypeMapper_Base {
// Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
@ -167,6 +268,7 @@ NAMESPACE_BEGIN(Grid);
typedef vComplexH Complexified; typedef vComplexH Complexified;
typedef vRealH Realified; typedef vRealH Realified;
typedef vRealD DoublePrecision; typedef vRealD DoublePrecision;
typedef vRealD DoublePrecision2;
}; };
template<> struct GridTypeMapper<vComplexH> : public GridTypeMapper_Base { template<> struct GridTypeMapper<vComplexH> : public GridTypeMapper_Base {
// Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
@ -180,6 +282,7 @@ NAMESPACE_BEGIN(Grid);
typedef vComplexH Complexified; typedef vComplexH Complexified;
typedef vRealH Realified; typedef vRealH Realified;
typedef vComplexD DoublePrecision; typedef vComplexD DoublePrecision;
typedef vComplexD DoublePrecision2;
}; };
template<> struct GridTypeMapper<vComplexF> : public GridTypeMapper_Base { template<> struct GridTypeMapper<vComplexF> : public GridTypeMapper_Base {
typedef ComplexF scalar_type; typedef ComplexF scalar_type;
@ -192,6 +295,7 @@ NAMESPACE_BEGIN(Grid);
typedef vComplexF Complexified; typedef vComplexF Complexified;
typedef vRealF Realified; typedef vRealF Realified;
typedef vComplexD DoublePrecision; typedef vComplexD DoublePrecision;
typedef vComplexD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<vComplexD> : public GridTypeMapper_Base { template<> struct GridTypeMapper<vComplexD> : public GridTypeMapper_Base {
typedef ComplexD scalar_type; typedef ComplexD scalar_type;
@ -204,6 +308,20 @@ NAMESPACE_BEGIN(Grid);
typedef vComplexD Complexified; typedef vComplexD Complexified;
typedef vRealD Realified; typedef vRealD Realified;
typedef vComplexD DoublePrecision; typedef vComplexD DoublePrecision;
typedef vComplexD DoublePrecision2;
};
template<> struct GridTypeMapper<vComplexD2> : public GridTypeMapper_Base {
typedef ComplexD2 scalar_type;
typedef ComplexD2 scalar_typeD;
typedef vComplexD2 vector_type;
typedef vComplexD2 vector_typeD;
typedef vComplexD2 tensor_reduced;
typedef ComplexD2 scalar_object;
typedef ComplexD2 scalar_objectD;
typedef vComplexD2 Complexified;
typedef vRealD2 Realified;
typedef vComplexD2 DoublePrecision;
typedef vComplexD2 DoublePrecision2;
}; };
template<> struct GridTypeMapper<vInteger> : public GridTypeMapper_Base { template<> struct GridTypeMapper<vInteger> : public GridTypeMapper_Base {
typedef Integer scalar_type; typedef Integer scalar_type;
@ -216,6 +334,7 @@ NAMESPACE_BEGIN(Grid);
typedef void Complexified; typedef void Complexified;
typedef void Realified; typedef void Realified;
typedef void DoublePrecision; typedef void DoublePrecision;
typedef void DoublePrecision2;
}; };
#define GridTypeMapper_RepeatedTypes \ #define GridTypeMapper_RepeatedTypes \
@ -234,6 +353,7 @@ NAMESPACE_BEGIN(Grid);
using Complexified = iScalar<typename BaseTraits::Complexified>; using Complexified = iScalar<typename BaseTraits::Complexified>;
using Realified = iScalar<typename BaseTraits::Realified>; using Realified = iScalar<typename BaseTraits::Realified>;
using DoublePrecision = iScalar<typename BaseTraits::DoublePrecision>; using DoublePrecision = iScalar<typename BaseTraits::DoublePrecision>;
using DoublePrecision2= iScalar<typename BaseTraits::DoublePrecision2>;
static constexpr int Rank = BaseTraits::Rank + 1; static constexpr int Rank = BaseTraits::Rank + 1;
static constexpr std::size_t count = BaseTraits::count; static constexpr std::size_t count = BaseTraits::count;
static constexpr int Dimension(int dim) { static constexpr int Dimension(int dim) {
@ -248,6 +368,7 @@ NAMESPACE_BEGIN(Grid);
using Complexified = iVector<typename BaseTraits::Complexified, N>; using Complexified = iVector<typename BaseTraits::Complexified, N>;
using Realified = iVector<typename BaseTraits::Realified, N>; using Realified = iVector<typename BaseTraits::Realified, N>;
using DoublePrecision = iVector<typename BaseTraits::DoublePrecision, N>; using DoublePrecision = iVector<typename BaseTraits::DoublePrecision, N>;
using DoublePrecision2= iVector<typename BaseTraits::DoublePrecision2, N>;
static constexpr int Rank = BaseTraits::Rank + 1; static constexpr int Rank = BaseTraits::Rank + 1;
static constexpr std::size_t count = BaseTraits::count * N; static constexpr std::size_t count = BaseTraits::count * N;
static constexpr int Dimension(int dim) { static constexpr int Dimension(int dim) {
@ -262,6 +383,7 @@ NAMESPACE_BEGIN(Grid);
using Complexified = iMatrix<typename BaseTraits::Complexified, N>; using Complexified = iMatrix<typename BaseTraits::Complexified, N>;
using Realified = iMatrix<typename BaseTraits::Realified, N>; using Realified = iMatrix<typename BaseTraits::Realified, N>;
using DoublePrecision = iMatrix<typename BaseTraits::DoublePrecision, N>; using DoublePrecision = iMatrix<typename BaseTraits::DoublePrecision, N>;
using DoublePrecision2= iMatrix<typename BaseTraits::DoublePrecision2, N>;
static constexpr int Rank = BaseTraits::Rank + 2; static constexpr int Rank = BaseTraits::Rank + 2;
static constexpr std::size_t count = BaseTraits::count * N * N; static constexpr std::size_t count = BaseTraits::count * N * N;
static constexpr int Dimension(int dim) { static constexpr int Dimension(int dim) {

View File

@ -56,6 +56,7 @@ std::string GridCmdVectorIntToString(const VectorInt & vec);
void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec); void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
template<class VectorInt> template<class VectorInt>
void GridCmdOptionIntVector(std::string &str,VectorInt & vec); void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
void GridCmdOptionInt(std::string &str,int & val);
void GridParseLayout(char **argv,int argc, void GridParseLayout(char **argv,int argc,

View File

@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace Grid; using namespace Grid;
std::vector<int> L_list; std::vector<int> L_list;
std::vector<int> Ls_list; std::vector<int> Ls_list;
std::vector<double> mflop_list; std::vector<double> mflop_list;
@ -76,7 +75,6 @@ struct controls {
int Opt; int Opt;
int CommsOverlap; int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
// int HugePages;
}; };
class Benchmark { class Benchmark {
@ -119,14 +117,15 @@ public:
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
comms_header(); comms_header();
for(int lat=4;lat<=maxlat;lat+=4){ for(int lat=16;lat<=maxlat;lat+=8){
for(int Ls=8;Ls<=8;Ls*=2){ // for(int Ls=8;Ls<=8;Ls*=2){
{ int Ls=12;
Coordinate latt_size ({lat*mpi_layout[0], Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1], lat*mpi_layout[1],
lat*mpi_layout[2], lat*mpi_layout[2],
lat*mpi_layout[3]}); lat*mpi_layout[3]});
std::cout << GridLogMessage<< latt_size <<std::endl;
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors; RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount(); RealD Nnode = Grid.NodeCount();
@ -184,9 +183,6 @@ public:
} }
timestat.statistics(t_time); timestat.statistics(t_time);
// for(int i=0;i<t_time.size();i++){
// std::cout << i<<" "<<t_time[i]<<std::endl;
// }
dbytes=dbytes*ppn; dbytes=dbytes*ppn;
double xbytes = dbytes*0.5; double xbytes = dbytes*0.5;
@ -200,8 +196,6 @@ public:
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " " << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
} }
} }
@ -227,14 +221,15 @@ public:
uint64_t NN; uint64_t NN;
uint64_t lmax=48; uint64_t lmax=32;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat) #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=4){ for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// NP= Grid.RankCount(); // NP= Grid.RankCount();
@ -270,191 +265,8 @@ public:
} }
}; };
#if 0
static double DWF5(int Ls,int L)
{
// RealD mass=0.1;
RealD M5 =1.8;
double mflops; static double DWF(int Ls,int L)
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
Coordinate internal;
if ( SHM == 1 ) internal = Coordinate({1,1,1,1});
else if ( SHM == 2 ) internal = Coordinate({2,1,1,1});
else if ( SHM == 4 ) internal = Coordinate({2,2,1,1});
else if ( SHM == 8 ) internal = Coordinate({2,2,2,1});
else assert(0);
Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (sFGrid);
LatticeFermion tmp (sFGrid);
std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
random(RNG5,src);
std::cout << GridLogMessage << "intialised random source" << std::endl;
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
LatticeFermion src_e (sFrbGrid);
LatticeFermion src_o (sFrbGrid);
LatticeFermion r_e (sFrbGrid);
LatticeFermion r_o (sFrbGrid);
LatticeFermion r_eo (sFGrid);
LatticeFermion err (sFGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
controls Cases [] = {
#ifdef AVX512
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 100;
uint64_t ncall = 1000;
double t0=usecond();
sFGrid->Barrier();
for(int i=0;i<nwarm;i++){
sDw.DhopEO(src_o,r_e,DaggerNo);
}
sFGrid->Barrier();
double t1=usecond();
sDw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
sDw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
sFGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
sDw.Report();
}
double robust = mflops_worst/mflops_best;;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
#endif
static double DWF(int Ls,int L, double & robust)
{ {
RealD mass=0.1; RealD mass=0.1;
RealD M5 =1.8; RealD M5 =1.8;
@ -471,37 +283,30 @@ public:
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L}); Coordinate local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount(); uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount(); uint64_t NN = TmpGrid->NodeCount();
NN_global=NN; NN_global=NN;
uint64_t SHM=NP/NN; uint64_t SHM=NP/NN;
Coordinate internal; Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
if ( SHM == 1 ) internal = Coordinate({1,1,1,1});
else if ( SHM == 2 ) internal = Coordinate({2,1,1,1});
else if ( SHM == 4 ) internal = Coordinate({2,2,1,1});
else if ( SHM == 8 ) internal = Coordinate({2,2,2,1});
else assert(0);
Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message //////////// ///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl; std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl; std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl; std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl; std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl; std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl; std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init //////////// ///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@ -514,76 +319,31 @@ public:
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl; std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
typedef DomainWallFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
///////// Source preparation //////////// ///////// Source preparation ////////////
LatticeFermion src (FGrid); random(RNG5,src); Gauge Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
LatticeFermion ref (FGrid); Fermion src (FGrid); random(RNG5,src);
LatticeFermion tmp (FGrid); Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
RealD N2 = 1.0/::sqrt(norm2(src));
std::cout<<GridLogMessage << "Normalising src "<< N2 <<std::endl;
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
{
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
{
autoView( Umu_v , Umu , CpuRead);
autoView( Umu5d_v, Umu5d, CpuWrite);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
ref = Zero();
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
LatticeFermion err (FGrid);
{ {
pickCheckerboard(Even,src_e,src); pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src); pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4; const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O "); std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
#endif
controls Cases [] = { controls Cases [] = {
#ifdef AVX512 { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
}; };
@ -596,15 +356,12 @@ public:
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 200; int nwarm = 10;
double t0=usecond(); double t0=usecond();
FGrid->Barrier(); FGrid->Barrier();
for(int i=0;i<nwarm;i++){ for(int i=0;i<nwarm;i++){
@ -612,9 +369,7 @@ public:
} }
FGrid->Barrier(); FGrid->Barrier();
double t1=usecond(); double t1=usecond();
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); uint64_t ncall = 50;
// if (ncall < 500) ncall = 500;
uint64_t ncall = 1000;
FGrid->Broadcast(0,&ncall,sizeof(ncall)); FGrid->Broadcast(0,&ncall,sizeof(ncall));
@ -651,24 +406,11 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
Dw.Report();
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-ref;
RealD absref = norm2(ref);
RealD abserr = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< abserr << " / " << absref<<std::endl;
assert(abserr<1.0e-4);
} }
robust = mflops_worst/mflops_best;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl; std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ; std::cout<<GridLogMessage ;
@ -682,8 +424,166 @@ public:
return mflops_best; return mflops_best;
} }
static double Staggered(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
typedef ImprovedStaggeredFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU3::HotConfiguration(RNG4,Umu);
typename Action::ImplParams params;
Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
///////// Source preparation ////////////
Fermion src (FGrid); random(RNG4,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
const int num_cases = 4;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
}; };
for(int c=0;c<num_cases;c++) {
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Ds.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1146.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
return mflops_best;
}
};
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
@ -698,62 +598,50 @@ int main (int argc, char ** argv)
int do_memory=1; int do_memory=1;
int do_comms =1; int do_comms =1;
int do_su3 =0;
int do_wilson=1;
int do_dwf =1;
if ( do_su3 ) {
// empty for now
}
#if 1
int sel=2; int sel=2;
Coordinate L_list({8,12,16,24}); std::vector<int> L_list({16,24,32});
#else
int sel=1;
Coordinate L_list({8,12});
#endif
int selm1=sel-1; int selm1=sel-1;
std::vector<double> robust_list;
std::vector<double> wilson; std::vector<double> wilson;
std::vector<double> dwf4; std::vector<double> dwf4;
std::vector<double> dwf5; std::vector<double> staggered;
if ( do_wilson ) {
int Ls=1; int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl; std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){ for(int l=0;l<L_list.size();l++){
double robust; wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
}
} }
int Ls=16; Ls=12;
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){ for(int l=0;l<L_list.size();l++){
double robust; double result = Benchmark::DWF(Ls,L_list[l]) ;
double result = Benchmark::DWF(Ls,L_list[l],robust) ;
dwf4.push_back(result); dwf4.push_back(result);
robust_list.push_back(robust);
}
} }
if ( do_dwf ) { /*
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::Staggered(L_list[l]) ;
staggered.push_back(result);
}
*/
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl; std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){ for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl; std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
} }
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
int NN=NN_global; int NN=NN_global;
if ( do_memory ) { if ( do_memory ) {
@ -770,7 +658,6 @@ int main (int argc, char ** argv)
Benchmark::Comms(); Benchmark::Comms();
} }
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@ -784,10 +671,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3); std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
Grid_finalize(); Grid_finalize();
} }

View File

@ -0,0 +1,176 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void benchDw(std::vector<int> & L, int Ls);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=12;
std::vector< std::vector<int> > latts;
#if 1
latts.push_back(std::vector<int> ({24,24,24,24}) );
latts.push_back(std::vector<int> ({48,24,24,24}) );
latts.push_back(std::vector<int> ({96,24,24,24}) );
latts.push_back(std::vector<int> ({96,48,24,24}) );
// latts.push_back(std::vector<int> ({96,48,48,24}) );
// latts.push_back(std::vector<int> ({96,48,48,48}) );
#else
// latts.push_back(std::vector<int> ({96,48,48,48}) );
latts.push_back(std::vector<int> ({96,96,96,192}) );
#endif
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne "<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
for (int l=0;l<latts.size();l++){
std::vector<int> latt4 = latts[l];
std::cout << GridLogMessage <<"\t";
for(int d=0;d<Nd;d++){
std::cout<<latt4[d]<<"x";
}
std::cout <<Ls<<"\t" ;
benchDw (latt4,Ls);
}
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
Grid_finalize();
}
void benchDw(std::vector<int> & latt4, int Ls)
{
/////////////////////////////////////////////////////////////////////////////////////
// for Nc=3
/////////////////////////////////////////////////////////////////////////////////////
// Dw : Ls*24*(7+48)= Ls*1320
//
// M5D: Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
// Meo: Ls*24*(7+48) + Ls*72 = Ls*1392
//
// Mee: 3*Ns*2*Nc*Ls // Chroma 6*N5*Nc*Ns
//
// LeemInv : 2*2*Nc*madd*Ls
// LeeInv : 2*2*Nc*madd*Ls
// DeeInv : 4*2*Nc*mul *Ls
// UeeInv : 2*2*Nc*madd*Ls
// UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
// QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
// Mpc => 1452*cbvol*2*Ls flops //
// => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
/////////////////////////////////////////////////////////////////////////////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// long unsigned int single_site_flops = 8*Nc*(7+16*Nc)*Ls;
long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;
long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls;
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
ColourMatrixF cm = ComplexF(1.0,0.0);
int ncall=300;
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
LatticeGaugeFieldF Umu(UGrid); Umu=Zero();
MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5);
LatticeFermionF src_o (FrbGrid); src_o=1.0;
LatticeFermionF r_o (FrbGrid); r_o=Zero();
int order =151;
SchurDiagOneOperator<MobiusFermionF,LatticeFermionF> Mpc(Dw);
Chebyshev<LatticeFermionF> Cheby(0.0,60.0,order);
{
Mpc.Mpc(src_o,r_o);
Mpc.Mpc(src_o,r_o);
Mpc.Mpc(src_o,r_o);
double t0=usecond();
for(int i=0;i<ncall;i++){
Mpc.Mpc(src_o,r_o);
}
double t1=usecond();
double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo so CB cancels.
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0);
flops=(single_site_quda_flops*volume*ncall);
std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";
// Cheby uses MpcDagMpc so 2x flops
for(int i=0;i<1;i++){
Cheby(Mpc,src_o,r_o);
t0=usecond();
Cheby(Mpc,src_o,r_o);
t1=usecond();
flops=(single_site_mpc_flops*volume*2*order);
std::cout <<"\t"<<flops/(t1-t0);
flops=(single_site_quda_flops*volume*2*order);
std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
std::cout <<std::endl;
}
}
// Dw.Report();
}

View File

@ -87,6 +87,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu); U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
} }
ref = Zero();
RealD mass=0.1; RealD mass=0.1;
RealD c1=9.0/8.0; RealD c1=9.0/8.0;

View File

@ -309,10 +309,21 @@ case ${ac_gen_scalar} in
esac esac
##################### Compiler dependent choices ##################### Compiler dependent choices
case ${CXX} in
#Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison
CXXBASE=${CXX}
CXXTEST=${CXX}
if echo "${CXX}" | grep -q "nvcc"; then
CXXTEST="nvcc"
fi
case ${CXXTEST} in
nvcc) nvcc)
CXX="nvcc -x cu " # CXX="nvcc -keep -v -x cu "
CXXLD="nvcc -link" # CXXLD="nvcc -v -link"
CXX="${CXXBASE} -x cu "
CXXLD="${CXXBASE} -link"
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
if test $ac_openmp = yes; then if test $ac_openmp = yes; then
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"

View File

@ -0,0 +1,84 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/io/Test_openqcd_io.cc
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
int main(int argc, char** argv) {
#if !defined(GRID_COMMS_NONE)
Grid_init(&argc, &argv);
auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
auto mpi_layout = GridDefaultMpi();
auto latt_size = GridDefaultLatt();
GridCartesian grid(latt_size, simd_layout, mpi_layout);
GridParallelRNG pRNG(&grid);
pRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
LatticeGaugeField Umu_ref(&grid);
LatticeGaugeField Umu_me(&grid);
LatticeGaugeField Umu_diff(&grid);
FieldMetaData header_ref;
FieldMetaData header_me;
Umu_ref = Zero();
Umu_me = Zero();
std::string file("/home/daniel/configs/openqcd/test_16x8_pbcn6");
if(GridCmdOptionExists(argv, argv + argc, "--config")) {
file = GridCmdOptionPayload(argv, argv + argc, "--config");
std::cout << "file: " << file << std::endl;
assert(!file.empty());
}
OpenQcdIOChromaReference::readConfiguration(Umu_ref, header_ref, file);
OpenQcdIO::readConfiguration(Umu_me, header_me, file);
std::cout << GridLogMessage << header_ref << std::endl;
std::cout << GridLogMessage << header_me << std::endl;
Umu_diff = Umu_ref - Umu_me;
// clang-format off
std::cout << GridLogMessage
<< "norm2(Umu_ref) = " << norm2(Umu_ref)
<< " norm2(Umu_me) = " << norm2(Umu_me)
<< " norm2(Umu_diff) = " << norm2(Umu_diff) << std::endl;
// clang-format on
Grid_finalize();
#endif
}

View File

@ -0,0 +1,126 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_innerproduct_norm.cc
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
const int nIter = 100;
// clang-format off
GridCartesian *Grid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
GridCartesian *Grid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
// clang-format on
GridParallelRNG pRNG_d(Grid_d);
GridParallelRNG pRNG_f(Grid_f);
std::vector<int> seeds_d({1, 2, 3, 4});
std::vector<int> seeds_f({5, 6, 7, 8});
pRNG_d.SeedFixedIntegers(seeds_d);
pRNG_f.SeedFixedIntegers(seeds_f);
// clang-format off
LatticeFermionD x_d(Grid_d); random(pRNG_d, x_d);
LatticeFermionD y_d(Grid_d); random(pRNG_d, y_d);
LatticeFermionF x_f(Grid_f); random(pRNG_f, x_f);
LatticeFermionF y_f(Grid_f); random(pRNG_f, y_f);
// clang-format on
GridStopWatch sw_ref;
GridStopWatch sw_res;
{ // double precision
ComplexD ip_d_ref, ip_d_res, diff_ip_d;
RealD norm2_d_ref, norm2_d_res, diff_norm2_d;
sw_ref.Reset();
sw_ref.Start();
for(int i = 0; i < nIter; ++i) {
ip_d_ref = innerProduct(x_d, y_d);
norm2_d_ref = norm2(x_d);
}
sw_ref.Stop();
sw_res.Reset();
sw_res.Start();
for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_d_res, norm2_d_res, x_d, y_d); }
sw_res.Stop();
diff_ip_d = ip_d_ref - ip_d_res;
diff_norm2_d = norm2_d_ref - norm2_d_res;
// clang-format off
std::cout << GridLogMessage << "Double: ip_ref = " << ip_d_ref << " ip_res = " << ip_d_res << " diff = " << diff_ip_d << std::endl;
std::cout << GridLogMessage << "Double: norm2_ref = " << norm2_d_ref << " norm2_res = " << norm2_d_res << " diff = " << diff_norm2_d << std::endl;
std::cout << GridLogMessage << "Double: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
// clang-format on
assert(diff_ip_d == 0.);
assert(diff_norm2_d == 0.);
std::cout << GridLogMessage << "Double: all checks passed" << std::endl;
}
{ // single precision
ComplexD ip_f_ref, ip_f_res, diff_ip_f;
RealD norm2_f_ref, norm2_f_res, diff_norm2_f;
sw_ref.Reset();
sw_ref.Start();
for(int i = 0; i < nIter; ++i) {
ip_f_ref = innerProduct(x_f, y_f);
norm2_f_ref = norm2(x_f);
}
sw_ref.Stop();
sw_res.Reset();
sw_res.Start();
for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_f_res, norm2_f_res, x_f, y_f); }
sw_res.Stop();
diff_ip_f = ip_f_ref - ip_f_res;
diff_norm2_f = norm2_f_ref - norm2_f_res;
// clang-format off
std::cout << GridLogMessage << "Single: ip_ref = " << ip_f_ref << " ip_res = " << ip_f_res << " diff = " << diff_ip_f << std::endl;
std::cout << GridLogMessage << "Single: norm2_ref = " << norm2_f_ref << " norm2_res = " << norm2_f_res << " diff = " << diff_norm2_f << std::endl;
std::cout << GridLogMessage << "Single: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
// clang-format on
assert(diff_ip_f == 0.);
assert(diff_norm2_f == 0.);
std::cout << GridLogMessage << "Single: all checks passed" << std::endl;
}
Grid_finalize();
}

View File

@ -238,11 +238,11 @@ void TestWhat(What & Ddwf,
RealD t1,t2; RealD t1,t2;
SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -218,11 +218,11 @@ int main (int argc, char ** argv)
RealD t1,t2; RealD t1,t2;
SchurDiagMooeeOperator<DomainWallEOFAFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<DomainWallEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2); HermOpEO.MpcDagMpc(chi_e, dchi_e);
HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2); HermOpEO.MpcDagMpc(chi_o, dchi_o);
HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2); HermOpEO.MpcDagMpc(phi_e, dphi_e);
HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2); HermOpEO.MpcDagMpc(phi_o, dphi_o);
pDce = innerProduct(phi_e, dchi_e); pDce = innerProduct(phi_e, dchi_e);
pDco = innerProduct(phi_o, dchi_o); pDco = innerProduct(phi_o, dchi_o);

View File

@ -216,11 +216,11 @@ int main (int argc, char ** argv)
SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -201,11 +201,11 @@ int main (int argc, char ** argv)
RealD t1,t2; RealD t1,t2;
SchurDiagMooeeOperator<GparityWilsonFermionR,FermionField> HermOpEO(Dw); SchurDiagMooeeOperator<GparityWilsonFermionR,FermionField> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -220,11 +220,11 @@ int main (int argc, char ** argv)
RealD t1,t2; RealD t1,t2;
SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<MobiusEOFAFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2); HermOpEO.MpcDagMpc(chi_e, dchi_e);
HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2); HermOpEO.MpcDagMpc(chi_o, dchi_o);
HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2); HermOpEO.MpcDagMpc(phi_e, dphi_e);
HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2); HermOpEO.MpcDagMpc(phi_o, dphi_o);
pDce = innerProduct(phi_e, dchi_e); pDce = innerProduct(phi_e, dchi_e);
pDco = innerProduct(phi_o, dchi_o); pDco = innerProduct(phi_o, dchi_o);

View File

@ -266,11 +266,11 @@ int main (int argc, char ** argv)
SchurDiagMooeeOperator<MobiusFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<MobiusFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -270,11 +270,11 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,phi_o,phi); pickCheckerboard(Odd ,phi_o,phi);
SchurDiagMooeeOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds); SchurDiagMooeeOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -290,11 +290,11 @@ int main (int argc, char ** argv)
pickCheckerboard(Odd ,phi_o,phi); pickCheckerboard(Odd ,phi_o,phi);
SchurDiagMooeeOperator<ImprovedStaggeredFermion5DR,FermionField> HermOpEO(Ds); SchurDiagMooeeOperator<ImprovedStaggeredFermion5DR,FermionField> HermOpEO(Ds);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -207,11 +207,11 @@ int main (int argc, char ** argv)
RealD t1,t2; RealD t1,t2;
SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw); SchurDiagMooeeOperator<WilsonFermionR,LatticeFermion> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -208,11 +208,11 @@ int main (int argc, char ** argv)
RealD t1,t2; RealD t1,t2;
SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw); SchurDiagMooeeOperator<WilsonTMFermionR,LatticeFermion> HermOpEO(Dw);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -280,11 +280,11 @@ int main (int argc, char ** argv)
SchurDiagMooeeOperator<ZMobiusFermionR,LatticeFermion> HermOpEO(Ddwf); SchurDiagMooeeOperator<ZMobiusFermionR,LatticeFermion> HermOpEO(Ddwf);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); HermOpEO.MpcDagMpc(chi_e,dchi_e);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); HermOpEO.MpcDagMpc(chi_o,dchi_o);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); HermOpEO.MpcDagMpc(phi_e,dphi_e);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); HermOpEO.MpcDagMpc(phi_o,dphi_o);
pDce = innerProduct(phi_e,dchi_e); pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o); pDco = innerProduct(phi_o,dchi_o);

View File

@ -70,9 +70,6 @@ int main (int argc, char ** argv)
SU3::HotConfiguration(RNG4,Umu); SU3::HotConfiguration(RNG4,Umu);
TrivialPrecon<LatticeFermion> simple;
PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-6,10000,simple,4,160);
ConjugateResidual<LatticeFermion> CR(1.0e-6,10000); ConjugateResidual<LatticeFermion> CR(1.0e-6,10000);
@ -86,15 +83,19 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<"* Solving with MdagM VPGCR "<<std::endl; std::cout<<GridLogMessage<<"* Solving with MdagM VPGCR "<<std::endl;
std::cout<<GridLogMessage<<"*********************************************************"<<std::endl; std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf); MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
TrivialPrecon<LatticeFermion> simple;
PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-6,10000,HermOp,simple,4,160);
result=Zero(); result=Zero();
PGCR(HermOp,src,result); PGCR(src,result);
std::cout<<GridLogMessage<<"*********************************************************"<<std::endl; std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
std::cout<<GridLogMessage<<"* Solving with g5-VPGCR "<<std::endl; std::cout<<GridLogMessage<<"* Solving with g5-VPGCR "<<std::endl;
std::cout<<GridLogMessage<<"*********************************************************"<<std::endl; std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> g5HermOp(Ddwf); Gamma5R5HermitianLinearOperator<DomainWallFermionR,LatticeFermion> g5HermOp(Ddwf);
PrecGeneralisedConjugateResidual<LatticeFermion> PGCR5(1.0e-6,10000,g5HermOp,simple,4,160);
result=Zero(); result=Zero();
PGCR(g5HermOp,src,result); PGCR5(src,result);
std::cout<<GridLogMessage<<"*********************************************************"<<std::endl; std::cout<<GridLogMessage<<"*********************************************************"<<std::endl;
std::cout<<GridLogMessage<<"* Solving with MdagM-CR "<<std::endl; std::cout<<GridLogMessage<<"* Solving with MdagM-CR "<<std::endl;

View File

@ -128,9 +128,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl; std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl; std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp4d.Report();
} }
Ds4d.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -148,9 +146,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl; std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl; std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
} }
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -167,10 +163,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl; std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl; std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
} }
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -187,9 +181,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl; std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl; std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
} }
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@ -206,9 +198,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl; std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl; std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
} }
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl; std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
@ -232,7 +222,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl; std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl; std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
// HermOp4d.Report();
} }

View File

@ -220,7 +220,7 @@ void run(const TestParams &params){
GridStopWatch CGTimer; GridStopWatch CGTimer;
typename RunParamsOuter::HermOpType<MobiusFermionD> HermOpEO_outer(D_outer); typename RunParamsOuter::template HermOpType<MobiusFermionD> HermOpEO_outer(D_outer);
CGTimer.Start(); CGTimer.Start();
CG_outer(HermOpEO_outer, src_o_outer, result_o_outer); CG_outer(HermOpEO_outer, src_o_outer, result_o_outer);