Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-15 14:27:06 +01:00)

Compare commits: ISC-freeze...feature/sc (2 commits: fa5d8702cb, 0064685bd7)

.travis.yml (17 lines changed)
@@ -19,8 +19,6 @@ before_install:
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi

 install:
-    - export CWD=`pwd`
-    - echo $CWD
     - export CC=$CC$VERSION
     - export CXX=$CXX$VERSION
     - echo $PATH

@@ -38,22 +36,11 @@ script:
     - ./bootstrap.sh
     - mkdir build
     - cd build
-    - mkdir lime
-    - cd lime
-    - mkdir build
-    - cd build
-    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
-    - tar xf lime-1.3.2.tar.gz
-    - cd lime-1.3.2
-    - ./configure --prefix=$CWD/build/lime/install
-    - make -j4
-    - make install
-    - cd $CWD/build
-    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install
+    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - echo make clean
-    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install
+    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
@@ -51,7 +51,7 @@ namespace Grid {

       virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
       virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
-      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
+      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
       virtual void HermOp(const Field &in, Field &out)=0;
     };

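
The HermOpAndNorm interface in this hunk returns, along with out = A in, the two reductions n1 = Re<in, A in> and n2 = |A in|^2 in a single operator call, which the solvers below consume. A minimal sketch of a conforming implementation, using a hypothetical diagonal operator on std::vector<double> rather than a Grid field (DiagOp, Field and the loop bodies are illustrative, not Grid code):

    #include <vector>

    using Field = std::vector<double>;
    using RealD = double;

    // Illustrative stand-in for a LinearOperatorBase implementation.
    struct DiagOp {
      Field d; // diagonal entries; the operator is trivially hermitian
      void Op(const Field &in, Field &out) {
        for (size_t i = 0; i < in.size(); i++) out[i] = d[i] * in[i];
      }
      void HermOpAndNorm(const Field &in, Field &out, RealD &n1, RealD &n2) {
        Op(in, out);           // hermitian, so HermOp coincides with Op
        n1 = 0; n2 = 0;
        for (size_t i = 0; i < in.size(); i++) {
          n1 += in[i] * out[i];  // n1 = <in, A in>
          n2 += out[i] * out[i]; // n2 = |A in|^2
        }
      }
    };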
@@ -309,59 +309,36 @@ namespace Grid {
     class SchurStaggeredOperator : public SchurOperatorBase<Field> {
     protected:
       Matrix &_Mat;
-      Field tmp;
-      RealD mass;
-      double tMpc;
-      double tIP;
-      double tMeo;
-      double taxpby_norm;
-      uint64_t ncall;
     public:
-      void Report(void)
-      {
-        std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
-        std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
-        std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
-        std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
-      }
-      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
-      {
-        assert( _Mat.isTrivialEE() );
-        mass = _Mat.Mass();
-        tMpc=0;
-        tIP =0;
-        tMeo=0;
-        taxpby_norm=0;
-        ncall=0;
-      }
+      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
       virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-        ncall++;
-        tMpc-=usecond();
+        GridLogIterative.TimingMode(1);
+        std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
         n2 = Mpc(in,out);
-        tMpc+=usecond();
-        tIP-=usecond();
+        std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
         ComplexD dot= innerProduct(in,out);
-        tIP+=usecond();
+        std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
         n1 = real(dot);
       }
       virtual void HermOp(const Field &in, Field &out){
-        ncall++;
-        tMpc-=usecond();
-        _Mat.Meooe(in,out);
-        _Mat.Meooe(out,tmp);
-        tMpc+=usecond();
-        taxpby_norm-=usecond();
-        axpby(out,-1.0,mass*mass,tmp,in);
-        taxpby_norm+=usecond();
+        std::cout << GridLogIterative << " HermOp "<<std::endl;
+        Mpc(in,out);
       }
       virtual RealD Mpc (const Field &in, Field &out) {
-        tMeo-=usecond();
+        Field tmp(in._grid);
+        Field tmp2(in._grid);
+
+        std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
+        _Mat.Mooee(in,out);
+        _Mat.Mooee(out,tmp);
+        std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
+
         _Mat.Meooe(in,out);
-        _Mat.Meooe(out,tmp);
-        tMeo+=usecond();
-        taxpby_norm-=usecond();
-        RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
-        taxpby_norm+=usecond();
+        _Mat.Meooe(out,tmp2);
+        std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
+
+        RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
+        std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
         return nn;
       }
       virtual RealD MpcDag (const Field &in, Field &out){
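
The right-hand Mpc above computes the even-odd preconditioned staggered operator out = Mee^2 in - Meo Moe in, and gets the norm for free from the final fused axpy_norm. A self-contained sketch of the same dataflow, with std::function stand-ins for the _Mat calls (all names illustrative, not the Grid implementation):

    #include <functional>
    #include <vector>

    using Field = std::vector<double>;

    // out = Mooee(Mooee(in)) - Meooe(Meooe(in)), returning |out|^2,
    // mirroring the Mpc in the hunk above.
    double Mpc(const std::function<void(const Field &, Field &)> &Mooee,
               const std::function<void(const Field &, Field &)> &Meooe,
               const Field &in, Field &out) {
      Field tmp(in.size()), tmp2(in.size());
      Mooee(in, out);
      Mooee(out, tmp);   // tmp  = Mee^2 in (= m^2 in for staggered)
      Meooe(in, out);
      Meooe(out, tmp2);  // tmp2 = Meo Moe in
      double nrm = 0.0;  // fused axpy_norm(out, -1.0, tmp2, tmp)
      for (size_t i = 0; i < out.size(); i++) {
        out[i] = tmp[i] - tmp2[i];
        nrm   += out[i] * out[i];
      }
      return nrm;
    }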
@@ -54,7 +54,6 @@ class ConjugateGradient : public OperatorFunction<Field> {

   void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
-
     psi.checkerboard = src.checkerboard;
     conformable(psi, src);

@@ -70,6 +69,7 @@ class ConjugateGradient : public OperatorFunction<Field> {

+
     Linop.HermOpAndNorm(psi, mmp, d, b);

     r = src - mmp;
     p = r;
@@ -96,44 +96,38 @@ class ConjugateGradient : public OperatorFunction<Field> {
               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

     GridStopWatch LinalgTimer;
-    GridStopWatch InnerTimer;
-    GridStopWatch AxpyNormTimer;
-    GridStopWatch LinearCombTimer;
     GridStopWatch MatrixTimer;
     GridStopWatch SolverTimer;

     SolverTimer.Start();
     int k;
-    for (k = 1; k <= MaxIterations*1000; k++) {
+    for (k = 1; k <= MaxIterations; k++) {
       c = cp;

       MatrixTimer.Start();
-      Linop.HermOp(p, mmp);
+      Linop.HermOpAndNorm(p, mmp, d, qq);
       MatrixTimer.Stop();

       LinalgTimer.Start();
+      // RealD    qqck = norm2(mmp);
+      // ComplexD dck  = innerProduct(p,mmp);

-      InnerTimer.Start();
-      ComplexD dc = innerProduct(p,mmp);
-      InnerTimer.Stop();
-      d = dc.real();
       a = c / d;
+      b_pred = a * (a * qq - d) / c;

-      AxpyNormTimer.Start();
       cp = axpy_norm(r, -a, mmp, r);
-      AxpyNormTimer.Stop();
       b = cp / c;

-      LinearCombTimer.Start();
-      parallel_for(int ss=0;ss<src._grid->oSites();ss++){
-        vstream(psi[ss], a * p[ss] + psi[ss]);
-        vstream(p  [ss], b * p[ss] + r[ss]);
-      }
-      LinearCombTimer.Stop();
+      // Fuse these loops ; should be really easy
+      psi = a * p + psi;
+      p = p * b + r;
       LinalgTimer.Stop();

       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                 << " residual " << cp << " target " << rsq << std::endl;
+      std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
+      std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;

       // Stopping condition
       if (cp <= rsq) {

@@ -154,9 +148,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
       std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
       std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

       if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

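
For reference, the loop above is plain conjugate gradient: a = <r,r>/<p,Ap>, r <- r - a Ap, b = <r,r>_new/<r,r>_old, psi <- psi + a p, p <- b p + r. A compact sketch on std::vector<double>, with HermOp standing in for Linop.HermOpAndNorm (a minimal illustration, not the Grid implementation):

    #include <functional>
    #include <vector>

    using Field = std::vector<double>;

    // r = a*x + y, returning |r|^2 in one pass (mirrors Grid's axpy_norm).
    static double axpy_norm(Field &r, double a, const Field &x, const Field &y) {
      double nrm = 0.0;
      for (size_t i = 0; i < r.size(); i++) {
        r[i] = a * x[i] + y[i];
        nrm += r[i] * r[i];
      }
      return nrm;
    }

    static double inner(const Field &x, const Field &y) {
      double d = 0.0;
      for (size_t i = 0; i < x.size(); i++) d += x[i] * y[i];
      return d;
    }

    // Returns the iteration count, or -1 if MaxIterations is exhausted.
    int cg(const std::function<void(const Field &, Field &)> &HermOp,
           const Field &src, Field &psi, double Tolerance, int MaxIterations) {
      Field r(src.size()), mmp(src.size()), p;
      HermOp(psi, mmp);
      double cp  = axpy_norm(r, -1.0, mmp, src); // r = src - A psi
      p = r;
      double rsq = Tolerance * Tolerance * inner(src, src);
      for (int k = 1; k <= MaxIterations; k++) {
        double c = cp;
        HermOp(p, mmp);                // mmp = A p
        double d = inner(p, mmp);
        double a = c / d;
        cp = axpy_norm(r, -a, mmp, r); // r <- r - a A p
        double b = cp / c;
        for (size_t i = 0; i < psi.size(); i++) {
          psi[i] += a * p[i];          // psi <- psi + a p
          p[i]    = b * p[i] + r[i];   // p   <- b p + r
        }
        if (cp <= rsq) return k;       // converged
      }
      return -1;
    }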
@@ -43,7 +43,6 @@ namespace Grid {
 public:
   RealD   Tolerance;
   Integer MaxIterations;
-  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   int verbose;
   MultiShiftFunction shifts;

@@ -164,16 +163,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  for(int s=0;s<nshift;s++) {
    axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
  }

-  ///////////////////////////////////////
-  // Timers
-  ///////////////////////////////////////
-  GridStopWatch AXPYTimer;
-  GridStopWatch ShiftTimer;
-  GridStopWatch QRTimer;
-  GridStopWatch MatrixTimer;
-  GridStopWatch SolverTimer;
-  SolverTimer.Start();

  // Iteration loop
  int k;

@@ -181,9 +171,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  for (k=1;k<=MaxIterations;k++){

    a = c /cp;
-    AXPYTimer.Start();
    axpy(p,a,p,r);
-    AXPYTimer.Stop();

    // Note to self - direction ps is iterated seperately
    // for each shift. Does not appear to have any scope

@@ -192,7 +180,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
    // However SAME r is used. Could load "r" and update
    // ALL ps[s]. 2/3 Bandwidth saving
    // New Kernel: Load r, vector of coeffs, vector of pointers ps
-    AXPYTimer.Start();
    for(int s=0;s<nshift;s++){
      if ( ! converged[s] ) {
        if (s==0){

@@ -203,34 +190,22 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
        }
      }
    }
-    AXPYTimer.Stop();

    cp=c;
-    MatrixTimer.Start();
-    //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
-    // The below is faster on KNL
-    Linop.HermOp(p,mmp);
-    d=real(innerProduct(p,mmp));
-
-    MatrixTimer.Stop();
+    Linop.HermOpAndNorm(p,mmp,d,qq);

-    AXPYTimer.Start();
    axpy(mmp,mass[0],p,mmp);
-    AXPYTimer.Stop();
    RealD rn = norm2(p);
    d += rn*mass[0];

    bp=b;
    b=-cp/d;

-    AXPYTimer.Start();
    c=axpy_norm(r,b,mmp,r);
-    AXPYTimer.Stop();

    // Toggle the recurrence history
    bs[0] = b;
    iz = 1-iz;
-    ShiftTimer.Start();
    for(int s=1;s<nshift;s++){
      if((!converged[s])){
        RealD z0 = z[s][1-iz];

@@ -240,7 +215,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
        bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
      }
    }
-    ShiftTimer.Stop();

    for(int s=0;s<nshift;s++){
      int ss = s;

@@ -283,9 +257,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector

    if ( all_converged ){

-      SolverTimer.Stop();
-
-
      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;

@@ -298,19 +269,8 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
        RealD cn = norm2(src);
        std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }

-      std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
-      std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tAXPY       " << AXPYTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tMarix      " << MatrixTimer.Elapsed() <<std::endl;
-      std::cout << GridLogMessage << "\tShift      " << ShiftTimer.Elapsed() <<std::endl;
-
-      IterationsToComplete = k;
-
      return;
    }

  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
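
Note the shifted apply inside the loop above: the one matrix call is corrected on the fly, mmp = A p + mass[0] p, with d updated to <p,Ap> + mass[0]|p|^2, so a single HermOp serves all shifts. A sketch of just that correction step, assuming Ap = A p and d_Ap = <p,Ap> are already in hand (names illustrative):

    #include <vector>

    using Field = std::vector<double>;

    // mmp = Ap + shift*p ; returns the corrected d = <p,Ap> + shift*|p|^2.
    double shifted_apply(const Field &Ap, const Field &p, double shift,
                         Field &mmp, double d_Ap) {
      double rn = 0.0;
      for (size_t i = 0; i < p.size(); i++) {
        mmp[i] = Ap[i] + shift * p[i]; // axpy(mmp, shift, p, Ap)
        rn    += p[i] * p[i];          // rn = norm2(p)
      }
      return d_Ap + shift * rn;        // d += rn*mass[0]
    }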
@@ -57,9 +57,8 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i

  parallel_region
  {
-    std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
+    std::vector < vobj > B(Nm); // Thread private

    parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
      for(int j=j0; j<j1; ++j) B[j]=0.;

@@ -244,11 +244,19 @@ namespace Grid {

  template<class sobj,class vobj> strong_inline
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    return axpy_norm_fast(ret,a,x,y);
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
+    conformable(x,y);
+    axpy(ret,a,x,y);
+    return norm2(ret);
  }
  template<class sobj,class vobj> strong_inline
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    return axpby_norm_fast(ret,a,b,x,y);
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
+    conformable(x,y);
+    axpby(ret,a,b,x,y);
+    return norm2(ret); // FIXME implement parallel norm in ss loop
  }

 }
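
The two variants in this hunk trade memory traffic for simplicity: the right-hand side performs axpy/axpby and then a separate norm2, i.e. two sweeps over the field (the FIXME notes exactly this), while the *_fast forms it replaces fuse the update and the reduction into one sweep. A sketch of the difference on plain arrays (illustrative, not the Grid kernels):

    #include <vector>

    // Two passes: write ret, then re-read it to accumulate the norm.
    double axpby_norm_two_pass(std::vector<double> &ret, double a, double b,
                               const std::vector<double> &x,
                               const std::vector<double> &y) {
      for (size_t i = 0; i < ret.size(); i++) ret[i] = a * x[i] + b * y[i];
      double nrm = 0.0;
      for (size_t i = 0; i < ret.size(); i++) nrm += ret[i] * ret[i];
      return nrm;
    }

    // One pass: each element is written and accumulated while still in register.
    double axpby_norm_fused(std::vector<double> &ret, double a, double b,
                            const std::vector<double> &x,
                            const std::vector<double> &y) {
      double nrm = 0.0;
      for (size_t i = 0; i < ret.size(); i++) {
        double t = a * x[i] + b * y[i];
        ret[i] = t;
        nrm   += t * t;
      }
      return nrm;
    }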
@@ -33,7 +33,7 @@ namespace Grid {
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-  auto nrm = innerProduct(arg,arg);
+  ComplexD nrm = innerProduct(arg,arg);
   return std::real(nrm);
 }

@@ -43,84 +43,31 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
+  scalar_type  nrm;

   GridBase *grid = left._grid;
-  const int pad = 8;
+  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());

-  ComplexD  inner;
-  Vector<ComplexD> sumarray(grid->SumArraySize()*pad);

   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
     int nwork, mywork, myoff;
     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);

-    decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
+    decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
     for(int ss=myoff;ss<mywork+myoff; ss++){
-      vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
+      vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
     }
-    // All threads sum across SIMD; reduce serial work at end
-    // one write per cacheline with streaming store
-    ComplexD tmp = Reduce(TensorRemove(vinner)) ;
-    vstream(sumarray[thr*pad],tmp);
+    sumarray[thr]=TensorRemove(vnrm) ;
   }

-  inner=0.0;
+  vector_type vvnrm; vvnrm=zero;  // sum across threads
   for(int i=0;i<grid->SumArraySize();i++){
-    inner = inner+sumarray[i*pad];
+    vvnrm = vvnrm+sumarray[i];
   }
-  right._grid->GlobalSum(inner);
-  return inner;
+  nrm = Reduce(vvnrm);// sum across simd
+  right._grid->GlobalSum(nrm);
+  return nrm;
 }

-/////////////////////////
-// Fast axpby_norm
-// z = a x + b y
-// return norm z
-/////////////////////////
-template<class sobj,class vobj> strong_inline RealD
-axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
-{
-  sobj one(1.0);
-  return axpby_norm_fast(z,a,one,x,y);
-}
-
-template<class sobj,class vobj> strong_inline RealD
-axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
-{
-  const int pad = 8;
-  z.checkerboard = x.checkerboard;
-  conformable(z,x);
-  conformable(x,y);
-
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_typeD vector_type;
-  RealD  nrm;
-
-  GridBase *grid = x._grid;
-
-  Vector<RealD> sumarray(grid->SumArraySize()*pad);
-
-  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
-    int nwork, mywork, myoff;
-    GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
-
-    // private to thread; sub summation
-    decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
-    for(int ss=myoff;ss<mywork+myoff; ss++){
-      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
-      vnrm = vnrm + innerProductD(tmp,tmp);
-      vstream(z._odata[ss],tmp);
-    }
-    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
-  }
-
-  nrm = 0.0; // sum across threads; linear in thread count but fast
-  for(int i=0;i<grid->SumArraySize();i++){
-    nrm = nrm+sumarray[i*pad];
-  }
-  z._grid->GlobalSum(nrm);
-  return nrm;
-}

 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
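
The left-hand innerProduct pads the per-thread partial-sum array (pad = 8 entries of 16-byte ComplexD, a 128-byte stride) so each thread's streaming store lands on its own cache line and the short serial tail reduction never false-shares. A sketch of that layout with OpenMP and std::complex, assuming 64/128-byte cache lines (padded_inner and the types are illustrative stand-ins for the Grid SIMD machinery):

    #include <complex>
    #include <omp.h>
    #include <vector>

    std::complex<double> padded_inner(const std::vector<std::complex<double>> &l,
                                      const std::vector<std::complex<double>> &r) {
      const int pad = 8;               // 8 * 16 bytes = 128-byte stride, as in the removed code
      int nthr = omp_get_max_threads();
      std::vector<std::complex<double>> sumarray(nthr * pad, 0.0);
    #pragma omp parallel
      {
        int thr = omp_get_thread_num();
        std::complex<double> part = 0.0; // private to thread; sub summation
    #pragma omp for
        for (long ss = 0; ss < (long)l.size(); ss++)
          part += std::conj(l[ss]) * r[ss];
        sumarray[thr * pad] = part;      // one write per cache line
      }
      std::complex<double> inner = 0.0;  // short serial tail over nthr entries
      for (int i = 0; i < nthr; i++) inner += sumarray[i * pad];
      return inner;
    }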
@@ -158,19 +158,10 @@ namespace Grid {
  // tens of seconds per trajectory so this is clean in all reasonable cases,
  // and margin of safety is orders of magnitude.
  // We could hack Sitmo to skip in the higher order words of state if necessary
-  //
-  // Replace with 2^30 ; avoid problem on large volumes
-  //
  /////////////////////////////////////////////////////////////////////////////////////
  // uint64_t skip = site+1; // Old init Skipped then drew.  Checked compat with faster init
-  const int shift = 30;

  uint64_t skip = site;
-  skip = skip<<shift;
-
-  assert((skip >> shift)==site); // check for overflow
+  skip = skip<<40;

  eng.discard(skip);
  // std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
 }
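
The left-hand side reserves 2^30 draws per site and asserts the shift round-trips, where the right-hand side shifts by 40 bits unchecked; with a 64-bit counter, site << 40 wraps silently once site reaches 2^24, which large volumes can approach. A sketch of the guarded skip-ahead (stream_offset is a hypothetical name; the value feeds eng.discard as above):

    #include <cassert>
    #include <cstdint>

    uint64_t stream_offset(uint64_t site) {
      const int shift = 30;            // 2^30 draws reserved per site
      uint64_t skip = site << shift;
      assert((skip >> shift) == site); // trips if site overflowed 64 bits
      return skip;                     // pass to eng.discard(skip)
    }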
@@ -358,10 +358,17 @@ PARALLEL_CRITICAL

    if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
-      std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
+      std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< " and offset " << offset << std::endl;
      ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
      ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
      ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);

+      MPI_Offset os;
+      MPI_File_get_position(fh, &os);
+      MPI_File_get_byte_offset(fh, os, &disp);
+      offset = disp;
+
      MPI_File_close(&fh);
      MPI_Type_free(&fileArray);
      MPI_Type_free(&localArray);

@@ -370,20 +377,22 @@ PARALLEL_CRITICAL
 #endif
    } else {
      std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
-                << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+                << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
      std::ifstream fin;
      fin.open(file, std::ios::binary | std::ios::in);
      if (control & BINARYIO_MASTER_APPEND)
      {
        fin.seekg(-sizeof(fobj), fin.end);
      }
      else
      {
        fin.seekg(offset + myrank * lsites * sizeof(fobj));
      }
      fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));

      assert(fin.fail() == 0);
-      fin.close();
+      offset = fin.tellg();
+      fin.close();
    }
    timer.Stop();

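
The additions thread the file offset back out of the MPI path: after the collective read through the file view, the current position is converted to an absolute byte displacement so the next record starts where this one ended. A sketch of that bookkeeping using only standard MPI-2 I/O calls (the wrapper name and elided error handling are illustrative):

    #include <mpi.h>

    // Open `file`, read one record collectively through a file view starting
    // at byte displacement `disp`, and return the byte offset just past it.
    MPI_Offset read_and_advance(MPI_Comm comm, const char *file,
                                MPI_Datatype etype, MPI_Datatype filetype,
                                MPI_Datatype localtype, void *buf,
                                MPI_Offset disp) {
      MPI_File fh;
      MPI_Status status;
      MPI_File_open(comm, file, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
      MPI_File_set_view(fh, disp, etype, filetype, "native", MPI_INFO_NULL);
      MPI_File_read_all(fh, buf, 1, localtype, &status);
      MPI_Offset os;
      MPI_File_get_position(fh, &os);          // position, in etype units
      MPI_File_get_byte_offset(fh, os, &disp); // convert to absolute bytes
      MPI_File_close(&fh);
      return disp;                             // caller stores as new offset
    }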
@@ -638,6 +647,11 @@ PARALLEL_CRITICAL
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
             nersc_csum,scidac_csuma,scidac_csumb);

+    std::cout << GridLogMessage << "RNG file nersc_checksum   " << std::hex << nersc_csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
+
    timer.Start();
    parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
      std::vector<RngStateType> tmp(RngStateCount);

@@ -656,6 +670,11 @@ PARALLEL_CRITICAL
      serial.SetState(tmp,0);
    }

+    std::cout << GridLogMessage << "RNG file checksum  t " << std::hex << nersc_csum_tmp << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksuma t " << std::hex << scidac_csuma_tmp << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksumb t " << std::hex << scidac_csumb_tmp << std::dec << std::endl;
+
    nersc_csum   = nersc_csum   + nersc_csum_tmp;
    scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
    scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;

@@ -706,6 +725,11 @@ PARALLEL_CRITICAL

    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
             nersc_csum,scidac_csuma,scidac_csumb);

+    std::cout << GridLogMessage << "RNG file checksum  " << std::hex << nersc_csum << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
+
    iodata.resize(1);
    {
      std::vector<RngStateType> tmp(RngStateCount);

@@ -715,6 +739,11 @@ PARALLEL_CRITICAL
    IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
             nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);

+    std::cout << GridLogMessage << "RNG file checksum  t " << std::hex << nersc_csum_tmp << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksuma t " << std::hex << scidac_csuma_tmp << std::dec << std::endl;
+    std::cout << GridLogMessage << "RNG file checksumb t " << std::hex << scidac_csumb_tmp << std::dec << std::endl;
+
    nersc_csum   = nersc_csum   + nersc_csum_tmp;
    scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
    scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
@@ -182,11 +182,6 @@ class GridLimeReader : public BinaryIO {
  {
    filename= _filename;
    File = fopen(filename.c_str(), "r");
-    if (File == nullptr)
-    {
-      std::cerr << "cannot open file '" << filename << "'" << std::endl;
-      abort();
-    }
    LimeR = limeCreateReader(File);
  }
  /////////////////////////////////////////////
@@ -243,10 +238,52 @@ class GridLimeReader : public BinaryIO {
        // Verify checksums
        /////////////////////////////////////////////
        assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
+        std::cout << GridLogMessage<< " readLimeLatticeBinaryObject checksums match ! " <<std::endl;
        return;
      }
    }
  }

+  ////////////////////////////////////////////
+  // Read an RNG object and verify checksum
+  ////////////////////////////////////////////
+  void readLimeRNGObject(GridSerialRNG &sRNG, GridParallelRNG &pRNG,std::string record_name)
+  {
+    scidacChecksum scidacChecksum_;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+
+    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
+      uint64_t file_bytes =limeReaderBytes(LimeR);
+
+      // std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
+      // std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
+
+      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
+
+        const int RngStateCount = GridSerialRNG::RngStateCount;
+        typedef std::array<typename GridSerialRNG::RngStateType,RngStateCount> RNGstate;
+
+        uint64_t PayloadSize = sizeof(RNGstate) * (pRNG._grid->_gsites+1);
+
+        assert(PayloadSize == file_bytes);// Must match or user error
+        uint64_t offset= ftello(File);
+        std::cout << GridLogDebug << " ReadLatticeObject from offset "<<offset << std::endl;
+        BinaryIO::readRNG(sRNG, pRNG, filename, offset, nersc_csum,scidac_csuma,scidac_csumb);
+
+        /////////////////////////////////////////////
+        // Insist checksum is next record
+        /////////////////////////////////////////////
+        readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
+
+        /////////////////////////////////////////////
+        // Verify checksums
+        /////////////////////////////////////////////
+        assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
+        std::cout << GridLogMessage<< " readLimeRNGObject checksums match ! " <<std::endl;
+        return;
+      }
+    }
+  }
  ////////////////////////////////////////////
  // Read a generic serialisable object
  ////////////////////////////////////////////
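
The new readLimeRNGObject follows the usual c-lime scan: advance record by record, match the record type by name, read the payload at the current file position, then insist the scidacChecksum record follows. A reduced sketch of just the scan step against c-lime's reader API (find_record is a hypothetical helper; lime.h is the USQCD c-lime header, included as a C header):

    #include <cstdio>
    #include <cstring>
    #include <string>
    extern "C" {
    #include <lime.h>
    }

    // Return the byte offset of the payload of the first record whose type
    // matches record_name, or -1 if no such record exists.
    long find_record(FILE *File, LimeReader *LimeR,
                     const std::string &record_name) {
      while (limeReaderNextRecord(LimeR) == LIME_SUCCESS) {
        if (!strncmp(limeReaderType(LimeR), record_name.c_str(),
                     record_name.size())) {
          return ftell(File); // payload begins at the current position
        }
      }
      return -1;
    }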
@@ -434,6 +471,78 @@ class GridLimeWriter : public BinaryIO
      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
    }
  }

+  ////////////////////////////////////////////////////
+  // Write an rng object and csum
+  // This routine is Collectively called by all nodes
+  // in communicator used by the field._grid
+  ////////////////////////////////////////////////////
+  void writeLimeRNGObject(GridSerialRNG &sRNG, GridParallelRNG &pRNG, std::string record_name)
+  {
+    GridBase *grid = pRNG._grid;
+    assert(boss_node == pRNG._grid->IsBoss() );
+
+    const int RngStateCount = GridSerialRNG::RngStateCount;
+    typedef std::array<typename GridSerialRNG::RngStateType,RngStateCount> RNGstate;
+
+    ////////////////////////////////////////////
+    // Create record header
+    ////////////////////////////////////////////
+    int err;
+    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
+    uint64_t PayloadSize = sizeof(RNGstate) * (grid->_gsites+1);
+    std::cout << GridLogDebug << "Computed payload size " << PayloadSize << std::endl;
+    if ( boss_node ) {
+      createLimeRecordHeader(record_name, 0, 0, PayloadSize);
+      fflush(File);
+    }
+
+    ////////////////////////////////////////////////
+    // Check all nodes agree on file position
+    ////////////////////////////////////////////////
+    uint64_t offset1;
+    if ( boss_node ) {
+      offset1 = ftello(File);
+    }
+    grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
+
+    ///////////////////////////////////////////
+    // The above is collective. Write by other means into the binary record
+    ///////////////////////////////////////////
+    uint64_t offset = offset1;
+    BinaryIO::writeRNG(sRNG, pRNG,filename, offset, nersc_csum,scidac_csuma,scidac_csumb);
+
+    ///////////////////////////////////////////
+    // Wind forward and close the record
+    ///////////////////////////////////////////
+    if ( boss_node ) {
+      fseek(File,0,SEEK_END);
+      uint64_t offset2 = ftello(File);
+      std::cout << GridLogDebug << " now at offset "<<offset2 << std::endl;
+      assert( (offset2-offset1) == PayloadSize);
+    }
+
+    /////////////////////////////////////////////////////////////
+    // Check MPI-2 I/O did what we expect to file
+    /////////////////////////////////////////////////////////////
+
+    if ( boss_node ) {
+      err=limeWriterCloseRecord(LimeW);  assert(err>=0);
+    }
+    ////////////////////////////////////////
+    // Write checksum element, propagaing forward from the BinaryIO
+    // Always pair a checksum with a binary object, and close message
+    ////////////////////////////////////////
+    scidacChecksum checksum;
+    std::stringstream streama; streama << std::hex << scidac_csuma;
+    std::stringstream streamb; streamb << std::hex << scidac_csumb;
+    checksum.suma= streama.str();
+    checksum.sumb= streamb.str();
+    if ( boss_node ) {
+      writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
+    }
+  }
 };

 class ScidacWriter : public GridLimeWriter {
@@ -450,6 +559,27 @@ class ScidacWriter : public GridLimeWriter {
      writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
    }
  }

+  void writeScidacRNGRecord(GridSerialRNG &sRNG, GridParallelRNG &pRNG)
+  {
+    GridBase *grid = pRNG._grid;
+    FieldMetaData header;
+
+    header.floating_point = "IEEE64BIG";
+    header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
+    GridMetaData(grid,header);
+    MachineCharacteristics(header);
+
+    //////////////////////////////////////////////
+    // Fill the Lime file record by record
+    //////////////////////////////////////////////
+    if ( this->boss_node ) {
+      writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
+    }
+    // Collective call
+    writeLimeRNGObject(sRNG, pRNG,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
+  }

  ////////////////////////////////////////////////
  // Write generic lattice field in scidac format
  ////////////////////////////////////////////////

@@ -478,6 +608,7 @@ class ScidacWriter : public GridLimeWriter {
    // Collective call
    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
  }
+
 };

@@ -491,8 +622,27 @@ class ScidacReader : public GridLimeReader {
    readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
    readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
  }

  ////////////////////////////////////////////////
-  // Write generic lattice field in scidac format
+  // Read RNGobject in scidac format
+  ////////////////////////////////////////////////
+  void readScidacRNGRecord(GridSerialRNG &sRNG, GridParallelRNG &pRNG)
+  {
+    GridBase * grid = pRNG._grid;
+
+    ////////////////////////////////////////
+    // fill the Grid header
+    ////////////////////////////////////////
+    FieldMetaData header;
+
+    //////////////////////////////////////////////
+    // Fill the Lime file record by record
+    //////////////////////////////////////////////
+    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
+    readLimeRNGObject(sRNG, pRNG, std::string(ILDG_BINARY_DATA));
+  }
+
+  ////////////////////////////////////////////////
+  // Read generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
@@ -49,8 +49,7 @@ inline double usecond(void) {

 typedef std::chrono::system_clock          GridClock;
 typedef std::chrono::time_point<GridClock> GridTimePoint;
-typedef std::chrono::milliseconds          GridMillisecs;
-typedef std::chrono::microseconds          GridTime;
+typedef std::chrono::milliseconds          GridTime;
 typedef std::chrono::microseconds          GridUsecs;

 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)

@@ -58,11 +57,6 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
   stream << time.count()<<" ms";
   return stream;
 }
-inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
-{
-  stream << time.count()<<" usec";
-  return stream;
-}

 class GridStopWatch {
 private:
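
The hunk narrows GridTime from microseconds to milliseconds and drops the microseconds stream operator. A minimal stopwatch along the same std::chrono lines, independent of Grid (illustrative only):

    #include <chrono>
    #include <iostream>

    class StopWatch {
      using Clock = std::chrono::system_clock;
      Clock::time_point start_;
      std::chrono::milliseconds accum_{0};
    public:
      void Start() { start_ = Clock::now(); }
      void Stop()  {
        accum_ += std::chrono::duration_cast<std::chrono::milliseconds>(
            Clock::now() - start_);
      }
      std::chrono::milliseconds Elapsed() const { return accum_; }
    };

    int main() {
      StopWatch sw;
      sw.Start();
      volatile double x = 0;
      for (long i = 0; i < 10000000; i++) x += 1.0 / (i + 1); // dummy work
      sw.Stop();
      std::cout << sw.Elapsed().count() << " ms\n";
    }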
@@ -63,12 +63,9 @@ namespace Grid {
    virtual RealD  M    (const FermionField &in, FermionField &out)=0;
    virtual RealD  Mdag (const FermionField &in, FermionField &out)=0;

-    // Query the even even properties to make algorithmic decisions
-    virtual int   ConstEE(void)     { return 1; }; // clover returns zero as EE depends on gauge field
-    virtual int   isTrivialEE(void) { return 0; };
-    virtual RealD Mass(void)        { return 0.0; };
-
    // half checkerboard operaions
+    virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
+
    virtual void   Meooe       (const FermionField &in, FermionField &out)=0;
    virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0;
    virtual void   Mooee       (const FermionField &in, FermionField &out)=0;
@@ -764,12 +764,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
    inline void loadLinkElement(Simd &reg, ref &memory) {
      reg = memory;
    }

-    inline void InsertGaugeField(DoubledGaugeField &U_ds,
-                                 const GaugeLinkField &U,int mu)
-    {
-      PokeIndex<LorentzIndex>(U_ds, U, mu);
-    }
    inline void DoubleStore(GridBase *GaugeGrid,
                            DoubledGaugeField &UUUds, // for Naik term
                            DoubledGaugeField &Uds,

@@ -808,10 +803,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        U    = U    *phases;
        Udag = Udag *phases;

-        InsertGaugeField(Uds,U,mu);
-        InsertGaugeField(Uds,Udag,mu+4);
-        // PokeIndex<LorentzIndex>(Uds, U, mu);
-        // PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
+        PokeIndex<LorentzIndex>(Uds, U, mu);
+        PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);

        // 3 hop based on thin links. Crazy huh ?
        U  = PeekIndex<LorentzIndex>(Uthin, mu);

@@ -823,8 +816,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        UUU    = UUU    *phases;
        UUUdag = UUUdag *phases;

-        InsertGaugeField(UUUds,UUU,mu);
-        InsertGaugeField(UUUds,UUUdag,mu+4);
+        PokeIndex<LorentzIndex>(UUUds, UUU, mu);
+        PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4);

      }
    }
@@ -917,23 +910,6 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
      mac(&phi(), &UU(), &chi());
    }

-    inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
-    {
-      GridBase *GaugeGrid = U_ds._grid;
-      parallel_for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
-
-        SiteScalarGaugeLink   ScalarU;
-        SiteDoubledGaugeField ScalarUds;
-
-        std::vector<int> lcoor;
-        GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
-        peekLocalSite(ScalarUds, U_ds, lcoor);
-
-        peekLocalSite(ScalarU, U, lcoor);
-        ScalarUds(mu) = ScalarU();
-
-      }
-    }
    inline void DoubleStore(GridBase *GaugeGrid,
                            DoubledGaugeField &UUUds, // for Naik term
                            DoubledGaugeField &Uds,

@@ -975,8 +951,23 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        U    = U    *phases;
        Udag = Udag *phases;

-        InsertGaugeField(Uds,U,mu);
-        InsertGaugeField(Uds,Udag,mu+4);
+        for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+          SiteScalarGaugeLink   ScalarU;
+          SiteDoubledGaugeField ScalarUds;
+
+          std::vector<int> lcoor;
+          GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+          peekLocalSite(ScalarUds, Uds, lcoor);
+
+          peekLocalSite(ScalarU, U, lcoor);
+          ScalarUds(mu) = ScalarU();
+
+          peekLocalSite(ScalarU, Udag, lcoor);
+          ScalarUds(mu + 4) = ScalarU();
+
+          pokeLocalSite(ScalarUds, Uds, lcoor);
+        }

        // 3 hop based on thin links. Crazy huh ?
        U  = PeekIndex<LorentzIndex>(Uthin, mu);

@@ -988,8 +979,24 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
        UUU    = UUU    *phases;
        UUUdag = UUUdag *phases;

-        InsertGaugeField(UUUds,UUU,mu);
-        InsertGaugeField(UUUds,UUUdag,mu+4);
+        for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
+          SiteScalarGaugeLink   ScalarU;
+          SiteDoubledGaugeField ScalarUds;
+
+          std::vector<int> lcoor;
+          GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
+
+          peekLocalSite(ScalarUds, UUUds, lcoor);
+
+          peekLocalSite(ScalarU, UUU, lcoor);
+          ScalarUds(mu) = ScalarU();
+
+          peekLocalSite(ScalarU, UUUdag, lcoor);
+          ScalarUds(mu + 4) = ScalarU();
+
+          pokeLocalSite(ScalarUds, UUUds, lcoor);
+        }

      }
    }
@@ -44,7 +44,6 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
 template <class Impl>
 ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
                                                          RealD _mass,
-                                                         RealD _c1, RealD _c2,RealD _u0,
                                                          const ImplParams &p)
     : Kernels(p),
       _grid(&Fgrid),

@@ -63,16 +62,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
      UUUmuOdd(&Hgrid) ,
      _tmp(&Hgrid)
 {
-  int vol4;
-  int LLs=1;
-  c1=_c1;
-  c2=_c2;
-  u0=_u0;
-  vol4= _grid->oSites();
-  Stencil.BuildSurfaceList(LLs,vol4);
-  vol4= _cbgrid->oSites();
-  StencilEven.BuildSurfaceList(LLs,vol4);
-  StencilOdd.BuildSurfaceList(LLs,vol4);
 }

 template <class Impl>

@@ -80,10 +69,22 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
                                                         GridRedBlackCartesian &Hgrid, RealD _mass,
                                                         RealD _c1, RealD _c2,RealD _u0,
                                                         const ImplParams &p)
-    : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
+    : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
 {
+  c1=_c1;
+  c2=_c2;
+  u0=_u0;
   ImportGauge(_Uthin,_Ufat);
 }
+template <class Impl>
+ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
+                                                         GridRedBlackCartesian &Hgrid, RealD _mass,
+                                                         const ImplParams &p)
+    : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
+{
+  ImportGaugeSimple(_Utriple,_Ufat);
+}

 ////////////////////////////////////////////////////////////
 // Momentum space propagator should be
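
The new constructors delegate to the core (Fgrid, Hgrid, mass) constructor and layer the c1/c2/u0 assignment or the ImportGaugeSimple call on top, instead of duplicating member setup in each overload. A stripped-down sketch of the C++11 delegating-constructor pattern in play (values and names illustrative):

    #include <iostream>

    struct Fermion {
      double mass, c1 = 0, c2 = 0, u0 = 1;
      Fermion(double _mass) : mass(_mass) {}            // core constructor
      Fermion(double _mass, double _c1, double _c2, double _u0)
          : Fermion(_mass) {                            // delegate, then extend
        c1 = _c1; c2 = _c2; u0 = _u0;
      }
    };

    int main() {
      Fermion f(0.1, 1.0, -1.0 / 24.0, 1.0);
      std::cout << f.mass << " " << f.c1 << "\n";
    }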
@@ -97,6 +98,11 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
 // of above link to implmement fourier based solver.
 ////////////////////////////////////////////////////////////
 template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)
+{
+  ImportGauge(_Uthin,_Uthin);
+};
+template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
 {
   /////////////////////////////////////////////////////////////////

@@ -119,20 +125,6 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl
    PokeIndex<LorentzIndex>(Umu, -U, mu+4);

  }
-  CopyGaugeCheckerboards();
-}
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
-{
-
-  Umu   = _U;
-  UUUmu = _UUU;
-  CopyGaugeCheckerboards();
-}
-
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
-{
  pickCheckerboard(Even, UmuEven,  Umu);
  pickCheckerboard(Odd,  UmuOdd ,  Umu);
  pickCheckerboard(Even, UUUmuEven,UUUmu);
@@ -168,7 +160,10 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
    PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
  }

-  CopyGaugeCheckerboards();
+  pickCheckerboard(Even, UmuEven, Umu);
+  pickCheckerboard(Odd,  UmuOdd , Umu);
+  pickCheckerboard(Even, UUUmuEven, UUUmu);
+  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
 }

 /////////////////////////////
@@ -327,7 +322,6 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
-  DhopCalls+=2;
   conformable(in._grid, _grid);  // verifies full grid
   conformable(in._grid, out._grid);

@@ -338,7 +332,6 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
-  DhopCalls+=1;
   conformable(in._grid, _cbgrid);    // verifies half grid
   conformable(in._grid, out._grid);  // drops the cb check

@@ -350,7 +343,6 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
-  DhopCalls+=1;
   conformable(in._grid, _cbgrid);    // verifies half grid
   conformable(in._grid, out._grid);  // drops the cb check

@@ -382,193 +374,25 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
                                                   DoubledGaugeField &U,
                                                   DoubledGaugeField &UUU,
                                                   const FermionField &in,
-                                                  FermionField &out, int dag)
+                                                  FermionField &out, int dag) {
-{
-#ifdef GRID_OMP
-  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
-  else
-#endif
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
-}
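The deleted dispatcher hides the two communication strategies behind one entry point and only offers the overlapped path when an OpenMP runtime is available. The same selection pattern in miniature (names invented for illustration, not Grid code):

    #include <iostream>

    enum CommsMode { CommsAndCompute, CommsThenCompute };

    static void DhopOverlapped() { std::cout << "overlap comms with interior compute\n"; }
    static void DhopSerial()     { std::cout << "halo exchange first, then compute\n"; }

    static void Dhop(int Comms, bool haveThreads) {
      // Overlapping needs spare threads to drive the communications,
      // hence the GRID_OMP guard in the deleted code.
      if (haveThreads && Comms == CommsAndCompute) DhopOverlapped();
      else                                         DhopSerial();
    }

    int main() {
      Dhop(CommsAndCompute, true);   // overlapped path
      Dhop(CommsAndCompute, false);  // falls back to serial comms
      return 0;
    }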
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-                                                                 DoubledGaugeField &U,
-                                                                 DoubledGaugeField &UUU,
-                                                                 const FermionField &in,
-                                                                 FermionField &out, int dag)
-{
-#ifdef GRID_OMP
-  Compressor compressor;
-  int len = U._grid->oSites();
-  const int LLs = 1;
-
-  DhopTotalTime -= usecond();
-
-  DhopFaceTime -= usecond();
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-  DhopFaceTime += usecond();
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  DhopComputeTime -= usecond();
-#pragma omp parallel
-  {
-    int tid      = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms   = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = len;
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn     = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn     = chunk;
-      }
-
-      // do the compute
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-          // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
-        }
-      }
-    } else {
-      st.CommunicateThreaded();
-    }
-  }
-  DhopComputeTime += usecond();
-
-  // First to enter, last to leave timing
-  DhopFaceTime -= usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime += usecond();
-
-  DhopComputeTime2 -= usecond();
-  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
-    }
-  } else {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
-    }
-  }
-  DhopComputeTime2 += usecond();
-#else
-  assert(0);
-#endif
-}
-
-
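The deleted overlapped path hands `ncomms` threads to communication and statically partitions the `len` interior sites over the remaining workers; the first `rem` workers take one extra site. A standalone sketch (sizes invented) that checks this chunk-plus-remainder partition is contiguous, disjoint, and exact:

    #include <cassert>

    int main() {
      const int n = 103, nthreads = 8;
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int covered = 0;
      for (int ttid = 0; ttid < nthreads; ttid++) {
        int myblock = (ttid < rem) ? ttid*chunk + ttid : ttid*chunk + rem;
        int myn     = (ttid < rem) ? chunk+1           : chunk;
        assert(myblock == covered);  // blocks are contiguous and disjoint
        covered += myn;
      }
      assert(covered == n);          // every site is covered exactly once
      return 0;
    }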
-template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
-                                                             DoubledGaugeField &U,
-                                                             DoubledGaugeField &UUU,
-                                                             const FermionField &in,
-                                                             FermionField &out, int dag)
-{
   assert((dag == DaggerNo) || (dag == DaggerYes));
-
-  DhopTotalTime -= usecond();
-
-  DhopCommTime -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
-  DhopCommTime += usecond();
-
-  DhopComputeTime -= usecond();
   if (dag == DaggerYes) {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
+    PARALLEL_FOR_LOOP
+    for (int sss = 0; sss < in._grid->oSites(); sss++) {
       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
     }
   } else {
-    parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
+    PARALLEL_FOR_LOOP
+    for (int sss = 0; sss < in._grid->oSites(); sss++) {
       Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
     }
   }
-  DhopComputeTime += usecond();
-  DhopTotalTime += usecond();
 };
 
-////////////////////////////////////////////////////////////////
-// Reporting
-////////////////////////////////////////////////////////////////
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::Report(void)
-{
-  std::vector<int> latt = GridDefaultLatt();
-  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP = _grid->_Nprocessors;
-  RealD NN = _grid->NodeCount();
-
-  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : "
-            << DhopCalls << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : "
-            << DhopTotalTime / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : "
-            << DhopCommTime / DhopCalls << " us" << std::endl;
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime /Calls       : "
-            << DhopComputeTime / DhopCalls << " us" << std::endl;
-
-  // Average the compute time
-  _grid->GlobalSum(DhopComputeTime);
-  DhopComputeTime/=NP;
-
-  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
-
-  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
-  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
-
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"     << std::endl; Stencil.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven" << std::endl; StencilEven.Report();
-  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd"  << std::endl; StencilOdd.Report();
-}
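The rate estimate above assumes 1154 floating-point operations per site per Dhop application, divides by the measured time in microseconds (so the result is already in Mflop/s), and halves the count because a red-black call only touches one parity's sites. A standalone sketch of the same bookkeeping; the 1154 flop count is taken from the deleted code, the other inputs are invented:

    #include <cstdio>

    int main() {
      double latt[4] = {16, 16, 16, 16};
      double volume = 1; for (int mu = 0; mu < 4; mu++) volume *= latt[mu];

      double DhopCalls       = 1000;
      double DhopComputeTime = 2.0e6;   // microseconds, invented
      double flopsPerSite    = 1154;    // per-site flop count used above

      // flops/usec == Mflop/s; /2 because red-black touches half the sites
      double mflops = flopsPerSite*volume*DhopCalls/DhopComputeTime/2;
      std::printf("Average mflops/s per call : %g\n", mflops);
      return 0;
    }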
-template<class Impl>
-void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
-{
-  DhopCalls       = 0;
-  DhopTotalTime   = 0;
-  DhopCommTime    = 0;
-  DhopComputeTime = 0;
-  DhopFaceTime    = 0;
-
-  Stencil.ZeroCounters();
-  StencilEven.ZeroCounters();
-  StencilOdd.ZeroCounters();
-}
-
-
 ////////////////////////////////////////////////////////
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////
 
@@ -49,18 +49,6 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
   FermionField _tmp;
   FermionField &tmp(void) { return _tmp; }
 
-  ////////////////////////////////////////
-  // Performance monitoring
-  ////////////////////////////////////////
-  void Report(void);
-  void ZeroCounters(void);
-  double DhopTotalTime;
-  double DhopCalls;
-  double DhopCommTime;
-  double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
-
   ///////////////////////////////////////////////////////////////
   // Implement the abstract base
   ///////////////////////////////////////////////////////////////
@@ -117,34 +105,25 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
 
   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
                     const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
-                    const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
-                    const FermionField &in, FermionField &out, int dag);
 
-  //////////////////////////////////////////////////////////////////////////
-  // Grid own interface Constructor
-  //////////////////////////////////////////////////////////////////////////
+  // Constructor
   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
                            GridRedBlackCartesian &Hgrid, RealD _mass,
-                           RealD _c1, RealD _c2,RealD _u0,
+                           RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
                            const ImplParams &p = ImplParams());
 
-  //////////////////////////////////////////////////////////////////////////
-  // MILC constructor no gauge fields
-  //////////////////////////////////////////////////////////////////////////
-  ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
-                           RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
-                           const ImplParams &p = ImplParams());
+  ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
+                           GridRedBlackCartesian &Hgrid, RealD _mass,
+                           const ImplParams &p = ImplParams());
+
+  ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
+                           const ImplParams &p = ImplParams());
 
   // DoubleStore impl dependent
-  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
-  void ImportGauge      (const GaugeField &_Uthin ,const GaugeField &_Ufat);
-  void ImportGaugeSimple(const GaugeField &_UUU   ,const GaugeField &_U);
-  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  DoubledGaugeField &GetU(void)   { return Umu ; } ;
-  DoubledGaugeField &GetUUU(void) { return UUUmu; };
-  void CopyGaugeCheckerboards(void);
+  void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
+  void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
+  void ImportGauge(const GaugeField &_Uthin);
 
   ///////////////////////////////////////////////////////////////
   // Data members require to support the functionality
@@ -153,8 +132,7 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
   // protected:
  public:
   // any other parameters of action ???
-  virtual int   isTrivialEE(void) { return 1; };
-  virtual RealD Mass(void) { return mass; }
   RealD mass;
   RealD u0;
   RealD c1;
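The default couplings restored here, c1 = 9/8 and c2 = -1/24, are the standard Naik coefficients: combining the one-link and three-link differences cancels the O(a^2) error of the naive lattice derivative. A standalone numerical check of that cancellation (plain C++, not Grid):

    // Verify that (9/8)*[f(x+h)-f(x-h)]/(2h) - (1/24)*[f(x+3h)-f(x-3h)]/(2h)
    // reproduces f'(x) to O(h^4), versus O(h^2) for the naive difference.
    #include <cmath>
    #include <cstdio>

    int main() {
      double x = 0.5, h = 0.1;
      auto f = [](double y) { return std::sin(y); };
      double exact    = std::cos(x);
      double naive    = (f(x+h) - f(x-h)) / (2*h);
      double improved = ((9.0/8.0) *(f(x+h)   - f(x-h))
                       - (1.0/24.0)*(f(x+3*h) - f(x-3*h))) / (2*h);
      std::printf("naive error    %.2e\n", std::fabs(naive - exact));
      std::printf("improved error %.2e\n", std::fabs(improved - exact));
      return 0;
    }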
@@ -41,7 +41,8 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3,
 
 // 5d lattice for DWF.
 template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid,
+ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
+                                                             GridCartesian         &FiveDimGrid,
                                                              GridRedBlackCartesian &FiveDimRedBlackGrid,
                                                              GridCartesian         &FourDimGrid,
                                                              GridRedBlackCartesian &FourDimRedBlackGrid,
@@ -120,74 +121,16 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
     assert(FiveDimGrid._simd_layout[0] ==1);
 
   }
-  int LLs  = FiveDimGrid._rdimensions[0];
-  int vol4 = FourDimGrid.oSites();
-  Stencil.BuildSurfaceList(LLs,vol4);
 
-  vol4=FourDimRedBlackGrid.oSites();
-  StencilEven.BuildSurfaceList(LLs,vol4);
-  StencilOdd.BuildSurfaceList(LLs,vol4);
-}
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
-{
-  pickCheckerboard(Even, UmuEven,  Umu);
-  pickCheckerboard(Odd,  UmuOdd ,  Umu);
-  pickCheckerboard(Even, UUUmuEven,UUUmu);
-  pickCheckerboard(Odd,  UUUmuOdd, UUUmu);
-}
-template<class Impl>
-ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
-                                                             GridCartesian         &FiveDimGrid,
-                                                             GridRedBlackCartesian &FiveDimRedBlackGrid,
-                                                             GridCartesian         &FourDimGrid,
-                                                             GridRedBlackCartesian &FourDimRedBlackGrid,
-                                                             RealD _mass,
-                                                             RealD _c1,RealD _c2, RealD _u0,
-                                                             const ImplParams &p) :
-  ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
-                             FourDimGrid,FourDimRedBlackGrid,
-                             _mass,_c1,_c2,_u0,p)
-{
+  // Allocate the required comms buffer
   ImportGauge(_Uthin,_Ufat);
 }
 
-///////////////////////////////////////////////////
-// For MILC use; pass three link U's and 1 link U
-///////////////////////////////////////////////////
 template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
+void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)
 {
-  /////////////////////////////////////////////////////////////////
-  // Trivial import; phases and fattening and such like preapplied
-  /////////////////////////////////////////////////////////////////
-  for (int mu = 0; mu < Nd; mu++) {
-
-    auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
-    Impl::InsertGaugeField(UUUmu,U,mu);
-
-    U = adj( Cshift(U, mu, -3));
-    Impl::InsertGaugeField(UUUmu,-U,mu+4);
-
-    U = PeekIndex<LorentzIndex>(_Ufat, mu);
-    Impl::InsertGaugeField(Umu,U,mu);
-
-    U = adj( Cshift(U, mu, -1));
-    Impl::InsertGaugeField(Umu,-U,mu+4);
-
-  }
-  CopyGaugeCheckerboards();
-}
+  ImportGauge(_Uthin,_Uthin);
+};
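The deleted MILC import builds each backward-direction link by shifting the forward link back along mu and taking the adjoint, with the minus sign of the staggered convention: U_-mu(x) = -U_mu^dag(x-mu), with shift -1 for the one-link field and -3 for the Naik field. A 1-D toy of that construction with complex "links" (plain C++, not the Grid API):

    #include <complex>
    #include <cstdio>
    #include <vector>

    using cplx = std::complex<double>;

    // Shift the forward link field back by `disp` sites (periodically)
    // and conjugate-negate, mirroring adj(Cshift(U, mu, -disp)) with -U.
    std::vector<cplx> backward(const std::vector<cplx> &U, int disp) {
      int L = (int)U.size();
      std::vector<cplx> Uback(L);
      for (int x = 0; x < L; x++) {
        int xm = ((x - disp) % L + L) % L;  // site x - disp, periodic
        Uback[x] = -std::conj(U[xm]);       // -U^dag(x - disp)
      }
      return Uback;
    }

    int main() {
      std::vector<cplx> U = {{0,1},{1,0},{0,-1},{1,0}};
      auto U1 = backward(U, 1);             // one-link term
      auto U3 = backward(U, 3);             // Naik three-link term
      std::printf("U_-mu(0)  = (%g,%g)\n", U1[0].real(), U1[0].imag());
      std::printf("U_-3mu(0) = (%g,%g)\n", U3[0].real(), U3[0].imag());
      return 0;
    }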
-template <class Impl>
-void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
-{
-  /////////////////////////////////////////////////////////////////
-  // Trivial import; phases and fattening and such like preapplied
-  /////////////////////////////////////////////////////////////////
-  Umu   = _U;
-  UUUmu = _UUU;
-  CopyGaugeCheckerboards();
-}
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
 {
@@ -216,7 +159,10 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
   }
 
-  CopyGaugeCheckerboards();
+  pickCheckerboard(Even, UmuEven, Umu);
+  pickCheckerboard(Odd,  UmuOdd , Umu);
+  pickCheckerboard(Even, UUUmuEven, UUUmu);
+  pickCheckerboard(Odd,  UUUmuOdd,  UUUmu);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
@@ -277,162 +223,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
   assert(0);
 }
 
-/*CHANGE */
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
-                                                    DoubledGaugeField & U,DoubledGaugeField & UUU,
-                                                    const FermionField &in, FermionField &out,int dag)
-{
-#ifdef GRID_OMP
-  if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
-  else
-#endif
-    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
-}
-
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
-                                                                   DoubledGaugeField & U,DoubledGaugeField & UUU,
-                                                                   const FermionField &in, FermionField &out,int dag)
-{
-#ifdef GRID_OMP
-  // assert((dag==DaggerNo) ||(dag==DaggerYes));
-
-  Compressor compressor;
-
-  int LLs = in._grid->_rdimensions[0];
-  int len = U._grid->oSites();
-
-  DhopFaceTime-=usecond();
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  // st.HaloExchangeOptGather(in,compressor); // Wilson compressor
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  DhopFaceTime+=usecond();
-
-  double ctime=0;
-  double ptime=0;
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Ugly explicit thread mapping introduced for OPA reasons.
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      double start = usecond();
-      nthreads -= ncomms;
-      int ttid  = tid - ncomms;
-      int n     = U._grid->oSites(); // 4d vol
-      int chunk = n / nthreads;
-      int rem   = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-
-      // do the compute
-      if (dag == DaggerYes) {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          int sU = ss;
-          // Interior = 1; Exterior = 0; must implement for staggered
-          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0);
-        }
-      } else {
-        for (int ss = myblock; ss < myblock+myn; ++ss) {
-          // Interior = 1; Exterior = 0;
-          int sU = ss;
-          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0);
-        }
-      }
-      ptime = usecond() - start;
-    } else {
-      double start = usecond();
-      st.CommunicateThreaded();
-      ctime = usecond() - start;
-    }
-  }
-  DhopCommTime   += ctime;
-  DhopComputeTime+= ptime;
-
-  // First to enter, last to leave timing
-  st.CollateThreads();
-
-  DhopFaceTime-=usecond();
-  st.CommsMerge(compressor);
-  DhopFaceTime+=usecond();
-
-  DhopComputeTime2-=usecond();
-  if (dag == DaggerYes) {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);
-    }
-  } else {
-    int sz=st.surface_list.size();
-    parallel_for (int ss = 0; ss < sz; ss++) {
-      int sU = st.surface_list[ss];
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);
-    }
-  }
-  DhopComputeTime2+=usecond();
-#else
-  assert(0);
-#endif
-}
-
-template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
-                                                               DoubledGaugeField & U,DoubledGaugeField & UUU,
-                                                               const FermionField &in, FermionField &out,int dag)
-{
-  Compressor compressor;
-  int LLs = in._grid->_rdimensions[0];
-
-  //double t1=usecond();
-  DhopTotalTime -= usecond();
-  DhopCommTime  -= usecond();
-  st.HaloExchange(in,compressor);
-  DhopCommTime  += usecond();
-
-  DhopComputeTime -= usecond();
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  if (dag == DaggerYes) {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU=ss;
-      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU, in, out);
-    }
-  } else {
-    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
-      int sU=ss;
-      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
-    }
-  }
-  DhopComputeTime += usecond();
-  DhopTotalTime   += usecond();
-  //double t2=usecond();
-  //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
-  //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
-  //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
-  //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
-}
-/*CHANGE END*/
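In the deleted 5-D path, compute threads and communication threads time themselves independently, and `reduction(max:...)` keeps the slowest member of each group, which is the wall-clock cost of the overlapped phase. A standalone OpenMP sketch of that pattern (work loads invented; compile with -fopenmp):

    #include <cstdio>
    #include <omp.h>

    int main() {
      double ctime = 0, ptime = 0;
      const int ncomms = 1;  // threads reserved for "comms"
    #pragma omp parallel reduction(max:ctime) reduction(max:ptime)
      {
        int tid = omp_get_thread_num();
        double start = omp_get_wtime();
        if (tid >= ncomms) {
          for (volatile int i = 0; i < 1000000; i++) {}  // stand-in compute
          ptime = omp_get_wtime() - start;
        } else {
          for (volatile int i = 0; i < 300000;  i++) {}  // stand-in comms
          ctime = omp_get_wtime() - start;
        }
      }
      std::printf("comms %g s, compute %g s\n", ctime, ptime);
      return 0;
    }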
 
-/* ORG
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                                     DoubledGaugeField & U,DoubledGaugeField & UUU,
@@ -464,7 +254,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
   DhopComputeTime += usecond();
   DhopTotalTime   += usecond();
 }
-*/
 
 template<class Impl>
@@ -547,9 +336,6 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
   DhopTotalTime   = 0;
   DhopCommTime    = 0;
   DhopComputeTime = 0;
-  DhopFaceTime    = 0;
 
 
   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();
@@ -64,8 +64,6 @@ namespace QCD {
   double DhopCalls;
   double DhopCommTime;
   double DhopComputeTime;
-  double DhopComputeTime2;
-  double DhopFaceTime;
 
   ///////////////////////////////////////////////////////////////
   // Implement the abstract base
@@ -121,27 +119,7 @@ namespace QCD {
                     FermionField &out,
                     int dag);
 
-  void DhopInternalOverlappedComms(StencilImpl & st,
-                                   LebesgueOrder &lo,
-                                   DoubledGaugeField &U,
-                                   DoubledGaugeField &UUU,
-                                   const FermionField &in,
-                                   FermionField &out,
-                                   int dag);
-
-  void DhopInternalSerialComms(StencilImpl & st,
-                               LebesgueOrder &lo,
-                               DoubledGaugeField &U,
-                               DoubledGaugeField &UUU,
-                               const FermionField &in,
-                               FermionField &out,
-                               int dag);
-
-
   // Constructors
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // Grid internal interface -- Thin link and fat link, with coefficients
-  ////////////////////////////////////////////////////////////////////////////////////////////////
   ImprovedStaggeredFermion5D(GaugeField &_Uthin,
                              GaugeField &_Ufat,
                              GridCartesian &FiveDimGrid,
@@ -149,37 +127,17 @@ namespace QCD {
                              GridCartesian &FourDimGrid,
                              GridRedBlackCartesian &FourDimRedBlackGrid,
                              double _mass,
-                             RealD _c1, RealD _c2,RealD _u0,
+                             RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
                              const ImplParams &p= ImplParams());
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
-  ////////////////////////////////////////////////////////////////////////////////////////////////
-  ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
-                             GridRedBlackCartesian &FiveDimRedBlackGrid,
-                             GridCartesian &FourDimGrid,
-                             GridRedBlackCartesian &FourDimRedBlackGrid,
-                             double _mass,
-                             RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
-                             const ImplParams &p= ImplParams());
-
-  // DoubleStore gauge field in operator
-  void ImportGauge      (const GaugeField &_Uthin ) { assert(0); }
-  void ImportGauge      (const GaugeField &_Uthin ,const GaugeField &_Ufat);
-  void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
-  void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  // Give a reference; can be used to do an assignment or copy back out after import
-  // if Carleton wants to cache them and not use the ImportSimple
-  DoubledGaugeField &GetU(void)   { return Umu ; } ;
-  DoubledGaugeField &GetUUU(void) { return UUUmu; };
-  void CopyGaugeCheckerboards(void);
+  // DoubleStore
+  void ImportGauge(const GaugeField &_U);
+  void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
 
   ///////////////////////////////////////////////////////////////
   // Data members require to support the functionality
   ///////////////////////////////////////////////////////////////
  public:
 
-  virtual int   isTrivialEE(void) { return 1; };
-  virtual RealD Mass(void) { return mass; }
 
   GridBase *_FourDimGrid;
   GridBase *_FourDimRedBlackGrid;
@@ -32,241 +32,223 @@ namespace Grid {
 namespace QCD {
 
 int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
-int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
-
-#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)   \
-  SE = st.GetEntry(ptype, Dir+skew, sF);           \
-  if (SE->_is_local ) {                            \
-    if (SE->_permute) {                            \
-      chi_p = &chi;                                \
-      permute(chi, in._odata[SE->_offset], ptype); \
-    } else {                                       \
-      chi_p = &in._odata[SE->_offset];             \
-    }                                              \
-  } else {                                         \
-    chi_p = &buf[SE->_offset];                     \
-  }                                                \
-  multLink(Uchi, U._odata[sU], *chi_p, Dir);
-
-#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
-  SE = st.GetEntry(ptype, Dir+skew, sF);             \
-  if (SE->_is_local ) {                              \
-    if (SE->_permute) {                              \
-      chi_p = &chi;                                  \
-      permute(chi, in._odata[SE->_offset], ptype);   \
-    } else {                                         \
-      chi_p = &in._odata[SE->_offset];               \
-    }                                                \
-  } else if ( st.same_node[Dir] ) {                  \
-    chi_p = &buf[SE->_offset];                       \
-  }                                                  \
-  if (SE->_is_local || st.same_node[Dir] ) {         \
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);       \
-  }
-
-#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
-  SE = st.GetEntry(ptype, Dir+skew, sF);             \
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {   \
-    nmu++;                                           \
-    chi_p = &buf[SE->_offset];                       \
-    multLink(Uchi, U._odata[sU], *chi_p, Dir);       \
-  }
 
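Each deleted stencil "leg" resolves its neighbour from one of three places: a local site whose SIMD lanes must be permuted, a local site read directly, or the halo receive buffer; the _INT/_EXT variants split the same logic into interior-only and exterior-only passes for the comms overlap. A minimal sketch of the three-way gather with simplified stand-in types (not Grid's):

    #include <vector>

    struct StencilEntry { bool is_local; bool permute; int offset; };

    double gather_leg(const StencilEntry &SE,
                      const std::vector<double> &in,     // local field
                      const std::vector<double> &halo) { // comms receive buffer
      if (SE.is_local) {
        double v = in[SE.offset];
        if (SE.permute) { /* a real kernel reshuffles SIMD lanes here */ }
        return v;
      }
      return halo[SE.offset];  // neighbour was gathered from another node
    }

    int main() {
      std::vector<double> in = {1, 2, 3}, halo = {9};
      StencilEntry local {true,  false, 1};
      StencilEntry remote{false, false, 0};
      return (gather_leg(local,  in, halo) == 2.0 &&
              gather_leg(remote, in, halo) == 9.0) ? 0 : 1;
    }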
 template <class Impl>
 StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 
-////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////
 // Generic implementation; move to different file?
-// Int, Ext, Int+Ext cases for comms overlap
-////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////
 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
-                                             DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                             SiteSpinor *buf, int LLs, int sU,
-                                             const FermionField &in, FermionField &out, int dag) {
+void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                                           SiteSpinor *buf, int sF,
+                                           int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
   const SiteSpinor *chi_p;
   SiteSpinor chi;
   SiteSpinor Uchi;
   StencilEntry *SE;
   int ptype;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0;
-    GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
-    GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
-    skew=8;
-    GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
-    if ( dag ) {
-      Uchi = - Uchi;
-    }
-    vstream(out._odata[sF], Uchi);
-  }
-};
-
-///////////////////////////////////////////////////
-// Only contributions from interior of our node
-///////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
-                                                DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                                SiteSpinor *buf, int LLs, int sU,
-                                                const FermionField &in, FermionField &out,int dag) {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  int skew ;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0;
-    Uchi=zero;
-    GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
-    skew=8;
-    GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
-    if ( dag ) {
-      Uchi = - Uchi;
-    }
-    vstream(out._odata[sF], Uchi);
-  }
-};
-
-///////////////////////////////////////////////////
-// Only contributions from exterior of our node
-///////////////////////////////////////////////////
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
-                                                DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                                SiteSpinor *buf, int LLs, int sU,
-                                                const FermionField &in, FermionField &out,int dag) {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  int skew ;
-
-  for(int s=0;s<LLs;s++){
-    int sF=LLs*sU+s;
-    skew = 0;
-    Uchi=zero;
-    GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
-    skew=8;
-    GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
-    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
-
-    if ( nmu ) {
-      if ( dag ) {
-        out._odata[sF] = out._odata[sF] - Uchi;
-      } else {
-        out._odata[sF] = out._odata[sF] + Uchi;
-      }
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////
-// Driving / wrapping routine to select right kernel
-////////////////////////////////////////////////////////////////////////////////////
+  int skew = 0;
+  if (threeLink) skew=8;
+  ///////////////////////////
+  // Xp
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Xp+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
+
+  ///////////////////////////
+  // Yp
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Yp+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
+
+  ///////////////////////////
+  // Zp
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Zp+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
+
+  ///////////////////////////
+  // Tp
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Tp+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
+
+  ///////////////////////////
+  // Xm
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Xm+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
+
+  ///////////////////////////
+  // Ym
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Ym+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
+
+  ///////////////////////////
+  // Zm
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Zm+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
+
+  ///////////////////////////
+  // Tm
+  ///////////////////////////
+  SE = st.GetEntry(ptype, Tm+skew, sF);
+  if (SE->_is_local) {
+    if (SE->_permute) {
+      chi_p = &chi;
+      permute(chi, in._odata[SE->_offset], ptype);
+    } else {
+      chi_p = &in._odata[SE->_offset];
+    }
+  } else {
+    chi_p = &buf[SE->_offset];
+  }
+  Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
+
+  vstream(out, Uchi);
+};
 
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
                                          SiteSpinor *buf, int LLs, int sU,
-                                         const FermionField &in, FermionField &out,
-                                         int interior,int exterior)
-{
+                                         const FermionField &in, FermionField &out) {
+  SiteSpinor naik;
+  SiteSpinor naive;
+  int oneLink  =0;
+  int threeLink=1;
   int dag=1;
-  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
-};
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                      SiteSpinor *buf, int LLs, int sU,
-                                      const FermionField &in, FermionField &out,
-                                      int interior,int exterior)
-{
-  int dag=0;
-  DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
+  switch(Opt) {
+#ifdef AVX512
+  //FIXME; move the sign into the Asm routine
+  case OptInlineAsm:
+    DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
+    for(int s=0;s<LLs;s++) {
+      int sF=s+LLs*sU;
+      out._odata[sF]=-out._odata[sF];
+    }
+    break;
+#endif
+  case OptHandUnroll:
+    DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
+    break;
+  case OptGeneric:
+    for(int s=0;s<LLs;s++){
+      int sF=s+LLs*sU;
+      DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
+      DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
+      out._odata[sF] =-naive-naik;
+    }
+    break;
+  default:
+    std::cout<<"Oops Opt = "<<Opt<<std::endl;
+    assert(0);
+    break;
+  }
 };
 
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
                                       SiteSpinor *buf, int LLs,
-                                      int sU, const FermionField &in, FermionField &out,
-                                      int dag,int interior,int exterior)
+                                      int sU, const FermionField &in, FermionField &out)
 {
+  int oneLink  =0;
+  int threeLink=1;
+  SiteSpinor naik;
+  SiteSpinor naive;
+  int dag=0;
   switch(Opt) {
 #ifdef AVX512
   case OptInlineAsm:
-    if ( interior && exterior ) {
-      DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else {
-      std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
-      assert(0);
-    }
+    DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
     break;
 #endif
   case OptHandUnroll:
-    if ( interior && exterior ) {
-      DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( interior ) {
-      DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( exterior ) {
-      DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
+    DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
     break;
   case OptGeneric:
-    if ( interior && exterior ) {
-      DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( interior ) {
-      DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    } else if ( exterior ) {
-      DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
-    }
+    for(int s=0;s<LLs;s++){
+      int sF=LLs*sU+s;
+      // assert(sF<in._odata.size());
+      // assert(sU< U._odata.size());
+      // assert(sF>=0);  assert(sU>=0);
+      DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
+      DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
+      out._odata[sF] =naive+naik;
+    }
     break;
   default:
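Both versions select a kernel through the static Opt switch: a generic any-Nc C++ path, a hand-unrolled Nc=3 path, or AVX-512 assembly when built in. The selection pattern in miniature (illustrative names, not the real kernels):

    #include <cassert>
    #include <cstdio>

    enum { OptGeneric, OptHandUnroll, OptInlineAsm };
    static int Opt = OptGeneric;   // settable at run time, as in the diff

    static void site_generic()  { std::printf("generic Nc kernel\n"); }
    static void site_unrolled() { std::printf("hand-unrolled Nc=3 kernel\n"); }

    static void dhop_site() {
      switch (Opt) {
      case OptGeneric:    site_generic();  break;
      case OptHandUnroll: site_unrolled(); break;
      default:            assert(0);       break;  // e.g. asm not compiled in
      }
    }

    int main() { dhop_site(); Opt = OptHandUnroll; dhop_site(); }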
@@ -38,9 +38,8 @@ namespace QCD {
 class StaggeredKernelsStatic {
  public:
   enum { OptGeneric, OptHandUnroll, OptInlineAsm };
-  enum { CommsAndCompute, CommsThenCompute };
-  static int Opt;
-  static int Comms;
+  // S-direction is INNERMOST and takes no part in the parity.
+  static int Opt;  // these are a temporary hack
 };
 
 template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
@@ -54,62 +53,24 @@ public:
 
   void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
                int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
 
-  ///////////////////////////////////////////////////////////////////////////////////////
-  // Generic Nc kernels
-  ///////////////////////////////////////////////////////////////////////////////////////
-  void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
-                       DoubledGaugeField &U, DoubledGaugeField &UUU,
-                       SiteSpinor * buf, int LLs, int sU,
-                       const FermionField &in, FermionField &out,int dag);
-  void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
-                          DoubledGaugeField &U, DoubledGaugeField &UUU,
-                          SiteSpinor * buf, int LLs, int sU,
-                          const FermionField &in, FermionField &out,int dag);
-  void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
-                          DoubledGaugeField &U, DoubledGaugeField &UUU,
-                          SiteSpinor * buf, int LLs, int sU,
-                          const FermionField &in, FermionField &out,int dag);
-
-  ///////////////////////////////////////////////////////////////////////////////////////
-  // Nc=3 specific kernels
-  ///////////////////////////////////////////////////////////////////////////////////////
-  void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
-                    DoubledGaugeField &U,DoubledGaugeField &UUU,
-                    SiteSpinor * buf, int LLs, int sU,
-                    const FermionField &in, FermionField &out,int dag);
-  void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
-                       DoubledGaugeField &U,DoubledGaugeField &UUU,
-                       SiteSpinor * buf, int LLs, int sU,
-                       const FermionField &in, FermionField &out,int dag);
-  void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
-                       DoubledGaugeField &U,DoubledGaugeField &UUU,
-                       SiteSpinor * buf, int LLs, int sU,
-                       const FermionField &in, FermionField &out,int dag);
-
-  ///////////////////////////////////////////////////////////////////////////////////////
-  // Asm Nc=3 specific kernels
-  ///////////////////////////////////////////////////////////////////////////////////////
-  void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-                   DoubledGaugeField &U,DoubledGaugeField &UUU,
-                   SiteSpinor * buf, int LLs, int sU,
-                   const FermionField &in, FermionField &out,int dag);
-  ///////////////////////////////////////////////////////////////////////////////////////////////////
-  // Generic interface; fan out to right routine
-  ///////////////////////////////////////////////////////////////////////////////////////////////////
-  void DhopSite(StencilImpl &st, LebesgueOrder &lo,
-                DoubledGaugeField &U, DoubledGaugeField &UUU,
-                SiteSpinor * buf, int LLs, int sU,
-                const FermionField &in, FermionField &out, int interior=1,int exterior=1);
-  void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
-                   DoubledGaugeField &U, DoubledGaugeField &UUU,
-                   SiteSpinor * buf, int LLs, int sU,
-                   const FermionField &in, FermionField &out, int interior=1,int exterior=1);
-  void DhopSite(StencilImpl &st, LebesgueOrder &lo,
-                DoubledGaugeField &U, DoubledGaugeField &UUU,
-                SiteSpinor * buf, int LLs, int sU,
-                const FermionField &in, FermionField &out, int dag, int interior,int exterior);
+  void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
+                     int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
+
+  void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
+                         int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
+
+  void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
+                    int LLs, int sU, const FermionField &in, FermionField &out, int dag);
+
+  void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
+                   int LLs, int sU, const FermionField &in, FermionField &out);
+
+  void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
+                int sF, int sU, const FermionField &in, FermionField &out);
+
+  void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
+                   int LLs, int sU, const FermionField &in, FermionField &out);
 
  public:
 
@@ -560,53 +560,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
       VSTORE(2,%0,pUChi_02)                    \
       : : "r" (out) : "memory" );
 
-#define nREDUCE(out)                           \
-  asm (                                        \
-       VADD(UChi_00,UChi_10,UChi_00)           \
-       VADD(UChi_01,UChi_11,UChi_01)           \
-       VADD(UChi_02,UChi_12,UChi_02)           \
-       VADD(UChi_30,UChi_20,UChi_30)           \
-       VADD(UChi_31,UChi_21,UChi_31)           \
-       VADD(UChi_32,UChi_22,UChi_32)           \
-       VADD(UChi_00,UChi_30,UChi_00)           \
-       VADD(UChi_01,UChi_31,UChi_01)           \
-       VADD(UChi_02,UChi_32,UChi_02)  );       \
-  asm (VZERO(Chi_00)                           \
-       VSUB(UChi_00,Chi_00,UChi_00)            \
-       VSUB(UChi_01,Chi_00,UChi_01)            \
-       VSUB(UChi_02,Chi_00,UChi_02)   );       \
-  asm (                                        \
-       VSTORE(0,%0,pUChi_00)                   \
-       VSTORE(1,%0,pUChi_01)                   \
-       VSTORE(2,%0,pUChi_02)                   \
-       : : "r" (out) : "memory" );
-
 #define REDUCEa(out)                           \
   asm (                                        \
       VADD(UChi_00,UChi_10,UChi_00)           \
       VADD(UChi_01,UChi_11,UChi_01)           \
       VADD(UChi_02,UChi_12,UChi_02)  );       \
-  asm (                                        \
-       VSTORE(0,%0,pUChi_00)                   \
-       VSTORE(1,%0,pUChi_01)                   \
-       VSTORE(2,%0,pUChi_02)                   \
-       : : "r" (out) : "memory" );
-
-// FIXME is sign right in the VSUB ?
-#define nREDUCEa(out)                          \
   asm (                                        \
-       VADD(UChi_00,UChi_10,UChi_00)           \
-       VADD(UChi_01,UChi_11,UChi_01)           \
-       VADD(UChi_02,UChi_12,UChi_02)  );       \
-  asm (VZERO(Chi_00)                           \
-       VSUB(UChi_00,Chi_00,UChi_00)            \
-       VSUB(UChi_01,Chi_00,UChi_01)            \
-       VSUB(UChi_02,Chi_00,UChi_02)   );       \
-  asm (                                        \
       VSTORE(0,%0,pUChi_00)                   \
       VSTORE(1,%0,pUChi_01)                   \
       VSTORE(2,%0,pUChi_02)                   \
       : : "r" (out) : "memory" );
 
 #define PERMUTE_DIR(dir)                       \
   permute##dir(Chi_0,Chi_0);\
@@ -618,9 +581,10 @@ namespace QCD {
 
 template <class Impl>
 void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-                                         DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                         SiteSpinor *buf, int LLs, int sU,
-                                         const FermionField &in, FermionField &out,int dag)
+                                         DoubledGaugeField &U,
+                                         DoubledGaugeField &UUU,
+                                         SiteSpinor *buf, int LLs,
+                                         int sU, const FermionField &in, FermionField &out)
 {
   assert(0);
 };
@@ -681,9 +645,10 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
 // This is the single precision 5th direction vectorised kernel
 #include <simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-                                                                    DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                                                    SiteSpinor *buf, int LLs, int sU,
-                                                                    const FermionField &in, FermionField &out,int dag)
+                                                                    DoubledGaugeField &U,
+                                                                    DoubledGaugeField &UUU,
+                                                                    SiteSpinor *buf, int LLs,
+                                                                    int sU, const FermionField &in, FermionField &out)
 {
 #ifdef AVX512
   uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -720,11 +685,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 
     addr0 = (uint64_t) &out._odata[sF];
-    if ( dag ) {
-      nREDUCE(addr0);
-    } else {
-      REDUCE(addr0);
-    }
+    REDUCE(addr0);
   }
 #else
   assert(0);
@@ -734,9 +695,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 
 #include <simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-                                                                    DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                                                    SiteSpinor *buf, int LLs, int sU,
-                                                                    const FermionField &in, FermionField &out,int dag)
+                                                                    DoubledGaugeField &U,
+                                                                    DoubledGaugeField &UUU,
+                                                                    SiteSpinor *buf, int LLs,
+                                                                    int sU, const FermionField &in, FermionField &out)
 {
 #ifdef AVX512
   uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -772,11 +734,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
 
     addr0 = (uint64_t) &out._odata[sF];
-    if ( dag ) {
-      nREDUCE(addr0);
-    } else {
-      REDUCE(addr0);
-    }
+    REDUCE(addr0);
  }
 #else
   assert(0);
@@ -818,9 +776,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 
 #include <simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-                                                               DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                                               SiteSpinor *buf, int LLs, int sU,
-                                                               const FermionField &in, FermionField &out,int dag)
+                                                               DoubledGaugeField &U,
+                                                               DoubledGaugeField &UUU,
+                                                               SiteSpinor *buf, int LLs,
+                                                               int sU, const FermionField &in, FermionField &out)
 {
 #ifdef AVX512
   uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -873,11 +832,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
     MULT_ADD_XYZT(gauge2,gauge3);
 
     addr0 = (uint64_t) &out._odata[sF];
-    if ( dag ) {
-      nREDUCEa(addr0);
-    } else {
-      REDUCEa(addr0);
-    }
+    REDUCEa(addr0);
   }
 #else
   assert(0);
@@ -886,9 +841,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 
 #include <simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
-                                                               DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                                               SiteSpinor *buf, int LLs, int sU,
-                                                               const FermionField &in, FermionField &out,int dag)
+                                                               DoubledGaugeField &U,
+                                                               DoubledGaugeField &UUU,
+                                                               SiteSpinor *buf, int LLs,
+                                                               int sU, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -941,11 +897,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
MULT_ADD_XYZT(gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
if ( dag ) {
|
REDUCEa(addr0);
|
||||||
nREDUCEa(addr0);
|
|
||||||
} else {
|
|
||||||
REDUCEa(addr0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -957,7 +909,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
DoubledGaugeField &U, \
|
DoubledGaugeField &U, \
|
||||||
DoubledGaugeField &UUU, \
|
DoubledGaugeField &UUU, \
|
||||||
SiteSpinor *buf, int LLs, \
|
SiteSpinor *buf, int LLs, \
|
||||||
int sU, const FermionField &in, FermionField &out,int dag);
|
int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
||||||
|
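The hunks above strip the dag branch, and with it the negated-reduce macros nREDUCE/nREDUCEa, from the assembler kernels. This works because the staggered hopping term is anti-Hermitian, so applying the dagger only flips its sign, and that sign can be applied outside the kernel (as the hand-unrolled code further down does with a scale factor). A minimal standalone sketch of the identity being exploited; apply_dhop is a hypothetical stand-in for the kernel plus reduction, not a Grid function:

    #include <cassert>

    // The staggered hopping term D obeys D^dag = -D, so the dagger is a
    // single sign flip after the kernel runs, instead of a second
    // "negated reduce" code path inside the assembler.
    double apply_dhop(double hop_sum, bool dag) {
      double scale = dag ? -1.0 : 1.0;   // replaces nREDUCE/nREDUCEa
      return scale * hop_sum;
    }

    int main() {
      double hop = 0.25;                 // stand-in for a reduced kernel value
      assert(apply_dhop(hop, false) == -apply_dhop(hop, true));
      return 0;
    }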
@@ -28,6 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #include <Grid.h>

+#define REGISTER

 #define LOAD_CHI(b) \
   const SiteSpinor & ref (b[offset]); \
@@ -58,7 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   UChi ## _1 += U_12*Chi_2;\
   UChi ## _2 += U_22*Chi_2;

-#define MULT_ADD(U,A,UChi) \
+#define MULT_ADD(A,UChi) \
   auto & ref(U._odata[sU](A)); \
   Impl::loadLinkElement(U_00,ref()(0,0)); \
   Impl::loadLinkElement(U_10,ref()(1,0)); \
@@ -81,319 +82,241 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define PERMUTE_DIR(dir) \
-  permute##dir(Chi_0,Chi_0); \
-  permute##dir(Chi_1,Chi_1); \
+  permute##dir(Chi_0,Chi_0);\
+  permute##dir(Chi_1,Chi_1);\
   permute##dir(Chi_2,Chi_2);

-#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
-  SE=st.GetEntry(ptype,Dir+skew,sF); \
-  offset = SE->_offset; \
-  local  = SE->_is_local; \
-  perm   = SE->_permute; \
-  if ( local ) { \
-    LOAD_CHI(in._odata); \
-    if ( perm) { \
-      PERMUTE_DIR(Perm); \
-    } \
-  } else { \
-    LOAD_CHI(buf); \
-  }
-
-#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
-  { \
-    MULT(Dir,even); \
-  }
-
-#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
-  HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
-  { \
-    MULT_ADD(U,Dir,even); \
-  }
-
-#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
-  SE=st.GetEntry(ptype,Dir+skew,sF); \
-  offset = SE->_offset; \
-  local  = SE->_is_local; \
-  perm   = SE->_permute; \
-  if ( local ) { \
-    LOAD_CHI(in._odata); \
-    if ( perm) { \
-      PERMUTE_DIR(Perm); \
-    } \
-  } else if ( st.same_node[Dir] ) { \
-    LOAD_CHI(buf); \
-  } \
-  if (SE->_is_local || st.same_node[Dir] ) { \
-    MULT_ADD(U,Dir,even); \
-  }
-
-#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
-  SE=st.GetEntry(ptype,Dir+skew,sF); \
-  offset = SE->_offset; \
-  local  = SE->_is_local; \
-  perm   = SE->_permute; \
-  if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
-    nmu++; \
-    { LOAD_CHI(buf); } \
-    { MULT_ADD(U,Dir,even); } \
-  }
-
 namespace Grid {
 namespace QCD {

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
-                                          DoubledGaugeField &U,DoubledGaugeField &UUU,
-                                          SiteSpinor *buf, int LLs, int sU,
-                                          const FermionField &in, FermionField &out,int dag)
+void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
+                                          SiteSpinor *buf, int LLs,
+                                          int sU, const FermionField &in, FermionField &out, int dag)
 {
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0; // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-
-  Simd U_00; // two rows of U matrix
-  Simd U_10;
-  Simd U_20;
-  Simd U_01;
-  Simd U_11;
-  Simd U_21; // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22;
-
-  SiteSpinor result;
-  int offset,local,perm, ptype;
-
-  StencilEntry *SE;
-  int skew;
+  SiteSpinor naik;
+  SiteSpinor naive;
+  int oneLink =0;
+  int threeLink=1;
+  int skew(0);
+  Real scale(1.0);
+
+  if(dag) scale = -1.0;

   for(int s=0;s<LLs;s++){
     int sF=s+LLs*sU;
-    skew = 0;
-    HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
-    HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
-    HAND_STENCIL_LEG (U,Zp,1,skew,even);
-    HAND_STENCIL_LEG (U,Tp,0,skew,odd);
-    HAND_STENCIL_LEG (U,Xm,3,skew,even);
-    HAND_STENCIL_LEG (U,Ym,2,skew,odd);
-    HAND_STENCIL_LEG (U,Zm,1,skew,even);
-    HAND_STENCIL_LEG (U,Tm,0,skew,odd);
-    skew = 8;
-    HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
-    HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
-    HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
-    HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
-    HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
-    HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
-    HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
-    HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
-
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else {
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out._odata[sF],result);
+    DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
+    DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
+    out._odata[sF] =scale*(naive+naik);
   }
 }

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
-                                             DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                             SiteSpinor *buf, int LLs, int sU,
-                                             const FermionField &in, FermionField &out,int dag)
+void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                                               SiteSpinor *buf, int sF,
+                                               int sU, const FermionField &in, SiteSpinor &out,int threeLink)
 {
   typedef typename Simd::scalar_type S;
   typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0; // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-
-  Simd U_00; // two rows of U matrix
-  Simd U_10;
-  Simd U_20;
-  Simd U_01;
-  Simd U_11;
-  Simd U_21; // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22;
-
-  SiteSpinor result;
+  REGISTER Simd even_0; // 12 regs on knc
+  REGISTER Simd even_1;
+  REGISTER Simd even_2;
+  REGISTER Simd odd_0; // 12 regs on knc
+  REGISTER Simd odd_1;
+  REGISTER Simd odd_2;
+
+  REGISTER Simd Chi_0; // two spinor; 6 regs
+  REGISTER Simd Chi_1;
+  REGISTER Simd Chi_2;
+
+  REGISTER Simd U_00; // two rows of U matrix
+  REGISTER Simd U_10;
+  REGISTER Simd U_20;
+  REGISTER Simd U_01;
+  REGISTER Simd U_11;
+  REGISTER Simd U_21; // 2 reg left.
+  REGISTER Simd U_02;
+  REGISTER Simd U_12;
+  REGISTER Simd U_22;
+
+  int skew = 0;
+  if (threeLink) skew=8;
+
   int offset,local,perm, ptype;

   StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    even_0 = zero; even_1 = zero; even_2 = zero;
-    odd_0  = zero; odd_1  = zero; odd_2  = zero;
-
-    skew = 0;
-    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
-    HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
-    HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
-    HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
-    HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
-    skew = 8;
-    HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
-    HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
-
-    // Assume every site must be connected to at least one interior point. No 1^4 subvols.
-    if ( dag ) {
-      result()()(0) = - even_0 - odd_0;
-      result()()(1) = - even_1 - odd_1;
-      result()()(2) = - even_2 - odd_2;
-    } else {
-      result()()(0) = even_0 + odd_0;
-      result()()(1) = even_1 + odd_1;
-      result()()(2) = even_2 + odd_2;
-    }
-    vstream(out._odata[sF],result);
-  }
-}
-
-template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
-                                             DoubledGaugeField &U, DoubledGaugeField &UUU,
-                                             SiteSpinor *buf, int LLs, int sU,
-                                             const FermionField &in, FermionField &out,int dag)
-{
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
-
-  Simd Chi_0; // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-
-  Simd U_00; // two rows of U matrix
-  Simd U_10;
-  Simd U_20;
-  Simd U_01;
-  Simd U_11;
-  Simd U_21; // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22;
-
-  SiteSpinor result;
-  int offset,local,perm, ptype;
-
-  StencilEntry *SE;
-  int skew;
-
-  for(int s=0;s<LLs;s++){
-    int sF=s+LLs*sU;
-
-    even_0 = zero; even_1 = zero; even_2 = zero;
-    odd_0  = zero; odd_1  = zero; odd_2  = zero;
-    int nmu=0;
-    skew = 0;
-    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
-    HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
-    skew = 8;
-    HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
-    HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
-    HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
-    HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
-
-    // Add sum of all exterior connected stencil legs
-    if ( nmu ) {
-      if ( dag ) {
-        result()()(0) = - even_0 - odd_0;
-        result()()(1) = - even_1 - odd_1;
-        result()()(2) = - even_2 - odd_2;
-      } else {
-        result()()(0) = even_0 + odd_0;
-        result()()(1) = even_1 + odd_1;
-        result()()(2) = even_2 + odd_2;
-      }
-      out._odata[sF] = out._odata[sF] + result;
-    }
-  }
-}
+
+  // Xp
+  SE=st.GetEntry(ptype,Xp+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT(Xp,even);
+  }
+
+  // Yp
+  SE=st.GetEntry(ptype,Yp+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT(Yp,odd);
+  }
+
+  // Zp
+  SE=st.GetEntry(ptype,Zp+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT_ADD(Zp,even);
+  }
+
+  // Tp
+  SE=st.GetEntry(ptype,Tp+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT_ADD(Tp,odd);
+  }
+
+  // Xm
+  SE=st.GetEntry(ptype,Xm+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT_ADD(Xm,even);
+  }
+
+  // Ym
+  SE=st.GetEntry(ptype,Ym+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT_ADD(Ym,odd);
+  }
+
+  // Zm
+  SE=st.GetEntry(ptype,Zm+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT_ADD(Zm,even);
+  }
+
+  // Tm
+  SE=st.GetEntry(ptype,Tm+skew,sF);
+  offset = SE->_offset;
+  local  = SE->_is_local;
+  perm   = SE->_permute;
+
+  if ( local ) {
+    LOAD_CHI(in._odata);
+    if ( perm) {
+      PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+    }
+  } else {
+    LOAD_CHI(buf);
+  }
+  {
+    MULT_ADD(Tm,odd);
+  }
+
+  vstream(out()()(0),even_0+odd_0);
+  vstream(out()()(1),even_1+odd_1);
+  vstream(out()()(2),even_2+odd_2);
+}

 #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
   template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
                                                      DoubledGaugeField &U,DoubledGaugeField &UUU, \
-                                                     SiteSpinor *buf, int LLs, int sU, \
-                                                     const FermionField &in, FermionField &out, int dag); \
-  \
-  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
-                                                        DoubledGaugeField &U,DoubledGaugeField &UUU, \
-                                                        SiteSpinor *buf, int LLs, int sU, \
-                                                        const FermionField &in, FermionField &out, int dag); \
-  \
-  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
-                                                        DoubledGaugeField &U,DoubledGaugeField &UUU, \
-                                                        SiteSpinor *buf, int LLs, int sU, \
-                                                        const FermionField &in, FermionField &out, int dag); \
+                                                     SiteSpinor *buf, int LLs, \
+                                                     int sU, const FermionField &in, FermionField &out, int dag);
+
+#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
+  template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
+                                                          SiteSpinor *buf, int sF, \
+                                                          int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;

 DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
 DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
 DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
 DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);

+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
+DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);

-}
-}
+}}
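The rewritten DhopSiteHand applies the one-link ("naive") and three-link ("naik") stencils separately through DhopSiteDepthHand and combines them with a single dag-dependent sign. A standalone sketch of that combination step; SiteSpinor here is a three-component stand-in, not Grid's vectorised type:

    #include <array>
    #include <cstdio>

    using SiteSpinor = std::array<double,3>; // stand-in for Grid's colour vector

    // Combination in the new DhopSiteHand: the one-link and three-link
    // stencils are applied separately, then summed with a sign that
    // implements the dagger.
    SiteSpinor combine(const SiteSpinor &naive, const SiteSpinor &naik, int dag) {
      double scale = dag ? -1.0 : 1.0;
      SiteSpinor out;
      for (int c = 0; c < 3; c++) out[c] = scale * (naive[c] + naik[c]);
      return out;
    }

    int main() {
      SiteSpinor naive{1.0, 2.0, 3.0}, naik{0.1, 0.2, 0.3};
      SiteSpinor out = combine(naive, naik, /*dag=*/1);
      std::printf("%g %g %g\n", out[0], out[1], out[2]); // -1.1 -2.2 -3.3
    }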
@@ -274,16 +274,41 @@ public:
   if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
 }

+  std::vector<int> same_node;
+  std::vector<int> surface_list;
+
 WilsonStencil(GridBase *grid,
               int npoints,
               int checkerboard,
               const std::vector<int> &directions,
               const std::vector<int> &distances)
-  : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
+  : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
+    same_node(npoints)
 {
   ZeroCountersi();
+  surface_list.resize(0);
 };

+ void BuildSurfaceList(int Ls,int vol4){
+
+   // find same node for SHM
+   // Here we know the distance is 1 for WilsonStencil
+   for(int point=0;point<this->_npoints;point++){
+     same_node[point] = this->SameNode(point);
+   }
+
+   for(int site = 0 ;site< vol4;site++){
+     int local = 1;
+     for(int point=0;point<this->_npoints;point++){
+       if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
+         local = 0;
+       }
+     }
+     if(local == 0) {
+       surface_list.push_back(site);
+     }
+   }
+ }
+
 template < class compressor>
 void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
@@ -344,23 +369,23 @@ public:
     int dag = compress.dag;
     int face_idx=0;
     if ( dag ) {
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
+      assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
+      assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
+      assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
+      assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
+      assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
+      assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
+      assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
+      assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
     } else {
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
+      assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
+      assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
+      assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
+      assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
+      assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
+      assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
+      assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
+      assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
     }
     this->face_table_computed=1;
     assert(this->u_comm_offset==this->_unified_buffer_size);
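BuildSurfaceList, added above, records every 4d site that has at least one stencil leg which is neither node-local nor reachable through shared memory; those are the sites an exterior fixup pass must revisit once communications complete. A standalone sketch of the same logic, where node_local and same_node are stand-ins for GetNodeLocal() and SameNode():

    #include <vector>
    #include <cstdio>

    // A site is "surface" if any stencil point reaches off-node without a
    // shared-memory neighbour.
    std::vector<int> build_surface_list(int vol4, int npoints,
                                        const std::vector<int> &same_node,
                                        bool (*node_local)(int,int)) {
      std::vector<int> surface;
      for (int site = 0; site < vol4; site++) {
        bool local = true;
        for (int point = 0; point < npoints; point++)
          if (!node_local(site, point) && !same_node[point]) local = false;
        if (!local) surface.push_back(site);
      }
      return surface;
    }

    static bool toy_local(int site, int point) { return site % 4 != 0; } // toy rule

    int main() {
      std::vector<int> same_node(8, 0);
      auto s = build_surface_list(16, 8, same_node, toy_local);
      std::printf("surface sites: %zu\n", s.size()); // every 4th site -> 4
    }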
@@ -348,98 +348,15 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
   parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
     Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
   }
-}
-/*Change starts*/
+};

 template <class Impl>
 void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                        DoubledGaugeField &U,
                                        const FermionField &in,
                                        FermionField &out, int dag) {
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
-  else
-#endif
-    DhopInternalSerial(st,lo,U,in,out,dag);
-}
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
-                                                      DoubledGaugeField &U,
-                                                      const FermionField &in,
-                                                      FermionField &out, int dag) {
   assert((dag == DaggerNo) || (dag == DaggerYes));
-#ifdef GRID_OMP
-  Compressor compressor;
-  int len =  U._grid->oSites();
-  const int LLs = 1;
-
-  st.Prepare();
-  st.HaloGather(in,compressor);
-  st.CommsMergeSHM(compressor);
-#pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    int nthreads = omp_get_num_threads();
-    int ncomms = CartesianCommunicator::nCommThreads;
-    if (ncomms == -1) ncomms = 1;
-    assert(nthreads > ncomms);
-    if (tid >= ncomms) {
-      nthreads -= ncomms;
-      int ttid = tid - ncomms;
-      int n = len;
-      int chunk = n / nthreads;
-      int rem = n % nthreads;
-      int myblock, myn;
-      if (ttid < rem) {
-        myblock = ttid * chunk + ttid;
-        myn = chunk+1;
-      } else {
-        myblock = ttid*chunk + rem;
-        myn = chunk;
-      }
-      // do the compute
-      if (dag == DaggerYes) {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-          Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-        }
-      } else {
-        for (int sss = myblock; sss < myblock+myn; ++sss) {
-          Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-        }
-      } //else
-    } else {
-      st.CommunicateThreaded();
-    }
-
-    Compressor compressor(dag);
-
-    if (dag == DaggerYes) {
-      parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-        Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-      }
-    } else {
-      parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
-        Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
-      }
-    }
-
-  } //pragma
-#else
-  assert(0);
-#endif
-};
-
-template <class Impl>
-void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
-                                             DoubledGaugeField &U,
-                                             const FermionField &in,
-                                             FermionField &out, int dag) {
-  assert((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
   st.HaloExchange(in, compressor);

@@ -453,7 +370,6 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
     }
   }
 };
-/*Change ends */

 /*******************************************************************************
  * Conserved current utilities for Wilson fermions, for contracting propagators
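The removed DhopInternalOverlappedComms partitioned OpenMP threads into ncomms communication threads and nthreads-ncomms compute workers, splitting the site loop into near-equal contiguous blocks with the first rem workers taking one extra site. A standalone sketch of that block arithmetic:

    #include <cassert>

    // Block partition used by the overlapped-comms path: worker ttid of
    // nworkers gets a contiguous range [myblock, myblock+myn) of n sites.
    void my_block(int n, int nworkers, int ttid, int &myblock, int &myn) {
      int chunk = n / nworkers;
      int rem   = n % nworkers;
      if (ttid < rem) { myblock = ttid * chunk + ttid; myn = chunk + 1; }
      else            { myblock = ttid * chunk + rem;  myn = chunk;     }
    }

    int main() {
      int total = 0;
      for (int t = 0; t < 7; t++) {     // 7 compute workers over 100 sites
        int b, n; my_block(100, 7, t, b, n);
        assert(b == total);              // blocks tile the range contiguously
        total += n;
      }
      assert(total == 100);
      return 0;
    }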
@@ -130,12 +130,6 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                     const FermionField &in, FermionField &out, int dag);

-  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                          const FermionField &in, FermionField &out, int dag);
-
-  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-                                   const FermionField &in, FermionField &out, int dag);
-
   // Constructor
   WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
                 GridRedBlackCartesian &Hgrid, RealD _mass,
@@ -151,8 +145,6 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

   // protected:
 public:
-  virtual RealD Mass(void) { return mass; }
-  virtual int isTrivialEE(void) { return 1; };
   RealD mass;
   RealD diag_mass;
@@ -445,7 +445,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
       }
     }
     ptime = usecond() - start;
-  } else {
+  }
+  {
     double start = usecond();
     st.CommunicateThreaded();
     ctime = usecond() - start;
@@ -53,7 +53,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
   typedef FermionOperator<Impl> Base;

 public:

 template <bool EnableBool = true>
 typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
 DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -70,27 +70,27 @@ public:
       break;
 #endif
     case OptHandUnroll:
       for (int site = 0; site < Ns; site++) {
         for (int s = 0; s < Ls; s++) {
           if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
           else if (interior)     WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
           else if (exterior)     WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
           sF++;
         }
         sU++;
       }
       break;
     case OptGeneric:
       for (int site = 0; site < Ns; site++) {
         for (int s = 0; s < Ls; s++) {
           if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
           else if (interior)     WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
           else if (exterior)     WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
           else assert(0);
           sF++;
         }
         sU++;
       }
       break;
     default:
       assert(0);
@@ -232,7 +232,6 @@ private:
   void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
                              int sF, int sU, const FermionField &in, FermionField &out);
-
   void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
                    int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
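DhopSite above dispatches on the interior/exterior flags: both set runs the full kernel, interior alone runs while halo communication is still in flight, and exterior alone is the fixup pass over surface sites. A standalone sketch of the selection; the enum names are illustrative, not Grid's:

    #include <cstdio>

    enum class Kernel { Full, Interior, Exterior };

    // Mirror of the flag logic in WilsonKernels::DhopSite.
    Kernel select(bool interior, bool exterior) {
      if (interior && exterior) return Kernel::Full;
      if (interior)             return Kernel::Interior;
      if (exterior)             return Kernel::Exterior;
      return Kernel::Full; // unreachable in practice; DhopSite asserts instead
    }

    int main() {
      std::printf("%d %d %d\n", (int)select(true,true),
                  (int)select(true,false), (int)select(false,true)); // 0 1 2
    }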
@@ -80,8 +80,12 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
     this->build_filenames(traj, Params, config, rng);
     GridBase *grid = U._grid;
     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
     ScidacWriter _ScidacWriter(grid->IsBoss());
+    _ScidacWriter.open(rng);
+    _ScidacWriter.writeScidacRNGRecord(sRNG, pRNG);
+    _ScidacWriter.close();
+
+    //BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
     _ScidacWriter.open(config);
     _ScidacWriter.writeScidacFieldRecord(U, MData);
     _ScidacWriter.close();
@@ -102,10 +106,12 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {

     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
-    BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
-
-    Metadata md_content;
     ScidacReader _ScidacReader;
+    //BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
+    _ScidacReader.open(rng);
+    _ScidacReader.readScidacRNGRecord(sRNG, pRNG);
+    _ScidacReader.close();
+    Metadata md_content;
     _ScidacReader.open(config);
     _ScidacReader.readScidacFieldRecord(U,md_content); // format from the header
     _ScidacReader.close();
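The checkpointer now routes RNG state through SciDAC records instead of BinaryIO, so the RNG file carries the same record structure as the gauge configuration. A usage fragment mirroring the calls in the hunk above; this is not standalone code, and grid, sRNG, pRNG, rng, config, U and MData are assumed to be set up by the surrounding checkpointer:

    // Fragment only: same call sequence as the new TrajectoryComplete path.
    ScidacWriter writer(grid->IsBoss());
    writer.open(rng);                        // RNG state goes to its own file
    writer.writeScidacRNGRecord(sRNG, pRNG);
    writer.close();

    writer.open(config);                     // gauge field plus metadata record
    writer.writeScidacFieldRecord(U, MData);
    writer.close();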
@@ -150,9 +150,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   std::vector<int> _distances;
   std::vector<int> _comm_buf_size;
   std::vector<int> _permute_type;
-  std::vector<int> same_node;
-  std::vector<int> surface_list;

   Vector<StencilEntry> _entries;
   std::vector<Packet> Packets;
   std::vector<Merge> Mergers;
@@ -203,7 +201,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal

   int dimension    = _directions[point];
   int displacement = _distances[point];
+  assert( (displacement==1) || (displacement==-1));

   int pd = _grid->_processors[dimension];
   int fd = _grid->_fdimensions[dimension];
@@ -218,12 +216,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   if ( ! comm_dim ) return 1;

   int nbr_proc;
-  if (displacement>0) nbr_proc = 1;
+  if (displacement==1) nbr_proc = 1;
   else                nbr_proc = pd-1;

-  // FIXME this logic needs to be sorted for three link term
-  // assert( (displacement==1) || (displacement==-1));
-  // Present hack only works for >= 4^4 subvol per node
   _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

   void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
@@ -544,29 +539,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   }
 };

-  // Move interior/exterior split into the generic stencil
-  // FIXME Explicit Ls in interface is a pain. Should just use a vol
-  void BuildSurfaceList(int Ls,int vol4){
-
-    // find same node for SHM
-    // Here we know the distance is 1 for WilsonStencil
-    for(int point=0;point<this->_npoints;point++){
-      same_node[point] = this->SameNode(point);
-    }
-
-    for(int site = 0 ;site< vol4;site++){
-      int local = 1;
-      for(int point=0;point<this->_npoints;point++){
-        if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
-          local = 0;
-        }
-      }
-      if(local == 0) {
-        surface_list.push_back(site);
-      }
-    }
-  }
-
 CartesianStencil(GridBase *grid,
                  int npoints,
                  int checkerboard,
@@ -577,8 +549,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   comm_bytes_thr(npoints),
   comm_enter_thr(npoints),
   comm_leave_thr(npoints),
-  comm_time_thr(npoints),
-  same_node(npoints)
+  comm_time_thr(npoints)
 {
   face_table_computed=0;
   _npoints = npoints;
@@ -586,7 +557,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   _directions = directions;
   _distances  = distances;
   _unified_buffer_size=0;
-  surface_list.resize(0);

   int osites = _grid->oSites();
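The SameNode hunks above replace the old displacement>0 test, whose own FIXME admitted it was not sorted for the three-link term, with an explicit up-front assertion that the displacement is exactly +/-1. A standalone sketch of the neighbour-offset selection for periodic rank geometry of extent pd (neighbour is an illustrative helper, not Grid's API):

    #include <cassert>

    // A +1 hop goes to rank offset 1, a -1 hop to offset pd-1
    // (i.e. rank-1 mod pd). Anything else is now rejected loudly.
    int neighbour(int displacement, int pd) {
      assert(displacement == 1 || displacement == -1);
      return (displacement == 1) ? 1 : pd - 1;
    }

    int main() {
      assert(neighbour( 1, 4) == 1);
      assert(neighbour(-1, 4) == 3);
      return 0;
    }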
@@ -368,10 +368,8 @@ void Grid_init(int *argc,char ***argv)
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
     QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
-    QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;
   } else {
     QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
-    QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
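GridCmdOptionExists simply scans argv for a literal flag; --comms-overlap selects the overlapped comms-and-compute policy, otherwise comms-then-compute is used. A standalone sketch of that test, where cmd_option_exists is an illustrative reimplementation rather than Grid's function:

    #include <algorithm>
    #include <cstdio>
    #include <cstring>

    // True if the exact option string appears anywhere in argv.
    bool cmd_option_exists(char **begin, char **end, const char *option) {
      return std::find_if(begin, end,
               [&](char *s){ return !std::strcmp(s, option); }) != end;
    }

    int main(int argc, char **argv) {
      bool overlap = cmd_option_exists(argv, argv + argc, "--comms-overlap");
      std::printf("comms policy: %s\n",
                  overlap ? "CommsAndCompute" : "CommsThenCompute");
    }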
@@ -141,7 +141,6 @@ int main (int argc, char ** argv)
   t1=usecond();

   std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
-  std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;

@@ -161,8 +160,7 @@ int main (int argc, char ** argv)
   localConvert(sresult,tmp);

   std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
-  std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
-  std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;

@@ -183,7 +181,6 @@ int main (int argc, char ** argv)
   localConvert(sresult,tmp);

   std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
-  std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
@@ -1,196 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./benchmarks/Benchmark_wilson.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  std::vector<int> latt_size   = GridDefaultLatt();
-  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-  std::vector<int> mpi_layout  = GridDefaultMpi();
-
-  const int Ls=16;
-  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
-  GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
-  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-  GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
-  std::vector<int> seeds({1,2,3,4});
-
-  GridParallelRNG pRNG4(UGrid);
-  GridParallelRNG pRNG5(FGrid);
-  pRNG4.SeedFixedIntegers(seeds);
-  pRNG5.SeedFixedIntegers(seeds);
-
-  typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField;
-  typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField;
-  typename ImprovedStaggeredFermion5DF::ImplParams params;
-
-  FermionField src (FGrid);
-  random(pRNG5,src);
-  /*
-  std::vector<int> site({0,1,2,0,0});
-  ColourVector cv = zero;
-  cv()()(0)=1.0;
-  src = zero;
-  pokeSite(cv,src,site);
-  */
-  FermionField result(FGrid); result=zero;
-  FermionField    tmp(FGrid);    tmp=zero;
-  FermionField    err(FGrid);    tmp=zero;
-  FermionField phi   (FGrid); random(pRNG5,phi);
-  FermionField chi   (FGrid); random(pRNG5,chi);
-
-  LatticeGaugeFieldF Umu(UGrid);
-  SU3::HotConfiguration(pRNG4,Umu);
-
-  /*
-  for(int mu=1;mu<4;mu++){
-    auto tmp = PeekIndex<LorentzIndex>(Umu,mu);
-    tmp = zero;
-    PokeIndex<LorentzIndex>(Umu,tmp,mu);
-  }
-  */
-  double volume=Ls;
-  for(int mu=0;mu<Nd;mu++){
-    volume=volume*latt_size[mu];
-  }
-
-  RealD mass=0.1;
-  RealD c1=9.0/8.0;
-  RealD c2=-1.0/24.0;
-  RealD u0=1.0;
-
-  ImprovedStaggeredFermion5DF Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params);
-  ImprovedStaggeredFermionVec5dF sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params);
-
-  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
-  std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
-  std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
-
-  int ncall=1000;
-  int ncall1=1000;
-  double t0(0),t1(0);
-  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
-
-  std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
-  t0=usecond();
-  for(int i=0;i<ncall1;i++){
-    Ds.Dhop(src,result,0);
-  }
-  t1=usecond();
-
-  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-
-  std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
-
-#ifdef AVX512
-  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
-#else
-  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
-#endif
-
-  t0=usecond();
-  for(int i=0;i<ncall1;i++){
-    Ds.Dhop(src,tmp,0);
-  }
-  t1=usecond();
-
-  std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
-  std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-
-  err = tmp-result;
-  std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-
-  FermionField ssrc   (sFGrid); localConvert(src,ssrc);
-  FermionField sresult(sFGrid); sresult=zero;
-
-  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
-  t0=usecond();
-  for(int i=0;i<ncall1;i++){
-    sDs.Dhop(ssrc,sresult,0);
-  }
-  t1=usecond();
-  localConvert(sresult,tmp);
-
-  std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
-  std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
-  std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-
-#ifdef AVX512
-  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
-#else
-  QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
-#endif
-
-  err = tmp-result;
-  std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-  int extra=1;
-  t0=usecond();
-  for(int i=0;i<ncall1*extra;i++){
-    sDs.Dhop(ssrc,sresult,0);
-  }
-  t1=usecond();
-  localConvert(sresult,tmp);
-
-  std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
-  std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
-
-  err = tmp-result;
-  std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-
-  Grid_finalize();
-}
@ -74,16 +74,8 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
||||||
|
|
||||||
double volume=1;
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
volume=volume*latt_size[mu];
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD mass=0.003;
|
RealD mass=0.003;
|
||||||
RealD c1=9.0/8.0;
|
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
|
||||||
RealD c2=-1.0/24.0;
|
|
||||||
RealD u0=1.0;
|
|
||||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
|
||||||
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
||||||
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
@@ -95,26 +87,14 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
-  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
+  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
   SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
   FermionField src4d(UGrid); random(pRNG,src4d);
   FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
   FermionField result4d_o(UrbGrid);
 
-  double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
   result4d_o=zero;
-  {
-    double t1=usecond();
-    CG(HermOp4d,src4d_o,result4d_o);
-    double t2=usecond();
-    double ncall=CG.IterationsToComplete;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-    HermOp4d.Report();
-  }
-  Ds4d.Report();
+  CG(HermOp4d,src4d_o,result4d_o);
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
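The deodoe_flops line removed here encodes the per-site cost of one Deo plus Doe application: 16 SU(3) matrix-vector multiplies (8 directions, each with a one-hop and a three-hop Naik term) at 3*(6+8+8) = 66 flops apiece, plus 15 complex three-vector accumulations at 3*2 flops, giving 16*66 + 90 = 1146 flops per site — which is what the terse "== 66*16 + == 1146" comment is counting. The decomposition spelled out (my reading of that comment):

// Per-site flop count of the improved staggered Deo+Doe, as removed above.
constexpr int matvec     = 3*(6+8+8);   // 66 flops per 3x3 complex mat-vec
constexpr int accum      = 15*3*2;      // 90 flops summing the 16 partial results
constexpr int site_flops = 16*matvec + accum;
static_assert(site_flops == 1146, "matches the // == 1146 comment");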
@@ -123,17 +103,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=zero;
-  {
-    double t1=usecond();
-    CG(HermOp,src_o,result_o);
-    double t2=usecond();
-    double ncall=CG.IterationsToComplete*Ls;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-    HermOp.Report();
-  }
+  CG(HermOp,src_o,result_o);
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -142,18 +112,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=zero;
-  {
-    double t1=usecond();
-    mCG(HermOp,src_o,result_o);
-    double t2=usecond();
-    double ncall=mCG.IterationsToComplete*Ls;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-    HermOp.Report();
-  }
+  mCG(HermOp,src_o,result_o);
 
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -162,17 +121,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=zero;
-  {
-    double t1=usecond();
-    BCGrQ(HermOp,src_o,result_o);
-    double t2=usecond();
-    double ncall=BCGrQ.IterationsToComplete*Ls;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-    HermOp.Report();
-  }
+  BCGrQ(HermOp,src_o,result_o);
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
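In the three timing blocks removed above, the iteration count is multiplied by Ls before being multiplied by deodoe_flops: ImprovedStaggeredFermion5DR packs Ls independent 4d right-hand sides into one 5d field, so every CG (or mCG/BCGrQ) iteration applies the 4d operator Ls times. Schematically — a sketch of the accounting only, not library code:

// Flop accounting for the multi-RHS 5d staggered operator, per solve.
double solve_flops(double site_flops,double volume4d,int Ls,int iterations)
{
  return site_flops*volume4d*Ls*iterations;  // Ls 4d Dhop applications per 5d iteration
}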
@@ -74,16 +74,7 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
 
   RealD mass=0.003;
-  RealD c1=9.0/8.0;
-  RealD c2=-1.0/24.0;
-  RealD u0=1.0;
-
-  double volume=1;
-  for(int mu=0;mu<Nd;mu++){
-    volume=volume*latt_size[mu];
-  }
-
-  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
+  ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
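This second test drives the same solves through MdagMLinearOperator instead of the Schur operator: CG then acts on the normal operator Mdag M over the full grid, which is Hermitian positive definite but roughly twice the work per iteration and worse conditioned than the odd-site Schur complement. What the wrapper applies, schematically (a sketch, not Grid's implementation):

// Schematic of MdagMLinearOperator's Hermitian operator: out = Mdag M in.
template<class Matrix,class Field>
void MdagMOp(Matrix &M,const Field &in,Field &out,Field &tmp)
{
  M.M(in,tmp);      // tmp = M in
  M.Mdag(tmp,out);  // out = Mdag M in, Hermitian positive definite
}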
@@ -95,23 +86,11 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
-  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
+  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
   FermionField src4d(UGrid); random(pRNG,src4d);
   FermionField result4d(UGrid); result4d=zero;
-  double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
-  {
-    double t1=usecond();
-    CG(HermOp4d,src4d,result4d);
-    double t2=usecond();
-    double ncall=CG.IterationsToComplete;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-  }
+  CG(HermOp4d,src4d,result4d);
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -119,18 +98,9 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
-  {
   Ds.ZeroCounters();
-    double t1=usecond();
   CG(HermOp,src,result);
-    double t2=usecond();
-    double ncall=CG.IterationsToComplete;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
   Ds.Report();
-  }
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
 
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -138,16 +108,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
   Ds.ZeroCounters();
-  {
-    double t1=usecond();
   mCG(HermOp,src,result);
-    double t2=usecond();
-    double ncall=CG.IterationsToComplete;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-  }
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -156,16 +117,7 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
   Ds.ZeroCounters();
-  {
-    double t1=usecond();
   BCGrQ(HermOp,src,result);
-    double t2=usecond();
-    double ncall=CG.IterationsToComplete;
-    double flops = deodoe_flops * ncall;
-    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-  }
   Ds.Report();
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -71,10 +71,7 @@ int main (int argc, char ** argv)
   }
 
   RealD mass=0.003;
-  RealD c1=9.0/8.0;
-  RealD c2=-1.0/24.0;
-  RealD u0=1.0;
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
 
   FermionField res_o(&RBGrid);
   FermionField src_o(&RBGrid);
@@ -83,19 +80,7 @@ int main (int argc, char ** argv)
 
   SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
-  double t1=usecond();
   CG(HermOpEO,src_o,res_o);
-  double t2=usecond();
-
-  // Schur solver: uses DeoDoe => volume * 1146
-  double ncall=CG.IterationsToComplete;
-  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
-
-  std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-  std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
 
   FermionField tmp(&RBGrid);
@@ -65,10 +65,7 @@ int main (int argc, char ** argv)
   FermionField resid(&Grid);
 
   RealD mass=0.1;
-  RealD c1=9.0/8.0;
-  RealD c2=-1.0/24.0;
-  RealD u0=1.0;
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
 
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
   SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
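SchurRedBlackStaggeredSolve wraps the checkerboarding itself: it takes a full-grid source and solution, runs the odd-site CG internally, and reconstructs the even sites. Typical usage following the constructor above — a sketch assuming this test's Grid, RBGrid, Umu and pRNG setup:

ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);

FermionField src(&Grid);    random(pRNG,src);
FermionField result(&Grid); result=zero;
SchurSolver(Ds,src,result);   // full-grid in/out; Schur CG on the odd checkerboard inside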
@@ -73,10 +73,7 @@ int main (int argc, char ** argv)
   }
 
   RealD mass=0.1;
-  RealD c1=9.0/8.0;
-  RealD c2=-1.0/24.0;
-  RealD u0=1.0;
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
+  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
 
   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
   ConjugateGradient<FermionField> CG(1.0e-6,10000);
@@ -1,121 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./tests/Test_wilson_cg_unprec.cc
-
-    Copyright (C) 2015
-
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <Grid/Grid.h>
-#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-template<class d>
-struct scal {
-  d internal;
-};
-
-Gamma::Algebra Gmu [] = {
-  Gamma::Algebra::GammaX,
-  Gamma::Algebra::GammaY,
-  Gamma::Algebra::GammaZ,
-  Gamma::Algebra::GammaT
-};
-
-int main (int argc, char ** argv)
-{
-  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
-  typename ImprovedStaggeredFermionR::ImplParams params;
-
-  Grid_init(&argc,&argv);
-
-  std::vector<int> latt_size   = GridDefaultLatt();
-  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-  std::vector<int> mpi_layout  = GridDefaultMpi();
-  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
-  GridRedBlackCartesian     RBGrid(&Grid);
-
-  std::vector<int> seeds({1,2,3,4});
-  GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
-
-  LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
-
-  double volume=1;
-  for(int mu=0;mu<Nd;mu++){
-    volume=volume*latt_size[mu];
-  }
-
-  ////////////////////////////////////////
-  // sqrt
-  ////////////////////////////////////////
-  double lo=0.001;
-  double hi=1.0;
-  int precision=64;
-  int degree=10;
-  AlgRemez remez(lo,hi,precision);
-  remez.generateApprox(degree,1,2);
-  MultiShiftFunction Sqrt(remez,1.0e-6,false);
-  std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
-
-  ////////////////////////////////////////////
-  // Setup staggered
-  ////////////////////////////////////////////
-  RealD mass=0.003;
-  RealD c1=9.0/8.0;
-  RealD c2=-1.0/24.0;
-  RealD u0=1.0;
-
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
-  SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
-
-  FermionField src(&Grid); random(pRNG,src);
-  FermionField src_o(&RBGrid);
-  pickCheckerboard(Odd,src_o,src);
-
-  /////////////////////////////////
-  //Multishift CG
-  /////////////////////////////////
-  std::vector<FermionField> result(degree,&RBGrid);
-  ConjugateGradientMultiShift<FermionField> MSCG(10000,Sqrt);
-
-  double deodoe_flops=(1205+15*degree)*volume; // == 66*16 +  == 1146
-
-  double t1=usecond();
-  MSCG(HermOpEO,src_o,result);
-  double t2=usecond();
-  double ncall=MSCG.IterationsToComplete;
-  double flops = deodoe_flops * ncall;
-  std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
-  std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
-  //  HermOpEO.Report();
-
-  Grid_finalize();
-}
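The deleted test exercised the rational-approximation machinery (its legal header still names Test_wilson_cg_unprec.cc, a leftover of copied boilerplate). AlgRemez(lo,hi,precision) with generateApprox(degree,1,2) builds a degree-10 rational approximation to x^(1/2) on [0.001,1], i.e. x^(1/2) ~ c0 + sum_k r_k/(x + p_k); MultiShiftFunction packages the poles p_k and residues r_k, and ConjugateGradientMultiShift solves all of the shifted systems (MdagM + p_k) psi_k = b in a single Krylov sequence. Combining the shifted solutions then applies the approximated function — a sketch, under the assumption that MultiShiftFunction exposes norm, poles and residues as in Grid headers of that era:

// Sketch: apply the rational approximation of x^(1/2) to src_o, given the
// multishift solutions result[k] of (MdagM + Sqrt.poles[k]) psi_k = src_o.
FermionField x_half(&RBGrid);
x_half = Sqrt.norm * src_o;                       // constant term c0
for(int k=0;k<degree;k++)
  x_half = x_half + Sqrt.residues[k]*result[k];   // + sum_k r_k (MdagM+p_k)^{-1} src_o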