mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-15 02:05:37 +00:00
Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering
This commit is contained in:
commit
0cd6b1858c
@ -41,7 +41,7 @@ int main (int argc, char ** argv)
|
|||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
std::vector<int> latt4 = GridDefaultLatt();
|
std::vector<int> latt4 = GridDefaultLatt();
|
||||||
const int Ls=8;
|
const int Ls=16;
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
@ -70,7 +70,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
{
|
{
|
||||||
const int ncall=100;
|
const int ncall=1000;
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||||
@ -81,18 +81,7 @@ int main (int argc, char ** argv)
|
|||||||
LatticeFermion result(FGrid);
|
LatticeFermion result(FGrid);
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
double t0,t1;
|
double t0,t1;
|
||||||
t0=usecond();
|
|
||||||
for(int i=0;i<ncall;i++){
|
|
||||||
Dw.Dhop(src,result,0);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
LatticeFermion r_eo(FGrid);
|
LatticeFermion r_eo(FGrid);
|
||||||
LatticeFermion src_e (FrbGrid);
|
LatticeFermion src_e (FrbGrid);
|
||||||
@ -109,48 +98,46 @@ int main (int argc, char ** argv)
|
|||||||
r_e = zero;
|
r_e = zero;
|
||||||
r_o = zero;
|
r_o = zero;
|
||||||
|
|
||||||
FGrid->Barrier();
|
|
||||||
t0=usecond();
|
|
||||||
for (int i = 0; i < ncall; i++) {
|
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
FGrid->Barrier();
|
#define BENCH_DW(A,in,out) \
|
||||||
t0=usecond();
|
Dw.CayleyZeroCounters(); \
|
||||||
for (int i = 0; i < ncall; i++) {
|
Dw. A (in,out); \
|
||||||
Dw.Mooee(src_o, r_o);
|
FGrid->Barrier(); \
|
||||||
}
|
t0=usecond(); \
|
||||||
t1=usecond();
|
for(int i=0;i<ncall;i++){ \
|
||||||
FGrid->Barrier();
|
Dw. A (in,out); \
|
||||||
std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
|
} \
|
||||||
|
t1=usecond(); \
|
||||||
|
FGrid->Barrier(); \
|
||||||
|
Dw.CayleyReport(); \
|
||||||
|
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
|
||||||
|
std::cout<<GridLogMessage << "******************"<<std::endl;
|
||||||
|
|
||||||
FGrid->Barrier();
|
#define BENCH_DW_MEO(A,in,out) \
|
||||||
t0=usecond();
|
Dw.CayleyZeroCounters(); \
|
||||||
for (int i = 0; i < ncall; i++) {
|
Dw. A (in,out,0); \
|
||||||
Dw.MooeeInv(src_o, r_o);
|
FGrid->Barrier(); \
|
||||||
}
|
t0=usecond(); \
|
||||||
t1=usecond();
|
for(int i=0;i<ncall;i++){ \
|
||||||
FGrid->Barrier();
|
Dw. A (in,out,0); \
|
||||||
std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
|
} \
|
||||||
|
t1=usecond(); \
|
||||||
|
FGrid->Barrier(); \
|
||||||
|
Dw.CayleyReport(); \
|
||||||
|
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
|
||||||
|
std::cout<<GridLogMessage << "******************"<<std::endl;
|
||||||
|
|
||||||
|
BENCH_DW_MEO(Dhop ,src,result);
|
||||||
FGrid->Barrier();
|
BENCH_DW_MEO(DhopEO ,src_o,r_e);
|
||||||
t0=usecond();
|
BENCH_DW(Meooe ,src_o,r_e);
|
||||||
for (int i = 0; i < ncall; i++) {
|
BENCH_DW(Mooee ,src_o,r_o);
|
||||||
Dw.Meooe(src_o, r_e);
|
BENCH_DW(MooeeInv,src_o,r_o);
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
{
|
{
|
||||||
const int ncall=100;
|
const int ncall=1000;
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
|
||||||
@ -168,14 +155,6 @@ int main (int argc, char ** argv)
|
|||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
|
|
||||||
double t0,t1;
|
double t0,t1;
|
||||||
t0=usecond();
|
|
||||||
for(int i=0;i<ncall;i++){
|
|
||||||
Dw.Dhop(src,result,0);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
LatticeFermion r_eo(sFGrid);
|
LatticeFermion r_eo(sFGrid);
|
||||||
LatticeFermion src_e (sFrbGrid);
|
LatticeFermion src_e (sFrbGrid);
|
||||||
@ -192,46 +171,13 @@ int main (int argc, char ** argv)
|
|||||||
r_e = zero;
|
r_e = zero;
|
||||||
r_o = zero;
|
r_o = zero;
|
||||||
|
|
||||||
FGrid->Barrier();
|
BENCH_DW_MEO(Dhop ,src,result);
|
||||||
t0=usecond();
|
BENCH_DW_MEO(DhopEO ,src_o,r_e);
|
||||||
for (int i = 0; i < ncall; i++) {
|
BENCH_DW(Meooe ,src_o,r_e);
|
||||||
Dw.DhopEO(src_o, r_e, DaggerNo);
|
BENCH_DW(Mooee ,src_o,r_o);
|
||||||
}
|
BENCH_DW(MooeeInv,src_o,r_o);
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
FGrid->Barrier();
|
|
||||||
t0=usecond();
|
|
||||||
for (int i = 0; i < ncall; i++) {
|
|
||||||
Dw.Mooee(src_o, r_o);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
FGrid->Barrier();
|
|
||||||
t0=usecond();
|
|
||||||
for (int i = 0; i < ncall; i++) {
|
|
||||||
Dw.MooeeInv(src_o, r_o);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
FGrid->Barrier();
|
|
||||||
t0=usecond();
|
|
||||||
for (int i = 0; i < ncall; i++) {
|
|
||||||
Dw.Meooe(src_o, r_e);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
|
||||||
FGrid->Barrier();
|
|
||||||
std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
|
|||||||
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
|
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
||||||
|
{
|
||||||
|
this->Report();
|
||||||
|
std::vector<int> latt = GridDefaultLatt();
|
||||||
|
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
RealD NP = this->_FourDimGrid->_Nprocessors;
|
||||||
|
if ( M5Dcalls > 0 ) {
|
||||||
|
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
|
||||||
|
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( MooeeInvCalls > 0 ) {
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
||||||
|
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Flops = 9*12*Ls*vol/2
|
||||||
|
RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
|
||||||
|
{
|
||||||
|
this->ZeroCounters();
|
||||||
|
M5Dflops=0;
|
||||||
|
M5Dcalls=0;
|
||||||
|
M5Dtime=0;
|
||||||
|
MooeeInvFlops=0;
|
||||||
|
MooeeInvCalls=0;
|
||||||
|
MooeeInvTime=0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
|
@ -121,6 +121,18 @@ namespace Grid {
|
|||||||
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
|
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void CayleyReport(void);
|
||||||
|
void CayleyZeroCounters(void);
|
||||||
|
|
||||||
|
double M5Dflops;
|
||||||
|
double M5Dcalls;
|
||||||
|
double M5Dtime;
|
||||||
|
|
||||||
|
double MooeeInvFlops;
|
||||||
|
double MooeeInvCalls;
|
||||||
|
double MooeeInvTime;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
|
@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
GridBase *grid=psi._grid;
|
GridBase *grid=psi._grid;
|
||||||
assert(phi.checkerboard == psi.checkerboard);
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
M5Dcalls++;
|
||||||
|
M5Dtime-=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
assert(phi.checkerboard == psi.checkerboard);
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
|
||||||
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
M5Dcalls++;
|
||||||
|
M5Dtime-=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
auto tmp = psi._odata[0];
|
auto tmp = psi._odata[0];
|
||||||
@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
|
|||||||
|
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
|
||||||
|
MooeeInvCalls++;
|
||||||
|
MooeeInvTime-=usecond();
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
auto tmp = psi._odata[0];
|
auto tmp = psi._odata[0];
|
||||||
|
|
||||||
|
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
||||||
// Apply (L^{\prime})^{-1}
|
// Apply (L^{\prime})^{-1}
|
||||||
chi[ss]=psi[ss]; // chi[0]=psi[0]
|
chi[ss]=psi[ss]; // chi[0]=psi[0]
|
||||||
for(int s=1;s<Ls;s++){
|
for(int s=1;s<Ls;s++){
|
||||||
@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
|
|||||||
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MooeeInvTime+=usecond();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
|
|||||||
assert(psi.checkerboard == psi.checkerboard);
|
assert(psi.checkerboard == psi.checkerboard);
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
|
||||||
|
MooeeInvCalls++;
|
||||||
|
MooeeInvTime-=usecond();
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
|
|||||||
chi[ss+s] = chi[ss+s] - lee[s]*tmp;
|
chi[ss+s] = chi[ss+s] - lee[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MooeeInvTime+=usecond();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CAYLEY_DPERP_CACHE
|
#ifdef CAYLEY_DPERP_CACHE
|
||||||
|
@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
GridBase *grid=psi._grid;
|
GridBase *grid=psi._grid;
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
int nsimd= Simd::Nsimd();
|
const int nsimd= Simd::Nsimd();
|
||||||
|
|
||||||
Vector<iSinglet<Simd> > u(LLs);
|
Vector<iSinglet<Simd> > u(LLs);
|
||||||
Vector<iSinglet<Simd> > l(LLs);
|
Vector<iSinglet<Simd> > l(LLs);
|
||||||
@ -86,9 +86,15 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
d_p[ss] = diag[s];
|
d_p[ss] = diag[s];
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
|
||||||
|
M5Dcalls++;
|
||||||
|
M5Dtime-=usecond();
|
||||||
|
|
||||||
|
assert(Nc==3);
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||||
|
#if 0
|
||||||
alignas(64) SiteHalfSpinor hp;
|
alignas(64) SiteHalfSpinor hp;
|
||||||
alignas(64) SiteHalfSpinor hm;
|
alignas(64) SiteHalfSpinor hm;
|
||||||
alignas(64) SiteSpinor fp;
|
alignas(64) SiteSpinor fp;
|
||||||
@ -105,16 +111,113 @@ PARALLEL_FOR_LOOP
|
|||||||
if ( vp<=v ) rotate(hp,hp,1);
|
if ( vp<=v ) rotate(hp,hp,1);
|
||||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||||
|
|
||||||
hp=hp*0.5;
|
hp=0.5*hp;
|
||||||
hm=hm*0.5;
|
hm=0.5*hm;
|
||||||
|
|
||||||
spRecon5m(fp,hp);
|
spRecon5m(fp,hp);
|
||||||
spRecon5p(fm,hm);
|
spRecon5p(fm,hm);
|
||||||
|
|
||||||
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
chi[ss+v] = d[v]*phi[ss+v];
|
||||||
|
chi[ss+v] = chi[ss+v] +u[v]*fp;
|
||||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
for(int v=0;v<LLs;v++){
|
||||||
|
|
||||||
|
vprefetch(psi[ss+v+LLs]);
|
||||||
|
// vprefetch(phi[ss+v+LLs]);
|
||||||
|
|
||||||
|
int vp= (v==LLs-1) ? 0 : v+1;
|
||||||
|
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||||
|
|
||||||
|
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||||
|
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||||
|
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||||
|
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||||
|
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||||
|
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||||
|
|
||||||
|
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||||
|
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||||
|
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||||
|
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||||
|
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||||
|
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||||
|
|
||||||
|
// if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
|
||||||
|
// if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
|
||||||
|
|
||||||
|
if ( vp<=v ) {
|
||||||
|
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||||
|
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||||
|
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||||
|
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||||
|
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||||
|
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||||
}
|
}
|
||||||
|
if ( vm>=v ) {
|
||||||
|
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||||
|
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||||
|
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||||
|
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||||
|
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||||
|
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
|
||||||
|
if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
|
||||||
|
if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
|
||||||
|
if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
|
||||||
|
*/
|
||||||
|
Simd p_00 = d[v]()()() * phi[ss+v]()(0)(0) + l[v]()()()*hm_00;
|
||||||
|
Simd p_01 = d[v]()()() * phi[ss+v]()(0)(1) + l[v]()()()*hm_01;
|
||||||
|
Simd p_02 = d[v]()()() * phi[ss+v]()(0)(2) + l[v]()()()*hm_02;
|
||||||
|
Simd p_10 = d[v]()()() * phi[ss+v]()(1)(0) + l[v]()()()*hm_10;
|
||||||
|
Simd p_11 = d[v]()()() * phi[ss+v]()(1)(1) + l[v]()()()*hm_11;
|
||||||
|
Simd p_12 = d[v]()()() * phi[ss+v]()(1)(2) + l[v]()()()*hm_12;
|
||||||
|
Simd p_20 = d[v]()()() * phi[ss+v]()(2)(0) + u[v]()()()*hp_00;
|
||||||
|
Simd p_21 = d[v]()()() * phi[ss+v]()(2)(1) + u[v]()()()*hp_01;
|
||||||
|
Simd p_22 = d[v]()()() * phi[ss+v]()(2)(2) + u[v]()()()*hp_02;
|
||||||
|
Simd p_30 = d[v]()()() * phi[ss+v]()(3)(0) + u[v]()()()*hp_10;
|
||||||
|
Simd p_31 = d[v]()()() * phi[ss+v]()(3)(1) + u[v]()()()*hp_11;
|
||||||
|
Simd p_32 = d[v]()()() * phi[ss+v]()(3)(2) + u[v]()()()*hp_12;
|
||||||
|
|
||||||
|
|
||||||
|
// if ( ss==0){
|
||||||
|
/*
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
|
||||||
|
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
vstream(chi[ss+v]()(0)(0),p_00);
|
||||||
|
vstream(chi[ss+v]()(0)(1),p_01);
|
||||||
|
vstream(chi[ss+v]()(0)(2),p_02);
|
||||||
|
vstream(chi[ss+v]()(1)(0),p_10);
|
||||||
|
vstream(chi[ss+v]()(1)(1),p_11);
|
||||||
|
vstream(chi[ss+v]()(1)(2),p_12);
|
||||||
|
vstream(chi[ss+v]()(2)(0),p_20);
|
||||||
|
vstream(chi[ss+v]()(2)(1),p_21);
|
||||||
|
vstream(chi[ss+v]()(2)(2),p_22);
|
||||||
|
vstream(chi[ss+v]()(3)(0),p_30);
|
||||||
|
vstream(chi[ss+v]()(3)(1),p_31);
|
||||||
|
vstream(chi[ss+v]()(3)(2),p_32);
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -154,6 +257,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
d_p[ss] = diag[s];
|
d_p[ss] = diag[s];
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
M5Dcalls++;
|
||||||
|
M5Dtime-=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||||
|
|
||||||
@ -183,8 +288,8 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
M5Dtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
||||||
{
|
{
|
||||||
@ -250,13 +355,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MooeeInvCalls++;
|
||||||
|
MooeeInvTime-=usecond();
|
||||||
// Dynamic allocate on stack to get per thread without serialised heap acces
|
// Dynamic allocate on stack to get per thread without serialised heap acces
|
||||||
PARALLEL_FOR_LOOP
|
#pragma omp parallel
|
||||||
for(auto site=0;site<vol;site++){
|
{
|
||||||
|
|
||||||
// SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
|
|
||||||
// SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
|
|
||||||
// SiteSpinor *SiteChi =(SiteSpinor *) alloca(LLs*sizeof(SiteSpinor));
|
|
||||||
|
|
||||||
Vector<SiteHalfSpinor> SitePplus(LLs);
|
Vector<SiteHalfSpinor> SitePplus(LLs);
|
||||||
Vector<SiteHalfSpinor> SitePminus(LLs);
|
Vector<SiteHalfSpinor> SitePminus(LLs);
|
||||||
@ -267,6 +370,9 @@ PARALLEL_FOR_LOOP
|
|||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
SiteHalfSpinor BcastM;
|
SiteHalfSpinor BcastM;
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
|
for(auto site=0;site<vol;site++){
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
int lex = s+LLs*site;
|
int lex = s+LLs*site;
|
||||||
spProj5p(SitePplus[s] ,psi[lex]);
|
spProj5p(SitePplus[s] ,psi[lex]);
|
||||||
@ -295,6 +401,8 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
MooeeInvTime+=usecond();
|
||||||
|
}
|
||||||
|
|
||||||
INSTANTIATE_DPERP(DomainWallVec5dImplD);
|
INSTANTIATE_DPERP(DomainWallVec5dImplD);
|
||||||
INSTANTIATE_DPERP(DomainWallVec5dImplF);
|
INSTANTIATE_DPERP(DomainWallVec5dImplF);
|
||||||
|
@ -514,7 +514,7 @@ namespace Optimization {
|
|||||||
template<int n>
|
template<int n>
|
||||||
static inline __m256 tRotate(__m256 in){
|
static inline __m256 tRotate(__m256 in){
|
||||||
__m256 tmp = Permute::Permute0(in);
|
__m256 tmp = Permute::Permute0(in);
|
||||||
__m256 ret;
|
__m256 ret = in;
|
||||||
if ( n > 3 ) {
|
if ( n > 3 ) {
|
||||||
_mm256_alignr_epi32_grid(ret,in,tmp,n);
|
_mm256_alignr_epi32_grid(ret,in,tmp,n);
|
||||||
} else {
|
} else {
|
||||||
@ -526,7 +526,7 @@ namespace Optimization {
|
|||||||
template<int n>
|
template<int n>
|
||||||
static inline __m256d tRotate(__m256d in){
|
static inline __m256d tRotate(__m256d in){
|
||||||
__m256d tmp = Permute::Permute0(in);
|
__m256d tmp = Permute::Permute0(in);
|
||||||
__m256d ret;
|
__m256d ret = in;
|
||||||
if ( n > 1 ) {
|
if ( n > 1 ) {
|
||||||
_mm256_alignr_epi64_grid(ret,in,tmp,n);
|
_mm256_alignr_epi64_grid(ret,in,tmp,n);
|
||||||
} else {
|
} else {
|
||||||
|
@ -86,13 +86,13 @@ namespace Optimization {
|
|||||||
struct Vstream{
|
struct Vstream{
|
||||||
//Float
|
//Float
|
||||||
inline void operator()(float * a, __m512 b){
|
inline void operator()(float * a, __m512 b){
|
||||||
//_mm512_stream_ps(a,b);
|
_mm512_stream_ps(a,b);
|
||||||
_mm512_store_ps(a,b);
|
// _mm512_store_ps(a,b);
|
||||||
}
|
}
|
||||||
//Double
|
//Double
|
||||||
inline void operator()(double * a, __m512d b){
|
inline void operator()(double * a, __m512d b){
|
||||||
//_mm512_stream_pd(a,b);
|
_mm512_stream_pd(a,b);
|
||||||
_mm512_store_pd(a,b);
|
// _mm512_store_pd(a,b);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -130,7 +130,7 @@ class Grid_simd {
|
|||||||
|
|
||||||
Vector_type v;
|
Vector_type v;
|
||||||
|
|
||||||
static inline int Nsimd(void) {
|
static inline constexpr int Nsimd(void) {
|
||||||
return sizeof(Vector_type) / sizeof(Scalar_type);
|
return sizeof(Vector_type) / sizeof(Scalar_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user