mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Merge branch 'develop' into temporary-smearing
This commit is contained in:
commit
fdfbf11c6d
1
.gitignore
vendored
1
.gitignore
vendored
@ -62,6 +62,7 @@ stamp-h1
|
|||||||
config.sub
|
config.sub
|
||||||
config.guess
|
config.guess
|
||||||
INSTALL
|
INSTALL
|
||||||
|
.dirstamp
|
||||||
|
|
||||||
# Packages #
|
# Packages #
|
||||||
############
|
############
|
||||||
|
@ -88,3 +88,7 @@ script:
|
|||||||
- ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
|
- ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
|
||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1
|
- ./benchmarks/Benchmark_dwf --threads 1
|
||||||
|
- make clean
|
||||||
|
- ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
|
||||||
|
- make -j4
|
||||||
|
- ./benchmarks/Benchmark_dwf --threads 1
|
||||||
|
4
VERSION
Normal file
4
VERSION
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
Version : 0.5.0
|
||||||
|
|
||||||
|
- AVX512, AVX2, AVX, SSE good
|
||||||
|
- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
|
@ -165,11 +165,11 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
{
|
{
|
||||||
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
|
typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
|
||||||
LatticeFermionF ssrc(sFGrid);
|
LatticeFermion ssrc(sFGrid);
|
||||||
LatticeFermionF sref(sFGrid);
|
LatticeFermion sref(sFGrid);
|
||||||
LatticeFermionF sresult(sFGrid);
|
LatticeFermion sresult(sFGrid);
|
||||||
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
|
WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
|
||||||
|
|
||||||
for(int x=0;x<latt4[0];x++){
|
for(int x=0;x<latt4[0];x++){
|
||||||
for(int y=0;y<latt4[1];y++){
|
for(int y=0;y<latt4[1];y++){
|
||||||
@ -177,7 +177,7 @@ int main (int argc, char ** argv)
|
|||||||
for(int t=0;t<latt4[3];t++){
|
for(int t=0;t<latt4[3];t++){
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
std::vector<int> site({s,x,y,z,t});
|
std::vector<int> site({s,x,y,z,t});
|
||||||
SpinColourVectorF tmp;
|
SpinColourVector tmp;
|
||||||
peekSite(tmp,src,site);
|
peekSite(tmp,src,site);
|
||||||
pokeSite(tmp,ssrc,site);
|
pokeSite(tmp,ssrc,site);
|
||||||
}}}}}
|
}}}}}
|
||||||
@ -217,7 +217,7 @@ int main (int argc, char ** argv)
|
|||||||
for(int t=0;t<latt4[3];t++){
|
for(int t=0;t<latt4[3];t++){
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
std::vector<int> site({s,x,y,z,t});
|
std::vector<int> site({s,x,y,z,t});
|
||||||
SpinColourVectorF normal, simd;
|
SpinColourVector normal, simd;
|
||||||
peekSite(normal,result,site);
|
peekSite(normal,result,site);
|
||||||
peekSite(simd,sresult,site);
|
peekSite(simd,sresult,site);
|
||||||
sum=sum+norm2(normal-simd);
|
sum=sum+norm2(normal-simd);
|
||||||
@ -230,8 +230,8 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
if (1) {
|
if (1) {
|
||||||
|
|
||||||
LatticeFermionF sr_eo(sFGrid);
|
LatticeFermion sr_eo(sFGrid);
|
||||||
LatticeFermionF serr(sFGrid);
|
LatticeFermion serr(sFGrid);
|
||||||
|
|
||||||
LatticeFermion ssrc_e (sFrbGrid);
|
LatticeFermion ssrc_e (sFrbGrid);
|
||||||
LatticeFermion ssrc_o (sFrbGrid);
|
LatticeFermion ssrc_o (sFrbGrid);
|
||||||
|
369
benchmarks/Benchmark_dwf_sweep.cc
Normal file
369
benchmarks/Benchmark_dwf_sweep.cc
Normal file
@ -0,0 +1,369 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./benchmarks/Benchmark_dwf_sweep.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid.h>
|
||||||
|
#include <PerfCount.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
// Thin wrapper that stores exactly one value of type d.
template<typename d>
struct scal {
  d internal;   // the wrapped value
};
|
||||||
|
|
||||||
|
// Gamma matrix selector for each of the four directions, in the order X, Y, Z, T.
// Used by the reference Wilson operator in benchDw (indexed by mu).
Gamma::GammaMatrix Gmu [] = { Gamma::GammaX, Gamma::GammaY, Gamma::GammaZ, Gamma::GammaT };
|
||||||
|
|
||||||
|
void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
|
||||||
|
void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
const int Ls=16;
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
|
if ( getenv("ASMOPT") ) {
|
||||||
|
QCD::WilsonKernelsStatic::AsmOpt=1;
|
||||||
|
} else {
|
||||||
|
QCD::WilsonKernelsStatic::AsmOpt=0;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
|
||||||
|
int Lmax=32;
|
||||||
|
int dmin=0;
|
||||||
|
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
|
||||||
|
if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
|
||||||
|
for (int L=8;L<=Lmax;L*=2){
|
||||||
|
std::vector<int> latt4(4,L);
|
||||||
|
for(int d=4;d>dmin;d--){
|
||||||
|
if ( d<=3 ) latt4[d]*=2;
|
||||||
|
std::cout << GridLogMessage <<"\t";
|
||||||
|
for(int d=0;d<Nd;d++){
|
||||||
|
std::cout<<latt4[d]<<"x";
|
||||||
|
}
|
||||||
|
std::cout <<Ls<<"\t" ;
|
||||||
|
benchDw (latt4,Ls,threads,0);
|
||||||
|
benchsDw(latt4,Ls,threads,0);
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
|
||||||
|
{
|
||||||
|
std::vector<int> latt4(4,16);
|
||||||
|
std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
|
||||||
|
benchDw (latt4,Ls,threads,1);
|
||||||
|
std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
|
||||||
|
benchsDw(latt4,Ls,threads,1);
|
||||||
|
}
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef CHECK
|
||||||
|
|
||||||
|
void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
|
||||||
|
{
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
|
||||||
|
#ifdef CHECK
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
LatticeFermion src (FGrid); random(RNG5,src);
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
random(RNG4,Umu);
|
||||||
|
#else
|
||||||
|
LatticeFermion src (FGrid); src=zero;
|
||||||
|
LatticeGaugeField Umu(UGrid); Umu=zero;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LatticeFermion result(FGrid); result=zero;
|
||||||
|
LatticeFermion ref(FGrid); ref=zero;
|
||||||
|
LatticeFermion tmp(FGrid);
|
||||||
|
LatticeFermion err(FGrid);
|
||||||
|
|
||||||
|
ColourMatrix cm = Complex(1.0,0.0);
|
||||||
|
|
||||||
|
|
||||||
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
|
|
||||||
|
// replicate across fifth dimension
|
||||||
|
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Naive wilson implementation
|
||||||
|
////////////////////////////////////
|
||||||
|
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CHECK
|
||||||
|
if (1)
|
||||||
|
{
|
||||||
|
ref = zero;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
|
||||||
|
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||||
|
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||||
|
|
||||||
|
tmp =adj(U[mu])*src;
|
||||||
|
tmp =Cshift(tmp,mu+1,-1);
|
||||||
|
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||||
|
}
|
||||||
|
ref = -0.5*ref;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
RealD NP = UGrid->_Nprocessors;
|
||||||
|
|
||||||
|
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
|
double t0=usecond();
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
#ifdef TIMERS_OFF
|
||||||
|
int ncall =10;
|
||||||
|
#else
|
||||||
|
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (ncall < 5 ) exit(0);
|
||||||
|
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
|
||||||
|
PerformanceCounter Counter(8);
|
||||||
|
Counter.Start();
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Dw.Dhop(src,result,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
Counter.Stop();
|
||||||
|
if ( report ) {
|
||||||
|
Counter.Report();
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! report )
|
||||||
|
{
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=1344*volume*ncall;
|
||||||
|
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CHECK
|
||||||
|
err = ref-result;
|
||||||
|
RealD errd = norm2(err);
|
||||||
|
if ( errd> 1.0e-4 ) {
|
||||||
|
std::cout<<GridLogMessage << "oops !!! norm diff "<< norm2(err)<<std::endl;
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LatticeFermion src_e (FrbGrid);
|
||||||
|
LatticeFermion src_o (FrbGrid);
|
||||||
|
LatticeFermion r_e (FrbGrid);
|
||||||
|
LatticeFermion r_o (FrbGrid);
|
||||||
|
LatticeFermion r_eo (FGrid);
|
||||||
|
|
||||||
|
pickCheckerboard(Even,src_e,src);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
{
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
double t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
|
}
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
if(!report){
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1344.0*volume*ncall)/2;
|
||||||
|
std::cout<< flops/(t1-t0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef CHECK_SDW
|
||||||
|
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
|
||||||
|
{
|
||||||
|
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
|
||||||
|
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
|
||||||
|
#ifdef CHECK_SDW
|
||||||
|
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||||
|
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||||
|
LatticeFermion src (FGrid); random(RNG5,src);
|
||||||
|
LatticeGaugeField Umu(UGrid);
|
||||||
|
random(RNG4,Umu);
|
||||||
|
#else
|
||||||
|
LatticeFermion src (FGrid); src=zero;
|
||||||
|
LatticeGaugeField Umu(UGrid); Umu=zero;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LatticeFermion result(FGrid); result=zero;
|
||||||
|
LatticeFermion ref(FGrid); ref=zero;
|
||||||
|
LatticeFermion tmp(FGrid);
|
||||||
|
LatticeFermion err(FGrid);
|
||||||
|
|
||||||
|
ColourMatrix cm = Complex(1.0,0.0);
|
||||||
|
|
||||||
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
|
|
||||||
|
// replicate across fifth dimension
|
||||||
|
for(int ss=0;ss<Umu._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD M5 =1.8;
|
||||||
|
|
||||||
|
typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
|
||||||
|
LatticeFermion ssrc(sFGrid);
|
||||||
|
LatticeFermion sref(sFGrid);
|
||||||
|
LatticeFermion sresult(sFGrid);
|
||||||
|
WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
|
||||||
|
|
||||||
|
for(int x=0;x<latt4[0];x++){
|
||||||
|
for(int y=0;y<latt4[1];y++){
|
||||||
|
for(int z=0;z<latt4[2];z++){
|
||||||
|
for(int t=0;t<latt4[3];t++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
std::vector<int> site({s,x,y,z,t});
|
||||||
|
SpinColourVector tmp;
|
||||||
|
peekSite(tmp,src,site);
|
||||||
|
pokeSite(tmp,ssrc,site);
|
||||||
|
}}}}}
|
||||||
|
|
||||||
|
double t0=usecond();
|
||||||
|
sDw.Dhop(ssrc,sresult,0);
|
||||||
|
double t1=usecond();
|
||||||
|
|
||||||
|
#ifdef TIMERS_OFF
|
||||||
|
int ncall =10;
|
||||||
|
#else
|
||||||
|
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
PerformanceCounter Counter(8);
|
||||||
|
Counter.Start();
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
sDw.Dhop(ssrc,sresult,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
Counter.Stop();
|
||||||
|
|
||||||
|
if ( report ) {
|
||||||
|
Counter.Report();
|
||||||
|
} else {
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=1344*volume*ncall;
|
||||||
|
std::cout<<"\t"<< flops/(t1-t0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
LatticeFermion sr_eo(sFGrid);
|
||||||
|
LatticeFermion serr(sFGrid);
|
||||||
|
|
||||||
|
LatticeFermion ssrc_e (sFrbGrid);
|
||||||
|
LatticeFermion ssrc_o (sFrbGrid);
|
||||||
|
LatticeFermion sr_e (sFrbGrid);
|
||||||
|
LatticeFermion sr_o (sFrbGrid);
|
||||||
|
|
||||||
|
pickCheckerboard(Even,ssrc_e,ssrc);
|
||||||
|
pickCheckerboard(Odd,ssrc_o,ssrc);
|
||||||
|
|
||||||
|
setCheckerboard(sr_eo,ssrc_o);
|
||||||
|
setCheckerboard(sr_eo,ssrc_e);
|
||||||
|
|
||||||
|
sr_e = zero;
|
||||||
|
sr_o = zero;
|
||||||
|
|
||||||
|
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
|
||||||
|
PerformanceCounter CounterSdw(8);
|
||||||
|
CounterSdw.Start();
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
__SSC_START;
|
||||||
|
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
|
||||||
|
__SSC_STOP;
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
CounterSdw.Stop();
|
||||||
|
|
||||||
|
if ( report ) {
|
||||||
|
CounterSdw.Report();
|
||||||
|
} else {
|
||||||
|
|
||||||
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
|
double flops=(1344.0*volume*ncall)/2;
|
||||||
|
std::cout<<"\t"<< flops/(t1-t0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
|
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
|
||||||
|
|
||||||
|
|
||||||
Benchmark_comms_SOURCES=Benchmark_comms.cc
|
Benchmark_comms_SOURCES=Benchmark_comms.cc
|
||||||
@ -14,6 +14,10 @@ Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
|
|||||||
Benchmark_dwf_ntpf_LDADD=-lGrid
|
Benchmark_dwf_ntpf_LDADD=-lGrid
|
||||||
|
|
||||||
|
|
||||||
|
Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
|
||||||
|
Benchmark_dwf_sweep_LDADD=-lGrid
|
||||||
|
|
||||||
|
|
||||||
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
|
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
|
||||||
Benchmark_memory_asynch_LDADD=-lGrid
|
Benchmark_memory_asynch_LDADD=-lGrid
|
||||||
|
|
||||||
|
0
lib/.dirstamp
Normal file
0
lib/.dirstamp
Normal file
@ -262,7 +262,6 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
std::cout <<COL_YELLOW<< std::endl;
|
std::cout <<COL_YELLOW<< std::endl;
|
||||||
std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
|
std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
|
||||||
std::cout << "Colours by Tadahito Boyle "<<std::endl;
|
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
|
std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
|
||||||
std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
|
std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
|
||||||
@ -274,6 +273,7 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
|
std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
|
||||||
std::cout << "GNU General Public License for more details."<<std::endl;
|
std::cout << "GNU General Public License for more details."<<std::endl;
|
||||||
std::cout << COL_BACKGROUND <<std::endl;
|
std::cout << COL_BACKGROUND <<std::endl;
|
||||||
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
@ -261,6 +261,9 @@
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Placeholder for prefetching stencil entry `ent`; the prefetch itself is
// currently disabled. Always returns 0 so that callers using the result get a
// defined value (previously control fell off the end of a non-void function).
inline uint64_t Touch(int ent) {
  (void)ent;   // only referenced by the disabled prefetch below
  //  _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
  return 0;
}
|
||||||
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
||||||
_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
|
_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
|
||||||
local = _entries[ent]._is_local;
|
local = _entries[ent]._is_local;
|
||||||
@ -269,6 +272,11 @@
|
|||||||
if (local) return base + _entries[ent]._byte_offset;
|
if (local) return base + _entries[ent]._byte_offset;
|
||||||
else return _entries[ent]._byte_offset;
|
else return _entries[ent]._byte_offset;
|
||||||
}
|
}
|
||||||
|
inline uint64_t GetPFInfo(int ent,uint64_t base) {
|
||||||
|
int local = _entries[ent]._is_local;
|
||||||
|
if (local) return base + _entries[ent]._byte_offset;
|
||||||
|
else return _entries[ent]._byte_offset;
|
||||||
|
}
|
||||||
|
|
||||||
// Comms buffers
|
// Comms buffers
|
||||||
std::vector<Vector<scalar_object> > u_simd_send_buf;
|
std::vector<Vector<scalar_object> > u_simd_send_buf;
|
||||||
|
0
lib/algorithms/approx/.dirstamp
Normal file
0
lib/algorithms/approx/.dirstamp
Normal file
0
lib/communicator/.dirstamp
Normal file
0
lib/communicator/.dirstamp
Normal file
0
lib/qcd/action/fermion/.dirstamp
Normal file
0
lib/qcd/action/fermion/.dirstamp
Normal file
@ -63,7 +63,7 @@ namespace Grid {
|
|||||||
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
||||||
assert(zdata->n==this->Ls);
|
assert(zdata->n==this->Ls);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
|
// std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
|
||||||
// Call base setter
|
// Call base setter
|
||||||
this->SetCoefficientsTanh(zdata,1.0,0.0);
|
this->SetCoefficientsTanh(zdata,1.0,0.0);
|
||||||
|
|
||||||
|
@ -53,6 +53,8 @@ namespace QCD {
|
|||||||
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
|
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
|
||||||
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
|
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
|
Lebesgue(_grid),
|
||||||
|
LebesgueEvenOdd(_cbgrid),
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd (&Hgrid)
|
UmuOdd (&Hgrid)
|
||||||
@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
out.checkerboard = in.checkerboard;
|
out.checkerboard = in.checkerboard;
|
||||||
|
|
||||||
DhopInternal(Stencil,Umu,in,out,dag);
|
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
|
|||||||
assert(in.checkerboard==Even);
|
assert(in.checkerboard==Even);
|
||||||
out.checkerboard = Odd;
|
out.checkerboard = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven,UmuOdd,in,out,dag);
|
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
|
|||||||
assert(in.checkerboard==Odd);
|
assert(in.checkerboard==Odd);
|
||||||
out.checkerboard = Even;
|
out.checkerboard = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
|
|||||||
if ( dag == DaggerYes ) {
|
if ( dag == DaggerYes ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
|
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -111,7 +111,7 @@ namespace Grid {
|
|||||||
const FermionField &B,
|
const FermionField &B,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag) ;
|
const FermionField &in, FermionField &out,int dag) ;
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
@ -146,6 +146,10 @@ namespace Grid {
|
|||||||
DoubledGaugeField Umu;
|
DoubledGaugeField Umu;
|
||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
|
|||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=LLs*sU;
|
int sF=LLs*sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF=LLs*sU;
|
int sF=LLs*sU;
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -38,20 +38,23 @@ template<class Impl>
|
|||||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
|
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
|
#ifdef AVX512
|
||||||
if ( AsmOpt ) {
|
if ( AsmOpt ) {
|
||||||
|
|
||||||
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
|
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
#endif
|
||||||
for(int site=0;site<Ns;site++) {
|
for(int site=0;site<Ns;site++) {
|
||||||
for(int s=0;s<Ls;s++) {
|
for(int s=0;s<Ls;s++) {
|
||||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
|
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
|
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
@ -61,17 +64,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
// No asm implementation yet.
|
// No asm implementation yet.
|
||||||
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
|
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
// else
|
// else
|
||||||
for(int site=0;site<Ns;site++) {
|
for(int site=0;site<Ns;site++) {
|
||||||
for(int s=0;s<Ls;s++) {
|
for(int s=0;s<Ls;s++) {
|
||||||
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
|
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
|
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
@ -84,7 +87,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
|
|||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -262,7 +265,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
|
|||||||
|
|
||||||
// Need controls to do interior, exterior, or both
|
// Need controls to do interior, exterior, or both
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
|
@ -53,11 +53,11 @@ namespace Grid {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
@ -67,24 +67,24 @@ namespace Grid {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Specialised variants
|
// Specialised variants
|
||||||
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU, const FermionField &in, FermionField &out);
|
int sF,int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in,FermionField &out);
|
int sF,int sU,const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
public:
|
public:
|
||||||
|
@ -39,9 +39,9 @@ namespace QCD {
|
|||||||
// Default to no assembler implementation
|
// Default to no assembler implementation
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -67,49 +67,54 @@ int setupSigns(void ){
|
|||||||
}
|
}
|
||||||
static int signInit = setupSigns();
|
static int signInit = setupSigns();
|
||||||
|
|
||||||
|
#define label(A) ilabel(A)
|
||||||
|
#define ilabel(A) ".globl\n" #A ":\n"
|
||||||
|
|
||||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
|
#define FX(A) WILSONASM_ ##A
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#undef VMOVIDUP
|
#undef VMOVIDUP
|
||||||
#undef VMOVRDUP
|
#undef VMOVRDUP
|
||||||
#undef MAYBEPERM
|
#undef MAYBEPERM
|
||||||
#undef MULT_2SPIN
|
#undef MULT_2SPIN
|
||||||
|
#undef FX
|
||||||
|
#define FX(A) DWFASM_ ## A
|
||||||
#define MAYBEPERM(A,B)
|
#define MAYBEPERM(A,B)
|
||||||
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||||
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
@ -1,35 +1,44 @@
|
|||||||
{
|
{
|
||||||
int locala,perma, ptypea;
|
int local,perm, ptype;
|
||||||
int localb,permb, ptypeb;
|
uint64_t base;
|
||||||
uint64_t basea, baseb;
|
uint64_t basep;
|
||||||
uint64_t basex;
|
|
||||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
// vComplexF isigns[2] = { signs[0], signs[1] };
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
vComplexF *isigns = &signs[0];
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
MASK_REGS;
|
MASK_REGS;
|
||||||
|
int nmax=U._grid->oSites();
|
||||||
for(int site=0;site<Ns;site++) {
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU =lo.Reorder(ssU);
|
||||||
|
int ssn=ssU+1;
|
||||||
|
if(ssn>=nmax) ssn=0;
|
||||||
|
int sUn=lo.Reorder(ssn);
|
||||||
for(int s=0;s<Ls;s++) {
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss =sU*Ls+s;
|
||||||
|
ssn=sUn*Ls+s;
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Xp
|
// Xp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
int ent=ss*8;// 2*Ndim
|
int ent=ss*8;// 2*Ndim
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
int nent=ssn*8;
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
|
||||||
basex = basea;
|
|
||||||
|
|
||||||
if ( locala ) {
|
PF_GAUGE(Xp);
|
||||||
|
base = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH1_CHIMU(base);
|
||||||
|
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns);
|
LOAD64(%r10,isigns);
|
||||||
XM_PROJMEM(basea);
|
XM_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
MAYBEPERM(PERMUTE_DIR3,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(basea);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
MULT_2SPIN_DIR_PFXP(Xp,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns);
|
LOAD64(%r10,isigns);
|
||||||
XM_RECON;
|
XM_RECON;
|
||||||
@ -37,16 +46,18 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Yp
|
// Yp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
if ( localb ) {
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
YM_PROJMEM(baseb);
|
YM_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
MAYBEPERM(PERMUTE_DIR2,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(baseb);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
MULT_2SPIN_DIR_PFYP(Yp,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
YM_RECON_ACCUM;
|
YM_RECON_ACCUM;
|
||||||
@ -54,16 +65,18 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Zp
|
// Zp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
if ( locala ) {
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
ZM_PROJMEM(basea);
|
ZM_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
MAYBEPERM(PERMUTE_DIR1,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(basea);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
MULT_2SPIN_DIR_PFZP(Zp,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
ZM_RECON_ACCUM;
|
ZM_RECON_ACCUM;
|
||||||
@ -71,16 +84,18 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Tp
|
// Tp
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
if ( localb ) {
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TM_PROJMEM(baseb);
|
TM_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
MAYBEPERM(PERMUTE_DIR0,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(baseb);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
MULT_2SPIN_DIR_PFTP(Tp,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TM_RECON_ACCUM;
|
TM_RECON_ACCUM;
|
||||||
@ -88,16 +103,19 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Xm
|
// Xm
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
basep= (uint64_t) &out._odata[ss];
|
||||||
if ( locala ) {
|
// basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
XP_PROJMEM(basea);
|
XP_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
MAYBEPERM(PERMUTE_DIR3,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(basea);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
MULT_2SPIN_DIR_PFXM(Xm,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
XP_RECON_ACCUM;
|
XP_RECON_ACCUM;
|
||||||
@ -105,16 +123,18 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Ym
|
// Ym
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
if ( localb ) {
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
YP_PROJMEM(baseb);
|
YP_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
MAYBEPERM(PERMUTE_DIR2,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(baseb);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
MULT_2SPIN_DIR_PFYM(Ym,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
YP_RECON_ACCUM;
|
YP_RECON_ACCUM;
|
||||||
@ -122,16 +142,18 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Zm
|
// Zm
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
if ( locala ) {
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
ZP_PROJMEM(basea);
|
ZP_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
MAYBEPERM(PERMUTE_DIR1,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(basea);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
MULT_2SPIN_DIR_PFZM(Zm,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
ZP_RECON_ACCUM;
|
ZP_RECON_ACCUM;
|
||||||
@ -139,26 +161,26 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Tm
|
// Tm
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
basea = (uint64_t)&out._odata[ss];
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
if ( localb ) {
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TP_PROJMEM(baseb);
|
TP_PROJMEM(base);
|
||||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
MAYBEPERM(PERMUTE_DIR0,perm);
|
||||||
} else {
|
} else {
|
||||||
LOAD_CHI(baseb);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
|
base= (uint64_t) &out._odata[ss];
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
MULT_2SPIN_DIR_PFTM(Tm,basep);
|
||||||
}
|
}
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
TP_RECON_ACCUM;
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
PREFETCH_CHIMU(basex);
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
SAVE_RESULT(&out._odata[ss]);
|
SAVE_RESULT(base,basep);
|
||||||
|
|
||||||
|
|
||||||
ss++;
|
}
|
||||||
}
|
ssU++;
|
||||||
sU++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
161
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
Normal file
161
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
uint64_t basea, baseb;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU=lo.Reorder(ssU);
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss=sU*Ls+s;
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
SAVE_RESULT(&out._odata[ss],baseb);
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
}
|
||||||
|
}
|
187
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
Normal file
187
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
Normal file
@ -0,0 +1,187 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
int localc,permc, ptypec;
|
||||||
|
uint64_t basea, baseb, basec;
|
||||||
|
uint64_t basex;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU=lo.Reorder(ssU);
|
||||||
|
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss =sU*Ls+s;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(baseb);
|
||||||
|
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basec);
|
||||||
|
|
||||||
|
basex = basea;
|
||||||
|
|
||||||
|
label(FX(XP) );
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
|
label(FX(YP) );
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basec);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(baseb);
|
||||||
|
label(FX(ZP) );
|
||||||
|
if ( localc ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(basec);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,permc);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basec);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basec);
|
||||||
|
label(FX(TP) );
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
|
label(FX(XM) );
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,basec);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(baseb);
|
||||||
|
label(FX(YM) );
|
||||||
|
if ( localc ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(basec);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permc);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basec);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(basec);
|
||||||
|
label(FX(ZM) );
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
PREFETCH_CHIMU(basea);
|
||||||
|
label(FX(TM) );
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basec);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
// PREFETCH_CHIMU(basex);
|
||||||
|
label(FX(SAV) );
|
||||||
|
SAVE_RESULT(&out._odata[ss]);
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
}
|
||||||
|
}
|
@ -312,7 +312,7 @@ namespace QCD {
|
|||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
|||||||
// Specialise Gparity to simple implementation
|
// Specialise Gparity to simple implementation
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
|
|||||||
////////////// Wilson ; uses this implementation /////////////////////
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
// Need Nc=3 though //
|
// Need Nc=3 though //
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
0
lib/qcd/hmc/.dirstamp
Normal file
0
lib/qcd/hmc/.dirstamp
Normal file
0
lib/qcd/spin/.dirstamp
Normal file
0
lib/qcd/spin/.dirstamp
Normal file
0
lib/qcd/utils/.dirstamp
Normal file
0
lib/qcd/utils/.dirstamp
Normal file
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ASM_INTEL_COMMON_512_H
|
#ifndef GRID_ASM_INTEL_COMMON_512_H
|
||||||
#define GRID_ASM_INTEL_COMMON_512_H
|
#define GRID_ASM_INTEL_COMMON_512_H
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Performance options
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#undef AVX512_PF_L2_WRITE
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Opcodes common
|
// Opcodes common
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -37,6 +42,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
"mov $0x5555, %%eax \n"\
|
"mov $0x5555, %%eax \n"\
|
||||||
"kmovw %%eax, %%k7 \n" : : : "%eax");
|
"kmovw %%eax, %%k7 \n" : : : "%eax");
|
||||||
|
|
||||||
|
//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
|
||||||
|
|
||||||
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
|
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||||
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
|
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||||
|
|
||||||
@ -86,9 +93,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
|
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
|
||||||
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
|
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
|
||||||
|
|
||||||
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
|
#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
|
||||||
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
|
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
|
||||||
|
#ifdef AVX512_PF_L2_WRITE
|
||||||
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
|
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
|
||||||
|
#else
|
||||||
|
#define VPREFETCHW(O,A)
|
||||||
|
#endif
|
||||||
|
#define VPREFETCHNTA(O,A)
|
||||||
|
#define VPREFETCH(O,A)
|
||||||
|
|
||||||
#define VEVICT(O,A)
|
#define VEVICT(O,A)
|
||||||
|
|
||||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
||||||
@ -124,8 +138,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||||
|
|
||||||
#define VPREFETCHNTA(O,A)
|
|
||||||
#define VPREFETCH(O,A)
|
|
||||||
|
|
||||||
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
|
@ -104,7 +104,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
|
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
|
||||||
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
|
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
|
||||||
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
|
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
|
||||||
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
|
#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
|
||||||
|
|
||||||
#define LOAD_CHIMUi \
|
#define LOAD_CHIMUi \
|
||||||
LOAD_CHIMU01i \
|
LOAD_CHIMU01i \
|
||||||
@ -169,22 +169,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSTORE(5,%r8,Chi_12) \
|
VSTORE(5,%r8,Chi_12) \
|
||||||
);
|
);
|
||||||
|
|
||||||
#define SAVE_RESULTi(PTR)\
|
|
||||||
LOAD64(%r8,PTR) \
|
|
||||||
__asm__ ( \
|
|
||||||
VSTORE(0,%r8,result_00) \
|
|
||||||
VSTORE(1,%r8,result_01) \
|
|
||||||
VSTORE(2,%r8,result_02) \
|
|
||||||
VSTORE(3,%r8,result_10) \
|
|
||||||
VSTORE(4,%r8,result_11) \
|
|
||||||
VSTORE(5,%r8,result_12) \
|
|
||||||
VSTORE(6,%r8,result_20) \
|
|
||||||
VSTORE(7,%r8,result_21) \
|
|
||||||
VSTORE(8,%r8,result_22) \
|
|
||||||
VSTORE(9,%r8,result_30) \
|
|
||||||
VSTORE(10,%r8,result_31) \
|
|
||||||
VSTORE(11,%r8,result_32) \
|
|
||||||
);
|
|
||||||
|
|
||||||
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
|
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
|
||||||
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
|
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
|
||||||
@ -277,8 +261,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define XM_PROJMEM(PTR) \
|
#define XM_PROJMEM(PTR) \
|
||||||
LOAD64(%r8,PTR)\
|
LOAD64(%r8,PTR)\
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
SHUF_CHIMU23i \
|
|
||||||
LOAD_CHIi \
|
LOAD_CHIi \
|
||||||
|
SHUF_CHIMU23i \
|
||||||
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
|
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
|
||||||
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
|
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
|
||||||
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
|
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
|
||||||
@ -306,8 +290,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define ZM_PROJMEM(PTR) \
|
#define ZM_PROJMEM(PTR) \
|
||||||
LOAD64(%r8,PTR) \
|
LOAD64(%r8,PTR) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
SHUF_CHIMU23i \
|
|
||||||
LOAD_CHIi \
|
LOAD_CHIi \
|
||||||
|
SHUF_CHIMU23i \
|
||||||
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
|
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
|
||||||
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
|
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
|
||||||
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
|
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
|
||||||
@ -559,23 +543,95 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSUB(UChi_02,result_22,result_22)\
|
VSUB(UChi_02,result_22,result_22)\
|
||||||
VSUB(UChi_12,result_32,result_32) );
|
VSUB(UChi_12,result_32,result_32) );
|
||||||
|
|
||||||
#define PREFETCH_CHIMU(A) \
|
#define AVX512_PF_L1
|
||||||
LOAD64(%r9,A) \
|
#define AVX512_PF_L2_GAUGE
|
||||||
__asm__ ( \
|
#define AVX512_PF_L2_TABLE
|
||||||
VPREFETCHG(12,%r9)\
|
#undef AVX512_PF_L2_LINEAR
|
||||||
VPREFETCHG(13,%r9)\
|
|
||||||
VPREFETCHG(14,%r9)\
|
|
||||||
VPREFETCHG(15,%r9)\
|
|
||||||
VPREFETCHG(16,%r9)\
|
|
||||||
VPREFETCHG(17,%r9)\
|
|
||||||
VPREFETCHG(18,%r9)\
|
|
||||||
VPREFETCHG(19,%r9)\
|
|
||||||
VPREFETCHG(20,%r9)\
|
|
||||||
VPREFETCHG(21,%r9)\
|
|
||||||
VPREFETCHG(22,%r9)\
|
|
||||||
VPREFETCHG(23,%r9));
|
|
||||||
|
|
||||||
#define PERMUTE_DIR0 __asm__ ( \
|
#ifdef AVX512_PF_L2_TABLE
|
||||||
|
// P1 Fetches the base pointer for next link into L1 with P1
|
||||||
|
// M1 Fetches the next site pointer into L2
|
||||||
|
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_P2(A,B)
|
||||||
|
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
|
||||||
|
#define VPREFETCH_M2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_LINEAR
|
||||||
|
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
|
||||||
|
#define VPREFETCH_P1(A,B)
|
||||||
|
#define VPREFETCH_P2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_GAUGE
|
||||||
|
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PF_GAUGE(A) \
|
||||||
|
LOAD64(%r8,&U._odata[sU](A)) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \
|
||||||
|
VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \
|
||||||
|
);
|
||||||
|
|
||||||
|
#define SAVE_RESULTi(PTR,pf) \
|
||||||
|
LOAD64(%r8,PTR) \
|
||||||
|
LOAD64(%r9,pf) \
|
||||||
|
__asm__ ( \
|
||||||
|
VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \
|
||||||
|
VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \
|
||||||
|
VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \
|
||||||
|
VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \
|
||||||
|
VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \
|
||||||
|
VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \
|
||||||
|
VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \
|
||||||
|
VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \
|
||||||
|
VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \
|
||||||
|
VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \
|
||||||
|
VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \
|
||||||
|
VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \
|
||||||
|
);
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_TABLE
|
||||||
|
#define PREFETCH_CHIMU(A) \
|
||||||
|
LOAD64(%r9,A) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_P1(0,%r9) \
|
||||||
|
VPREFETCH_P1(1,%r9) \
|
||||||
|
VPREFETCH_P1(2,%r9) \
|
||||||
|
VPREFETCH_P1(3,%r9) \
|
||||||
|
VPREFETCH_P1(4,%r9) \
|
||||||
|
VPREFETCH_P1(5,%r9) \
|
||||||
|
VPREFETCH_P1(6,%r9) \
|
||||||
|
VPREFETCH_P1(7,%r9) \
|
||||||
|
VPREFETCH_P1(8,%r9) \
|
||||||
|
VPREFETCH_P1(9,%r9) \
|
||||||
|
VPREFETCH_P1(10,%r9) \
|
||||||
|
VPREFETCH_P1(11,%r9));
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define PREFETCH_CHIMU(A)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PREFETCH1_CHIMU(A) \
|
||||||
|
LOAD64(%r9,A) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_P1(0,%r9) \
|
||||||
|
VPREFETCH_P1(1,%r9) \
|
||||||
|
VPREFETCH_P1(2,%r9) \
|
||||||
|
VPREFETCH_P1(3,%r9) \
|
||||||
|
VPREFETCH_P1(4,%r9) \
|
||||||
|
VPREFETCH_P1(5,%r9) \
|
||||||
|
VPREFETCH_P1(6,%r9) \
|
||||||
|
VPREFETCH_P1(7,%r9) \
|
||||||
|
VPREFETCH_P1(8,%r9) \
|
||||||
|
VPREFETCH_P1(9,%r9) \
|
||||||
|
VPREFETCH_P1(10,%r9) \
|
||||||
|
VPREFETCH_P1(11,%r9));
|
||||||
|
|
||||||
|
#define PERMUTE_DIR0 __asm__ ( \
|
||||||
VPERM0(Chi_00,Chi_00) \
|
VPERM0(Chi_00,Chi_00) \
|
||||||
VPERM0(Chi_01,Chi_01) \
|
VPERM0(Chi_01,Chi_01) \
|
||||||
VPERM0(Chi_02,Chi_02) \
|
VPERM0(Chi_02,Chi_02) \
|
||||||
@ -612,15 +668,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
LOAD64(%r8,ptr) \
|
LOAD64(%r8,ptr) \
|
||||||
LOAD64(%r9,pf) \
|
LOAD64(%r9,pf) \
|
||||||
__asm__ ( \
|
__asm__ ( \
|
||||||
VPREFETCH2(9,%r8) \
|
VPREFETCH_G2(9,%r8) \
|
||||||
VPREFETCH2(10,%r8) \
|
VPREFETCH_G2(10,%r8) \
|
||||||
VPREFETCH2(11,%r8) \
|
VPREFETCH_G2(11,%r8) \
|
||||||
VPREFETCH2(12,%r8) \
|
VPREFETCH_G2(12,%r8) \
|
||||||
VPREFETCH2(13,%r8) \
|
VPREFETCH_G2(13,%r8) \
|
||||||
VPREFETCH2(14,%r8) \
|
VPREFETCH_G2(14,%r8) \
|
||||||
VPREFETCH2(15,%r8) \
|
VPREFETCH_G2(15,%r8) \
|
||||||
VPREFETCH2(16,%r8) \
|
VPREFETCH_G2(16,%r8) \
|
||||||
VPREFETCH2(17,%r8) \
|
VPREFETCH_G2(17,%r8) \
|
||||||
VSHUF(Chi_00,T1) \
|
VSHUF(Chi_00,T1) \
|
||||||
VMOVIDUP(0,%r8,Z0 ) \
|
VMOVIDUP(0,%r8,Z0 ) \
|
||||||
VMOVIDUP(3,%r8,Z1 ) \
|
VMOVIDUP(3,%r8,Z1 ) \
|
||||||
@ -632,10 +688,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
|
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
|
||||||
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
|
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
|
||||||
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
|
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
|
||||||
VPREFETCHG(0,%r9) \
|
VPREFETCH_M1(0,%r9) \
|
||||||
VPREFETCHG(1,%r9) \
|
VPREFETCH_M1(1,%r9) \
|
||||||
VPREFETCHG(2,%r9) \
|
VPREFETCH_M1(2,%r9) \
|
||||||
VPREFETCHG(3,%r9) \
|
VPREFETCH_M1(3,%r9) \
|
||||||
/*18*/ \
|
/*18*/ \
|
||||||
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
|
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
|
||||||
VMADDSUB(Z3,Chi_10,UChi_10) \
|
VMADDSUB(Z3,Chi_10,UChi_10) \
|
||||||
@ -643,10 +699,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
|
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
|
||||||
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
|
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
|
||||||
VMADDSUB(Z5,Chi_10,UChi_12) \
|
VMADDSUB(Z5,Chi_10,UChi_12) \
|
||||||
VPREFETCHG(4,%r9) \
|
VPREFETCH_M1(4,%r9) \
|
||||||
VPREFETCHG(5,%r9) \
|
VPREFETCH_M1(5,%r9) \
|
||||||
VPREFETCHG(6,%r9) \
|
VPREFETCH_M1(6,%r9) \
|
||||||
VPREFETCHG(7,%r9) \
|
VPREFETCH_M1(7,%r9) \
|
||||||
/*28*/ \
|
/*28*/ \
|
||||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
|
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
|
||||||
VMADDSUB(Z0,T2,UChi_10) \
|
VMADDSUB(Z0,T2,UChi_10) \
|
||||||
@ -673,15 +729,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
|
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
|
||||||
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
|
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
|
||||||
VMADDSUB(Z5,Chi_11,UChi_12) \
|
VMADDSUB(Z5,Chi_11,UChi_12) \
|
||||||
VPREFETCHG(9,%r8) \
|
VPREFETCH_M1(9,%r8) \
|
||||||
VPREFETCHG(10,%r8) \
|
VPREFETCH_M1(10,%r8) \
|
||||||
VPREFETCHG(11,%r8) \
|
VPREFETCH_M1(11,%r8) \
|
||||||
VPREFETCHG(12,%r8) \
|
VPREFETCH_M1(12,%r8) \
|
||||||
VPREFETCHG(13,%r8) \
|
VPREFETCH_M1(13,%r8) \
|
||||||
VPREFETCHG(14,%r8) \
|
VPREFETCH_M1(14,%r8) \
|
||||||
VPREFETCHG(15,%r8) \
|
VPREFETCH_M1(15,%r8) \
|
||||||
VPREFETCHG(16,%r8) \
|
VPREFETCH_M1(16,%r8) \
|
||||||
VPREFETCHG(17,%r8) \
|
VPREFETCH_M1(17,%r8) \
|
||||||
/*48*/ \
|
/*48*/ \
|
||||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
|
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
|
||||||
VMADDSUB(Z0,T2,UChi_10) \
|
VMADDSUB(Z0,T2,UChi_10) \
|
||||||
@ -689,10 +745,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VMADDSUB(Z1,T2,UChi_11) \
|
VMADDSUB(Z1,T2,UChi_11) \
|
||||||
VMADDSUB(Z2,T1,UChi_02) \
|
VMADDSUB(Z2,T1,UChi_02) \
|
||||||
VMADDSUB(Z2,T2,UChi_12) \
|
VMADDSUB(Z2,T2,UChi_12) \
|
||||||
VPREFETCHG(8,%r9) \
|
VPREFETCH_M1(8,%r9) \
|
||||||
VPREFETCHG(9,%r9) \
|
VPREFETCH_M1(9,%r9) \
|
||||||
VPREFETCHG(10,%r9) \
|
VPREFETCH_M1(10,%r9) \
|
||||||
VPREFETCHG(11,%r9) \
|
VPREFETCH_M1(11,%r9) \
|
||||||
/*55*/ \
|
/*55*/ \
|
||||||
VMADDSUB(Z3,Chi_02,UChi_00) \
|
VMADDSUB(Z3,Chi_02,UChi_00) \
|
||||||
VMADDSUB(Z3,Chi_12,UChi_10) \
|
VMADDSUB(Z3,Chi_12,UChi_10) \
|
||||||
@ -711,56 +767,58 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
|
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
|
||||||
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
|
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
|
||||||
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
|
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
|
||||||
VPREFETCHG(0,%r9) \
|
VPREFETCH_M1(0,%r9) \
|
||||||
VPREFETCHG(1,%r9) \
|
VPREFETCH_M1(1,%r9) \
|
||||||
VPREFETCHG(2,%r9) \
|
VPREFETCH_M1(2,%r9) \
|
||||||
VPREFETCHG(3,%r9) \
|
VPREFETCH_M1(3,%r9) \
|
||||||
/*8*/ \
|
/*8*/ \
|
||||||
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
|
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
|
||||||
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
|
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
|
||||||
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
|
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
|
||||||
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
|
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
|
||||||
VPREFETCHG(4,%r9) \
|
VPREFETCH_M1(4,%r9) \
|
||||||
VPREFETCHG(5,%r9) \
|
VPREFETCH_M1(5,%r9) \
|
||||||
VPREFETCHG(6,%r9) \
|
VPREFETCH_M1(6,%r9) \
|
||||||
VPREFETCHG(7,%r9) \
|
VPREFETCH_M1(7,%r9) \
|
||||||
/*16*/ \
|
/*16*/ \
|
||||||
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
|
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
|
||||||
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
|
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
|
||||||
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
|
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
|
||||||
VPREFETCHG(8,%r9) \
|
VPREFETCH_M1(8,%r9) \
|
||||||
VPREFETCHG(9,%r9) \
|
VPREFETCH_M1(9,%r9) \
|
||||||
VPREFETCHG(10,%r9) \
|
VPREFETCH_M1(10,%r9) \
|
||||||
VPREFETCHG(11,%r9) \
|
VPREFETCH_M1(11,%r9) \
|
||||||
/*22*/ \
|
/*22*/ \
|
||||||
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
|
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
|
||||||
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
|
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
|
||||||
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
|
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
|
||||||
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
|
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
|
||||||
VPREFETCH2(12,%r9) \
|
VPREFETCH_M2(12,%r9) \
|
||||||
VPREFETCH2(13,%r9) \
|
VPREFETCH_M2(13,%r9) \
|
||||||
VPREFETCH2(14,%r9) \
|
VPREFETCH_M2(14,%r9) \
|
||||||
VPREFETCH2(15,%r9) \
|
VPREFETCH_M2(15,%r9) \
|
||||||
/*30*/ \
|
/*30*/ \
|
||||||
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
|
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
|
||||||
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
|
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
|
||||||
VPREFETCH2(16,%r9) \
|
VPREFETCH_M2(16,%r9) \
|
||||||
VPREFETCH2(17,%r9) \
|
VPREFETCH_M2(17,%r9) \
|
||||||
VPREFETCH2(18,%r9) \
|
VPREFETCH_M2(18,%r9) \
|
||||||
VPREFETCH2(19,%r9) \
|
VPREFETCH_M2(19,%r9) \
|
||||||
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
|
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
|
||||||
/*36*/ \
|
/*36*/ \
|
||||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
||||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
||||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
||||||
VPREFETCH2(20,%r9) \
|
VPREFETCH_M2(20,%r9) \
|
||||||
VPREFETCH2(21,%r9) \
|
VPREFETCH_M2(21,%r9) \
|
||||||
VPREFETCH2(22,%r9) \
|
VPREFETCH_M2(22,%r9) \
|
||||||
VPREFETCH2(23,%r9) \
|
VPREFETCH_M2(23,%r9) \
|
||||||
VPREFETCHG(2,%r8) \
|
VPREFETCH_G1(2,%r8) \
|
||||||
VPREFETCHG(3,%r8) \
|
VPREFETCH_G1(3,%r8) \
|
||||||
VPREFETCH2(4,%r8) \
|
VPREFETCH_G2(4,%r8) \
|
||||||
VPREFETCH2(5,%r8) \
|
VPREFETCH_G2(5,%r8) \
|
||||||
|
VPREFETCH_G2(6,%r8) \
|
||||||
|
VPREFETCH_G2(7,%r8) \
|
||||||
/*42 insns*/ );
|
/*42 insns*/ );
|
||||||
|
|
||||||
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
|
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
|
||||||
@ -793,8 +851,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
|
||||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
|
||||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
|
||||||
/* VPREFETCHG(2,%r8)*/ \
|
/* VPREFETCH1(2,%r8)*/ \
|
||||||
/* VPREFETCHG(3,%r8)*/ \
|
/* VPREFETCH1(3,%r8)*/ \
|
||||||
/*42 insns*/ );
|
/*42 insns*/ );
|
||||||
|
|
||||||
|
|
||||||
|
0
lib/stencil/.dirstamp
Normal file
0
lib/stencil/.dirstamp
Normal file
@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
|
|||||||
{
|
{
|
||||||
grid = _grid;
|
grid = _grid;
|
||||||
if ( Block[0]==0) ZGraph();
|
if ( Block[0]==0) ZGraph();
|
||||||
|
else if ( Block[1]==0) NoBlocking();
|
||||||
else CartesianBlocking();
|
else CartesianBlocking();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Rebuild _LebesgueReorder as the identity ordering: entry s holds s for
// every s in [0, grid->oSites()), so sites are visited in plain index order
// with no cache blocking. The constructor in this file selects this path
// when Block[0] != 0 and Block[1] == 0.
void LebesgueOrder::NoBlocking(void)
|
||||||
|
{
|
||||||
|
// Debug-level log line announcing which ordering strategy is in use.
std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
|
||||||
|
// Discard any previously stored ordering before refilling.
_LebesgueReorder.resize(0);
|
||||||
|
// Append each outer-site index in increasing order.
for ( int s = 0 ; s!= grid->oSites();s++){
|
||||||
|
_LebesgueReorder.push_back(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
void LebesgueOrder::CartesianBlocking(void)
|
void LebesgueOrder::CartesianBlocking(void)
|
||||||
{
|
{
|
||||||
_LebesgueReorder.resize(0);
|
_LebesgueReorder.resize(0);
|
||||||
|
|
||||||
std::cout << GridLogMessage << " CartesianBlocking ";
|
std::cout << GridLogDebug << " CartesianBlocking ";
|
||||||
for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
|
// for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
|
||||||
std::cout<<std::endl;
|
// std::cout<<std::endl;
|
||||||
|
|
||||||
IndexInteger ND = grid->_ndimension;
|
IndexInteger ND = grid->_ndimension;
|
||||||
|
|
||||||
@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
|
|||||||
void LebesgueOrder::ZGraph(void)
|
void LebesgueOrder::ZGraph(void)
|
||||||
{
|
{
|
||||||
_LebesgueReorder.resize(0);
|
_LebesgueReorder.resize(0);
|
||||||
|
|
||||||
|
std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
|
||||||
// Align up dimensions to power of two.
|
// Align up dimensions to power of two.
|
||||||
const IndexInteger one=1;
|
const IndexInteger one=1;
|
||||||
|
|
||||||
|
@ -59,6 +59,7 @@ namespace Grid {
|
|||||||
// Cartesian stencil blocking strategy
|
// Cartesian stencil blocking strategy
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
static std::vector<int> Block;
|
static std::vector<int> Block;
|
||||||
|
void NoBlocking(void);
|
||||||
void CartesianBlocking(void);
|
void CartesianBlocking(void);
|
||||||
void IterateO(int ND,int dim,
|
void IterateO(int ND,int dim,
|
||||||
std::vector<IndexInteger> & xo,
|
std::vector<IndexInteger> & xo,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user