diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc new file mode 100644 index 00000000..302059a4 --- /dev/null +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -0,0 +1,358 @@ + + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +void benchDw(std::vector & L, int Ls, int threads, int report =0 ); +void benchsDw(std::vector & L, int Ls, int threads, int report=0 ); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + const int Ls=16; + int threads = GridThread::GetThreads(); + std::cout< latt4(4,L); + for(int d=4;d>0;d--){ + if ( d<=3 ) latt4[d]*=2; + std::cout << GridLogMessage <<"\t"; + for(int d=0;d latt4(4,16); + std::cout< & latt4, int Ls, int threads,int report ) +{ + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s U(4,FGrid); + for(int mu=0;mu(Umu5d,mu); + } + +#ifdef CHECK + if (1) + { + ref = zero; + for(int mu=0;mu_Nprocessors; + + DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + double t0=usecond(); + Dw.Dhop(src,result,0); + double t1=usecond(); + + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); + + if (ncall < 5 ) exit(0); + + Dw.Dhop(src,result,0); + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;i 1.0e-4 ) { + std::cout< & latt4, int Ls, int threads, int report ) +{ + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi()); + GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); + GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + +#ifdef CHECK_SDW + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + LatticeFermion src (FGrid); random(RNG5,src); + LatticeGaugeField Umu(UGrid); + random(RNG4,Umu); +#else + LatticeFermion src (FGrid); src=zero; + LatticeGaugeField Umu(UGrid); Umu=zero; +#endif + + LatticeFermion result(FGrid); result=zero; + LatticeFermion ref(FGrid); ref=zero; + LatticeFermion tmp(FGrid); + LatticeFermion err(FGrid); + + ColourMatrix cm = Complex(1.0,0.0); + + LatticeGaugeField Umu5d(FGrid); + + // replicate across fifth dimension + for(int ss=0;ssoSites();ss++){ + for(int s=0;s WilsonFermion5DF; + LatticeFermionF ssrc(sFGrid); + LatticeFermionF sref(sFGrid); + LatticeFermionF sresult(sFGrid); + WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5); + + for(int x=0;x site({s,x,y,z,t}); + SpinColourVectorF tmp; + peekSite(tmp,src,site); + pokeSite(tmp,ssrc,site); + }}}}} + + double t0=usecond(); + sDw.Dhop(ssrc,sresult,0); + double t1=usecond(); + + int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); + + PerformanceCounter Counter(8); + Counter.Start(); + t0=usecond(); + for(int i=0;iLs);// eps is ignored for higham assert(zdata->n==this->Ls); - std::cout< #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri) #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri) -#define VPREFETCHNTA(O,A) -#define VPREFETCH(O,A) #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" diff --git a/lib/simd/Intel512wilson.h b/lib/simd/Intel512wilson.h index 1955cc6d..2bc0545d 100644 --- a/lib/simd/Intel512wilson.h +++ b/lib/simd/Intel512wilson.h @@ -559,22 +559,23 @@ Author: paboyle VSUB(UChi_02,result_22,result_22)\ VSUB(UChi_12,result_32,result_32) ); -#define PREFETCH_CHIMU(A) \ +#define PREFETCH_CHIMU(A) +/* LOAD64(%r9,A) \ __asm__ ( \ - VPREFETCHG(12,%r9)\ - VPREFETCHG(13,%r9)\ - VPREFETCHG(14,%r9)\ - VPREFETCHG(15,%r9)\ - VPREFETCHG(16,%r9)\ - VPREFETCHG(17,%r9)\ - VPREFETCHG(18,%r9)\ - VPREFETCHG(19,%r9)\ - VPREFETCHG(20,%r9)\ - VPREFETCHG(21,%r9)\ - VPREFETCHG(22,%r9)\ - VPREFETCHG(23,%r9)); - + VPREFETCHG(0,%r9)\ + VPREFETCHG(1,%r9)\ + VPREFETCHG(2,%r9)\ + VPREFETCHG(3,%r9)\ + VPREFETCHG(4,%r9)\ + VPREFETCHG(5,%r9)\ + VPREFETCHG(6,%r9)\ + VPREFETCHG(7,%r9)\ + VPREFETCHG(8,%r9)\ + VPREFETCHG(9,%r9)\ + VPREFETCHG(10,%r9)\ + VPREFETCHG(11,%r9)); +*/ #define PERMUTE_DIR0 __asm__ ( \ VPERM0(Chi_00,Chi_00) \ VPERM0(Chi_01,Chi_01) \ @@ -612,8 +613,7 @@ Author: paboyle LOAD64(%r8,ptr) \ LOAD64(%r9,pf) \ __asm__ ( \ - VPREFETCH2(9,%r8) \ - VPREFETCH2(10,%r8) \ + VPREFETCH2(9,%r8) VPREFETCH2(10,%r8) \ VPREFETCH2(11,%r8) \ VPREFETCH2(12,%r8) \ VPREFETCH2(13,%r8) \ diff --git a/lib/stencil/Lebesgue.cc b/lib/stencil/Lebesgue.cc index 7704e08f..c34b5c96 100644 --- a/lib/stencil/Lebesgue.cc +++ b/lib/stencil/Lebesgue.cc @@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid) { grid = _grid; if ( Block[0]==0) ZGraph(); + else if ( Block[1]==0) NoBlocking(); else CartesianBlocking(); } +void LebesgueOrder::NoBlocking(void) +{ + std::cout<oSites();s++){ + _LebesgueReorder.push_back(s); + } +} void LebesgueOrder::CartesianBlocking(void) { _LebesgueReorder.resize(0); - std::cout << GridLogMessage << " CartesianBlocking "; - for(int d=0;d_ndimension; @@ -116,7 +125,8 @@ void LebesgueOrder::IterateI(int ND, void LebesgueOrder::ZGraph(void) { _LebesgueReorder.resize(0); - + + std::cout << GridLogDebug << " Lebesgue order "< Block; + void NoBlocking(void); void CartesianBlocking(void); void IterateO(int ND,int dim, std::vector & xo,