diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc new file mode 100644 index 00000000..f7bc8e8e --- /dev/null +++ b/benchmarks/Benchmark_zmm.cc @@ -0,0 +1,174 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_zmm.cc + + Copyright (C) 2015 + +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + + +using namespace Grid; +using namespace Grid::QCD; + +void ZmulF(void *ptr1,void *ptr2,void *ptr3); +void Zmul(void *ptr1,void *ptr2,void *ptr3); +void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3); +void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3); +void TimesIAvx512F(void *ptr1,void *ptr3); +void TimesIAvx512(void *ptr1,void *ptr3); +void TimesMinusIAvx512F(void *ptr1,void *ptr3); +void TimesMinusIAvx512(void *ptr1,void *ptr3); + + +int bench(std::ofstream &os, std::vector &latt4,int Ls); + +int main(int argc,char **argv) +{ + Grid_init(&argc,&argv); + std::ofstream os("zmm.dat"); + + os << "#V Ls Lxy Lzt C++ Asm OMP L1 " < grid({L,L,m*L,m*L}); + bench(os,latt4,Ls); + } + } + } +} + +int bench(std::ofstream &os, std::vector &latt4,int Ls) +{ + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + int threads = GridThread::GetThreads(); + + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + + LatticeFermion src (FGrid); + LatticeFermion tmp (FGrid); + LatticeFermion srce(FrbGrid); + + LatticeFermion resulto(FrbGrid); resulto=zero; + LatticeFermion resulta(FrbGrid); resulta=zero; + LatticeFermion junk(FrbGrid); junk=zero; + LatticeFermion diff(FrbGrid); + LatticeGaugeField Umu(UGrid); + + double mfc, mfa, mfo, mfl1; + + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + random(RNG5,src); +#if 1 + random(RNG4,Umu); +#else + int mmu=2; + std::vector U(4,UGrid); + for(int mu=0;mu(Umu,mu); + if ( mu!=mmu ) U[mu] = zero; + if ( mu==mmu ) U[mu] = 1.0; + PokeIndex(Umu,U[mu],mu); + } +#endif + pickCheckerboard(Even,srce,src); + + RealD mass=0.1; + RealD M5 =1.8; + DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + std::cout<::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder #pragma omp parallel { - for(int jjj=0;jjj<1000;jjj++){ + for(int jjj=0;jjj<100;jjj++){ #pragma omp barrier dslashtime -=usecond(); if ( dag == DaggerYes ) { @@ -538,6 +538,124 @@ void WilsonFermion5D::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder alltime+=usecond(); } + +template +void WilsonFermion5D::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) +{ + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + alltime-=usecond(); + Compressor compressor(dag); + + // Assume balanced KMP_AFFINITY; this is forced in GridThread.h + + int threads = GridThread::GetThreads(); + int HT = GridThread::GetHyperThreads(); + int cores = GridThread::GetCores(); + int nwork = U._grid->oSites(); + + commtime -=usecond(); + auto handle = st.HaloExchangeBegin(in,compressor); + st.HaloExchangeComplete(handle); + commtime +=usecond(); + + jointime -=usecond(); + jointime +=usecond(); + + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + // Not loop ordering and data layout. + // Designed to create + // - per thread reuse in L1 cache for U + // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. + +#pragma omp parallel + { + for(int jjj=0;jjj<100;jjj++){ +#pragma omp barrier + dslashtime -=usecond(); + if ( dag == DaggerYes ) { + if( this->HandOptDslash ) { +#pragma omp for + for(int ss=0;ssoSites();ss++){ + int sU=0; + for(int s=0;soSites();ss++){ + { + int sd; + for(sd=0;sdAsmOptDslash ) { + // for(int i=0;i<1;i++){ + // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ + // PerformanceCounter Counter(i); + // Counter.Start(); + +#pragma omp for + for(int t=0;tHandOptDslash ) { +#pragma omp for + + for(int ss=0;ssoSites();ss++){ + int sU=0; + for(int s=0;soSites();ss++){ + int sU=0; + for(int s=0;s void WilsonFermion5D::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U,