mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-25 18:19:34 +01:00 
			
		
		
		
	Adding extra benchmark
This commit is contained in:
		
							
								
								
									
										174
									
								
								benchmarks/Benchmark_zmm.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										174
									
								
								benchmarks/Benchmark_zmm.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,174 @@ | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
|     Source file: ./benchmarks/Benchmark_zmm.cc | ||||
|  | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|  | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|  | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid.h> | ||||
| #include <PerfCount.h> | ||||
| #include <simd/Intel512wilson.h> | ||||
|  | ||||
|  | ||||
| using namespace Grid; | ||||
| using namespace Grid::QCD; | ||||
|  | ||||
| // Kernel entry points that take untyped pointers. They are only declared here; | ||||
| // no definition or call appears in the visible part of this file. | ||||
| // Each routine is listed next to its F-suffixed counterpart. | ||||
| void Zmul (void *ptr1,void *ptr2,void *ptr3); | ||||
| void ZmulF(void *ptr1,void *ptr2,void *ptr3); | ||||
|  | ||||
| void WilsonDslashAvx512 (void *ptr1,void *ptr2,void *ptr3); | ||||
| void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3); | ||||
|  | ||||
| void TimesIAvx512 (void *ptr1,void *ptr3); | ||||
| void TimesIAvx512F(void *ptr1,void *ptr3); | ||||
|  | ||||
| void TimesMinusIAvx512 (void *ptr1,void *ptr3); | ||||
| void TimesMinusIAvx512F(void *ptr1,void *ptr3); | ||||
|  | ||||
| // Runs the benchmark for one 4d lattice (latt4) and fifth-dimension extent Ls, | ||||
| // writing a result row to os. Defined after main(). | ||||
| int bench(std::ofstream &os, std::vector<int> &latt4,int Ls); | ||||
|  | ||||
| // Benchmark driver: initialises Grid, opens "zmm.dat", writes a column header | ||||
| // and calls bench() once for every (L, m, Ls) combination in the loops below. | ||||
| int main(int argc,char **argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|   std::ofstream os("zmm.dat"); | ||||
|  | ||||
|   // Column labels for the rows that bench() appends. | ||||
|   os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl; | ||||
|   for(int L=4;L<32;L+=2){ | ||||
|     for(int m=1;m<=2;m++){ | ||||
|       for(int Ls=8;Ls<=16;Ls+=8){ | ||||
|         // 4d lattice extents: L in the first two directions, m*L in the last two. | ||||
|         std::vector<int> grid({L,L,m*L,m*L}); | ||||
|         // Pass the vector built above; the previous code named an undeclared | ||||
|         // variable (latt4) here and did not compile. | ||||
|         bench(os,grid,Ls); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   return 0; | ||||
| } | ||||
|  | ||||
| // Benchmarks the domain-wall hopping term on one lattice and appends a result | ||||
| // row to `os`. | ||||
| // | ||||
| //   os    : stream receiving "volume Ls latt4[0] latt4[2] mfc mfa mfo mfl1". | ||||
| //   latt4 : the four lattice extents used to build the 4d grid (only read here). | ||||
| //   Ls    : extent handed to makeFiveDimGrid / makeFiveDimRedBlackGrid. | ||||
| // | ||||
| // Four rates are measured (flops per microsecond, printed as mflop/s): | ||||
| //   mfc  - 50 calls of Dw.DhopOE with AsmOptDslash cleared, | ||||
| //   mfa  - 50 calls of Dw.DhopOE with AsmOptDslash set, | ||||
| //   mfo  - one call of Dw.DhopInternalOMPbench, | ||||
| //   mfl1 - one call of Dw.DhopInternalL1bench. | ||||
| // Afterwards each PerformanceCounter type is run around one DhopOE call and the | ||||
| // norm2 of (resulto - resulta) is printed. | ||||
| // | ||||
| // Returns 0; the only caller in this file ignores the value. | ||||
| int bench(std::ofstream &os, std::vector<int> &latt4,int Ls) | ||||
| { | ||||
|  | ||||
|   // NOTE(review): these grids are raw pointers that are never released here, and | ||||
|   // bench() is called repeatedly from main() - confirm who owns them. | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|  | ||||
|   GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); | ||||
|  | ||||
|   LatticeFermion src (FGrid); | ||||
|   LatticeFermion tmp (FGrid); | ||||
|   LatticeFermion srce(FrbGrid); | ||||
|  | ||||
|   LatticeFermion resulto(FrbGrid); resulto=zero; | ||||
|   LatticeFermion resulta(FrbGrid); resulta=zero; | ||||
|   LatticeFermion junk(FrbGrid); junk=zero; | ||||
|   LatticeFermion diff(FrbGrid);  | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|  | ||||
|   double mfc, mfa, mfo, mfl1; | ||||
|  | ||||
|   // Dagger flag for the two internal bench kernels. It was used below without | ||||
|   // ever being declared; 0 matches the literal passed to DhopOE in this function. | ||||
|   const int dag = 0; | ||||
|  | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||
|   GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||
|   random(RNG5,src); | ||||
| #if 1 | ||||
|   random(RNG4,Umu); | ||||
| #else | ||||
|   // Disabled alternative: unit links in direction mmu, zero elsewhere. | ||||
|   int mmu=2; | ||||
|   std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     U[mu] = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|     if ( mu!=mmu ) U[mu] = zero; | ||||
|     if ( mu==mmu ) U[mu] = 1.0; | ||||
|     PokeIndex<LorentzIndex>(Umu,U[mu],mu); | ||||
|   } | ||||
| #endif | ||||
|   pickCheckerboard(Even,srce,src); | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   RealD M5  =1.8; | ||||
|   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||
|  | ||||
|   // The asm switch is a static that this function sets to 1 further down. Clear | ||||
|   // it first so that, on the second and later calls from main(), the "C++" | ||||
|   // timing and resulto are not silently produced by the asm path. | ||||
|   QCD::WilsonFermion5DStatic::AsmOptDslash=0; | ||||
|  | ||||
|   std::cout<<GridLogMessage << "Calling Dw"<<std::endl; | ||||
|   int ncall=50; | ||||
|   double t0=usecond(); | ||||
|   for(int i=0;i<ncall;i++){ | ||||
|     Dw.DhopOE(srce,resulto,0); | ||||
|   } | ||||
|   double t1=usecond(); | ||||
|  | ||||
|   // Flop count per call: 1344 per site on half of the Ls * prod(latt4) sites. | ||||
|   double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
|   double flops=1344*volume/2; | ||||
|  | ||||
|   mfc = flops*ncall/(t1-t0); | ||||
|   std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl; | ||||
|  | ||||
|   QCD::WilsonFermion5DStatic::AsmOptDslash=1; | ||||
|   t0=usecond(); | ||||
|   for(int i=0;i<ncall;i++){ | ||||
|     Dw.DhopOE(srce,resulta,0); | ||||
|   } | ||||
|   t1=usecond(); | ||||
|   mfa = flops*ncall/(t1-t0); | ||||
|   std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl; | ||||
|  | ||||
|   // The factor 100 below is meant to match the number of sweeps the bench kernel | ||||
|   // performs internally per call - keep the two in step. | ||||
|   t0=usecond(); | ||||
|   for(int i=0;i<1;i++){ | ||||
|     Dw.DhopInternalOMPbench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag); | ||||
|   } | ||||
|   t1=usecond(); | ||||
|   mfo = flops*100/(t1-t0); | ||||
|   std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl; | ||||
|  | ||||
|   // Same convention: DhopInternalL1bench repeats its sweep 100 times per call. | ||||
|   t0=usecond(); | ||||
|   for(int i=0;i<1;i++){ | ||||
|     Dw.DhopInternalL1bench(Dw.StencilOdd,Dw.LebesgueEvenOdd,Dw.UmuEven,srce,junk,dag); | ||||
|   } | ||||
|   t1=usecond(); | ||||
|   mfl1= flops*100/(t1-t0); | ||||
|   std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl; | ||||
|  | ||||
|   // One data row, in the column order announced by main(). | ||||
|   os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " " | ||||
|      << mfc<<" " | ||||
|      << mfa<<" " | ||||
|      << mfo<<" " | ||||
|      << mfl1<<std::endl; | ||||
|  | ||||
|   // For each counter type: one untimed call first, then one call bracketed by | ||||
|   // Start()/Stop() and reported. | ||||
|   for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ | ||||
|     Dw.DhopOE(srce,resulta,0); | ||||
|     PerformanceCounter Counter(i); | ||||
|     Counter.Start(); | ||||
|     Dw.DhopOE(srce,resulta,0); | ||||
|     Counter.Stop(); | ||||
|     Counter.Report(); | ||||
|   } | ||||
|   //resulta = (-0.5) * resulta; | ||||
|  | ||||
|   // Difference between the result of the first (non-asm) and the asm runs. | ||||
|   diff = resulto-resulta; | ||||
|   std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl; | ||||
|   std::cout<<std::endl; | ||||
|  | ||||
|   // The function is declared int; falling off the end was undefined behaviour. | ||||
|   return 0; | ||||
| } | ||||
|  | ||||
|  | ||||
| @@ -450,7 +450,7 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder | ||||
|  | ||||
| #pragma omp parallel  | ||||
|   { | ||||
|   for(int jjj=0;jjj<1000;jjj++){ | ||||
|   for(int jjj=0;jjj<100;jjj++){ | ||||
| #pragma omp barrier | ||||
|   dslashtime -=usecond(); | ||||
|   if ( dag == DaggerYes ) { | ||||
| @@ -538,6 +538,124 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder | ||||
|   alltime+=usecond(); | ||||
| } | ||||
|  | ||||
|  | ||||
| // Benchmark variant of the hopping-term driver. After one halo exchange it | ||||
| // repeats the site loop 100 times inside a single OpenMP parallel region, and in | ||||
| // every kernel call the 4d site index sU is pinned to 0, so only the fifth- | ||||
| // dimension index varies (presumably to keep the working set cache resident, | ||||
| // hence "L1" - confirm with the author). | ||||
| // | ||||
| //   st  : stencil; used for the halo exchange and handed to the site kernels. | ||||
| //   lo  : not used in this routine. | ||||
| //   U   : doubled gauge field; its grid supplies the outer-site count. | ||||
| //   in  : input fermion field. | ||||
| //   out : output fermion field written by the site kernels. | ||||
| //   dag : DaggerYes selects the *Dag kernels, anything else the plain ones. | ||||
| // | ||||
| // Accumulates into the timers alltime, commtime, jointime and dslashtime. | ||||
| template<class Impl> | ||||
| void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo, | ||||
|                                                 DoubledGaugeField & U, | ||||
|                                                 const FermionField &in, FermionField &out,int dag) | ||||
| { | ||||
|   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); | ||||
|   alltime-=usecond(); | ||||
|   Compressor compressor(dag); | ||||
|  | ||||
|   // Assume balanced KMP_AFFINITY; this is forced in GridThread.h | ||||
|  | ||||
|   int threads = GridThread::GetThreads(); | ||||
|   int HT      = GridThread::GetHyperThreads(); | ||||
|   int cores   = GridThread::GetCores(); | ||||
|   int nwork = U._grid->oSites(); | ||||
|  | ||||
|   commtime -=usecond(); | ||||
|   auto handle = st.HaloExchangeBegin(in,compressor); | ||||
|   st.HaloExchangeComplete(handle); | ||||
|   commtime +=usecond(); | ||||
|  | ||||
|   // Nothing is timed between these two calls. | ||||
|   jointime -=usecond(); | ||||
|   jointime +=usecond(); | ||||
|  | ||||
|   // Dhop takes the 4d grid from U, and makes a 5d index for fermion | ||||
|   // Note loop ordering and data layout. | ||||
|   // Designed to create  | ||||
|   // - per thread reuse in L1 cache for U | ||||
|   // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. | ||||
|  | ||||
|   // Start the dslash timer once, before the parallel region. It used to be | ||||
|   // decremented inside the region by every thread on each of the 100 repetitions | ||||
|   // (an unsynchronised write to a shared accumulator) while being incremented | ||||
|   // only once afterwards, so the accumulated value was meaningless. | ||||
|   dslashtime -=usecond(); | ||||
| #pragma omp parallel  | ||||
|   { | ||||
|   for(int jjj=0;jjj<100;jjj++){ | ||||
| #pragma omp barrier | ||||
|   if ( dag == DaggerYes ) { | ||||
|     if( this->HandOptDslash ) { | ||||
| #pragma omp for | ||||
|       for(int ss=0;ss<U._grid->oSites();ss++){ | ||||
|         int sU=0;   // always the same 4d site | ||||
|         for(int s=0;s<Ls;s++){ | ||||
|           int sF = s+Ls*sU; | ||||
|           Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out); | ||||
|         } | ||||
|       } | ||||
|     } else {  | ||||
|  | ||||
| #pragma omp for | ||||
|       for(int ss=0;ss<U._grid->oSites();ss++){ | ||||
|         for(int sd=0;sd<Ls;sd++){ | ||||
|           int sU=0; | ||||
|           int sF = sd+Ls*sU; | ||||
|           Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|   } else { | ||||
|     if( this->AsmOptDslash ) { | ||||
|  | ||||
|       // One loop iteration per thread index t: t is split into a core index | ||||
|       // (t/HT) and a hyperthread index (t%HT). GetWork is asked to divide the | ||||
|       // outer sites over cores and the Ls slices over hyperthreads. | ||||
| #pragma omp for | ||||
|       for(int t=0;t<threads;t++){ | ||||
|  | ||||
|         int hyperthread = t%HT; | ||||
|         int core        = t/HT; | ||||
|  | ||||
|         int sswork, swork,soff,ssoff,  sU,sF; | ||||
|  | ||||
|         GridThread::GetWork(nwork,core,sswork,ssoff,cores); | ||||
|         GridThread::GetWork(Ls   , hyperthread, swork, soff,HT); | ||||
|  | ||||
|         // ssoff is not used: the site index is fixed to 0 below. | ||||
|         for(int ss=0;ss<sswork;ss++){ | ||||
|           for(int s=soff;s<soff+swork;s++){ | ||||
|  | ||||
|             sU=0; | ||||
|             sF = s+Ls*sU; | ||||
|             Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out); | ||||
|           } | ||||
|         } | ||||
|       } | ||||
|     } else if( this->HandOptDslash ) { | ||||
| #pragma omp for | ||||
|       for(int ss=0;ss<U._grid->oSites();ss++){ | ||||
|         int sU=0; | ||||
|         for(int s=0;s<Ls;s++){ | ||||
|           int sF = s+Ls*sU; | ||||
|           Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out); | ||||
|         } | ||||
|       } | ||||
|     } else {  | ||||
| #pragma omp for | ||||
|       for(int ss=0;ss<U._grid->oSites();ss++){ | ||||
|         int sU=0; | ||||
|         for(int s=0;s<Ls;s++){ | ||||
|           int sF = s+Ls*sU;  | ||||
|           Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   } | ||||
|   } | ||||
|   dslashtime +=usecond(); | ||||
|   alltime+=usecond(); | ||||
| } | ||||
|  | ||||
|  | ||||
| template<class Impl> | ||||
| void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo, | ||||
| 						     DoubledGaugeField & U, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user