1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

GPU tweaks for benchmarking; really necessary?

This commit is contained in:
paboyle 2018-06-13 20:26:07 +01:00
parent 94d1ae4c82
commit 25becc9324
3 changed files with 23 additions and 31 deletions

View File

@ -170,12 +170,13 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGpu ) std::cout << GridLogMessage<< "* Using Gpu WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =500;
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
@ -201,7 +202,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
// exit(0);
/*
if(( norm2(err)>1.0e-4) ) {
std::cout << "RESULT\n " << result<<std::endl;
@ -256,6 +257,7 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGpu ) std::cout << GridLogMessage<< "* Using Gpu WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
@ -324,12 +326,10 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric )
std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm )
std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGpu ) std::cout << GridLogMessage<< "* Using Gpu WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
LatticeFermion sr_eo(sFGrid);
@ -475,6 +475,7 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGpu ) std::cout << GridLogMessage<< "* Using Gpu WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

View File

@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
@ -55,8 +54,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t lmax=96;
#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
uint64_t lmax=48;
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
@ -73,14 +72,10 @@ int main (int argc, char ** argv)
double a=2.0;
axpy(z,a,x,y);
double start=usecond();
for(int i=0;i<Nloop;i++){
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
axpy(z,a,x,y);
x_v[0]=z_v[0]; // serial loop dependence to prevent optimise
y_v[4]=z_v[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
@ -112,14 +107,10 @@ int main (int argc, char ** argv)
uint64_t Nloop=NLOOP;
z=a*x-y;
double start=usecond();
for(int i=0;i<Nloop;i++){
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
z=a*x-y;
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
y_v[4]=z_v[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
@ -151,12 +142,10 @@ int main (int argc, char ** argv)
LatticeVec y(&Grid);// random(pRNG,y);
RealD a=2.0;
z=a*x;
double start=usecond();
for(int i=0;i<Nloop;i++){
auto x_v = x.View();
auto z_v = z.View();
z=a*x;
x_v[0]=z_v[0]*2.0;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -31,16 +31,17 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#define LMAX (32)
#define LMIN (4)
#define LADD (4)
#define LMAX (48)
#define LMIN (8)
#define LADD (8)
int64_t Nwarm=5;
int64_t Nloop=25;
int64_t Nwarm=50;
int64_t Nloop=500;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@ -141,6 +142,7 @@ int main (int argc, char ** argv)
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
mult(z,x,y);
// mac(z,x,y);
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
@ -179,7 +181,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
double bytes=3*vol*Nc*Nc*sizeof(Complex);
double bytes=4*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(8+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;