
merge upstream develop

Author: nmeyer-ur
Date:   2020-07-07 20:26:47 +02:00

326 changed files with 10335 additions and 9381 deletions

View File

@@ -14,6 +14,7 @@ std::string filestem(const int l)
int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
Grid_init(&argc,&argv);
int64_t threads = GridThread::GetThreads();
@@ -42,6 +43,6 @@ int main (int argc, char ** argv)
}
Grid_finalize();
#endif
return EXIT_SUCCESS;
}
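
A note on the pattern in this file, which recurs in the LIME-dependent benchmarks below: the whole benchmark body, including Grid_init and Grid_finalize, sits inside an #ifdef HAVE_LIME guard, and the change adds a return EXIT_SUCCESS after the guard so that a build configured without LIME still produces a program that exits cleanly. A minimal sketch of that skeleton (an illustration, not a hunk from the commit):

#include <Grid/Grid.h>
#include <cstdlib>
using namespace Grid;

int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
  Grid_init(&argc,&argv);
  // ... LIME-based I/O benchmark body, as in the hunks above ...
  Grid_finalize();
#endif
  return EXIT_SUCCESS; // reached even when HAVE_LIME is undefined
}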

View File

@@ -2,7 +2,7 @@
#define Benchmark_IO_hpp_
#include <Grid/Grid.h>
#ifdef HAVE_LIME
#define MSG std::cout << GridLogMessage
#define SEP \
"============================================================================="
@@ -104,4 +104,5 @@ void readBenchmark(const Coordinate &latt, const std::string filename,
}
#endif //LIME
#endif // Benchmark_IO_hpp_

View File

@@ -8,6 +8,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef HAVE_LIME
std::vector<std::string> dir;
unsigned int Ls;
bool rb;
@@ -73,6 +74,6 @@ int main (int argc, char ** argv)
}
Grid_finalize();
#endif
return EXIT_SUCCESS;
}

View File

@@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace Grid;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
@@ -76,7 +75,6 @@ struct controls {
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
// int HugePages;
};
class Benchmark {
@@ -119,14 +117,15 @@ public:
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
comms_header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
for(int lat=16;lat<=maxlat;lat+=8){
// for(int Ls=8;Ls<=8;Ls*=2){
{ int Ls=12;
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
std::cout << GridLogMessage<< latt_size <<std::endl;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
@@ -184,9 +183,6 @@ public:
}
timestat.statistics(t_time);
// for(int i=0;i<t_time.size();i++){
// std::cout << i<<" "<<t_time[i]<<std::endl;
// }
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
@@ -199,8 +195,6 @@ public:
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
@@ -227,14 +221,15 @@ public:
uint64_t NN;
uint64_t lmax=48;
uint64_t lmax=32;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=4){
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// NP= Grid.RankCount();
@@ -242,9 +237,9 @@ public:
Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=rn;
LatticeVec x(&Grid); x=rn;
LatticeVec y(&Grid); y=rn;
LatticeVec z(&Grid); z=Zero();
LatticeVec x(&Grid); x=Zero();
LatticeVec y(&Grid); y=Zero();
double a=2.0;
uint64_t Nloop=NLOOP;
@@ -252,9 +247,9 @@ public:
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v , x, CpuWrite);
autoView( y_v , y, CpuWrite);
autoView( z_v , z, CpuRead);
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
y_v[4]=z_v[4];
}
@@ -270,191 +265,8 @@ public:
}
};
#if 0
static double DWF5(int Ls,int L)
{
// RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
Coordinate internal;
if ( SHM == 1 ) internal = Coordinate({1,1,1,1});
else if ( SHM == 2 ) internal = Coordinate({2,1,1,1});
else if ( SHM == 4 ) internal = Coordinate({2,2,1,1});
else if ( SHM == 8 ) internal = Coordinate({2,2,2,1});
else assert(0);
Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (sFGrid);
LatticeFermion tmp (sFGrid);
std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
random(RNG5,src);
std::cout << GridLogMessage << "initialised random source" << std::endl;
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
LatticeFermion src_e (sFrbGrid);
LatticeFermion src_o (sFrbGrid);
LatticeFermion r_e (sFrbGrid);
LatticeFermion r_o (sFrbGrid);
LatticeFermion r_eo (sFGrid);
LatticeFermion err (sFGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
controls Cases [] = {
#ifdef AVX512
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 100;
uint64_t ncall = 1000;
double t0=usecond();
sFGrid->Barrier();
for(int i=0;i<nwarm;i++){
sDw.DhopEO(src_o,r_e,DaggerNo);
}
sFGrid->Barrier();
double t1=usecond();
sDw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
sDw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
sFGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
sDw.Report();
}
double robust = mflops_worst/mflops_best;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
#endif
static double DWF(int Ls,int L, double & robust)
static double DWF(int Ls,int L)
{
RealD mass=0.1;
RealD M5 =1.8;
@@ -471,37 +283,30 @@ public:
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
Coordinate internal;
if ( SHM == 1 ) internal = Coordinate({1,1,1,1});
else if ( SHM == 2 ) internal = Coordinate({2,1,1,1});
else if ( SHM == 4 ) internal = Coordinate({2,2,1,1});
else if ( SHM == 8 ) internal = Coordinate({2,2,2,1});
else assert(0);
Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -514,74 +319,31 @@ public:
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
typedef DomainWallFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
///////// Source preparation ////////////
LatticeFermion src (FGrid); random(RNG5,src);
LatticeFermion ref (FGrid);
LatticeFermion tmp (FGrid);
Gauge Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
Fermion src (FGrid); random(RNG5,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
RealD N2 = 1.0/::sqrt(norm2(src));
std::cout<<GridLogMessage << "Normalising src "<< N2 <<std::endl;
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
{
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
auto Umu_v = Umu.View();
auto Umu5d_v = Umu5d.View();
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
ref = Zero();
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
LatticeFermion err (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
#ifdef AVX512
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
@@ -594,15 +356,12 @@ public:
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 200;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
@@ -610,9 +369,7 @@ public:
}
FGrid->Barrier();
double t1=usecond();
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
// if (ncall < 500) ncall = 500;
uint64_t ncall = 1000;
uint64_t ncall = 50;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
@@ -649,24 +406,11 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
Dw.Report();
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-ref;
RealD absref = norm2(ref);
RealD abserr = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< abserr << " / " << absref<<std::endl;
assert(abserr<1.0e-4);
}
robust = mflops_worst/mflops_best;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
@@ -680,8 +424,166 @@ public:
return mflops_best;
}
static double Staggered(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
typedef ImprovedStaggeredFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU3::HotConfiguration(RNG4,Umu);
typename Action::ImplParams params;
Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
///////// Source preparation ////////////
Fermion src (FGrid); random(RNG4,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
const int num_cases = 4;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Ds.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1146.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
return mflops_best;
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
@@ -696,62 +598,50 @@ int main (int argc, char ** argv)
int do_memory=1;
int do_comms =1;
int do_su3 =0;
int do_wilson=1;
int do_dwf =1;
if ( do_su3 ) {
// empty for now
}
#if 1
int sel=2;
Coordinate L_list({8,12,16,24});
#else
int sel=1;
Coordinate L_list({8,12});
#endif
std::vector<int> L_list({16,24,32});
int selm1=sel-1;
std::vector<double> robust_list;
std::vector<double> wilson;
std::vector<double> dwf4;
std::vector<double> dwf5;
std::vector<double> staggered;
if ( do_wilson ) {
int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double robust;
wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
}
int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
}
int Ls=16;
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double robust;
double result = Benchmark::DWF(Ls,L_list[l],robust) ;
dwf4.push_back(result);
robust_list.push_back(robust);
}
Ls=12;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::DWF(Ls,L_list[l]) ;
dwf4.push_back(result);
}
if ( do_dwf ) {
/*
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::Staggered(L_list[l]) ;
staggered.push_back(result);
}
*/
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl;
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
int NN=NN_global;
if ( do_memory ) {
@@ -768,24 +658,20 @@ int main (int argc, char ** argv)
Benchmark::Comms();
}
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Grid_finalize();
}
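
The most common mechanical change in this commit, first visible in the z=a*x-y memory benchmark above and repeated in most files below, is the migration from auto x_v = x.View() to the scoped autoView(x_v, x, Mode) form, where the mode (CpuRead, CpuWrite, AcceleratorRead, AcceleratorWrite) declares where the view will be used and whether it reads or writes, so that host and device copies of a field can be kept coherent. A minimal self-contained sketch of the new idiom, using only calls that already appear in this diff:

#include <Grid/Grid.h>
using namespace Grid;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);
  Coordinate latt_size   = GridDefaultLatt();
  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  Coordinate mpi_layout  = GridDefaultMpi();
  GridCartesian Grid(latt_size,simd_layout,mpi_layout);

  LatticeColourMatrix x(&Grid); x=Zero();
  LatticeColourMatrix z(&Grid); z=Zero();
  {
    // Old API: auto x_v = x.View();  -- carried no access-mode information.
    // New API: a scoped view with an explicit target (Cpu/Accelerator)
    // and intent (Read/Write).
    autoView( x_v , x, CpuRead );   // x is only read in this scope
    autoView( z_v , z, CpuWrite);   // z is only written in this scope
    for(uint64_t ss=0;ss<z_v.size();ss++) z_v[ss] = x_v[ss];
  } // the views close with the scope, releasing both fields
  Grid_finalize();
  return 0;
}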

View File

@@ -21,7 +21,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_NVCC
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
@@ -129,8 +129,8 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
{
auto Umu5d_v = Umu5d.View();
auto Umu_v = Umu.View();
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
@@ -272,8 +272,8 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
}
@@ -282,8 +282,8 @@ int main (int argc, char ** argv)
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}

View File

@@ -130,11 +130,13 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
auto Umu5d_v = Umu5d.View();
auto Umu_v = Umu.View();
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
{
autoView(Umu5d_v , Umu5d, CpuWrite);
autoView( Umu_v , Umu, CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}

View File

@@ -79,7 +79,7 @@ int main (int argc, char ** argv)
double start=usecond();
thread_for(t,threads,{
auto x_t = x[t].View();
autoView( x_t , x[t],CpuRead);
sum[t] = x_t[0];
for(int i=0;i<Nloop;i++){
for(auto ss=x_t.begin();ss<x_t.end();ss++){

View File

@@ -177,9 +177,7 @@ int main (int argc, char ** argv)
Real nn;
double start=usecond();
for(int i=0;i<Nloop;i++){
auto x_v = x.View();
nn=norm2(x);
vsplat(x_v[0]._internal[0],nn);
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;

View File

@@ -85,11 +85,11 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
for(int i=0;i<Lblock;i++){
auto lhs_v = lhs[i].View();
autoView(lhs_v, lhs[i], CpuRead);
auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){
int idx = i+Lblock*j+Lblock*Rblock*r;
auto rhs_v = rhs[j].View();
autoView(rhs_v, rhs[j], CpuRead);
auto right = rhs_v[ss];
vector_type vv = left()(0)(0) * right()(0)(0)
+ left()(0)(1) * right()(0)(1)
@@ -221,12 +221,12 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
for(int i=0;i<Lblock;i++){
auto lhs_v=lhs[i].View();
autoView(lhs_v,lhs[i],CpuRead);
auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){
for(int mu=0;mu<Ngamma;mu++){
auto rhs_v = rhs[j].View();
autoView(rhs_v,rhs[j],CpuRead);
auto right = Gamma(gammas[mu])*rhs_v[ss];
vector_type vv = left()(0)(0) * right()(0)(0)
@@ -370,12 +370,12 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
int ss= so+n*stride+b;
for(int i=0;i<Lblock;i++){
auto lhs_v=lhs[i].View();
autoView(lhs_v,lhs[i],CpuRead);
auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){
SpinMatrix_v vv;
auto rhs_v = rhs[j].View();
autoView(rhs_v,rhs[j],CpuRead);
auto right = rhs_v[ss];
for(int s1=0;s1<Ns;s1++){
for(int s2=0;s2<Ns;s2++){
@@ -518,12 +518,12 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
for(int i=0;i<Lblock;i++){
auto lhs_v = lhs[i].View();
autoView(lhs_v,lhs[i],CpuRead);
auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){
SpinMatrix_v vv;
auto rhs_v = rhs[j].View();
autoView(rhs_v,rhs[j],CpuRead);
auto right = rhs_v[ss];
for(int s1=0;s1<Ns;s1++){
for(int s2=0;s2<Ns;s2++){
@@ -537,7 +537,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
// Trigger unroll
for ( int m=0;m<Nmom;m++){
int idx = m+base;
auto mom_v = mom[m].View();
autoView(mom_v,mom[m],CpuRead);
auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase);
}

View File

@@ -0,0 +1,176 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void benchDw(std::vector<int> & L, int Ls);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=12;
std::vector< std::vector<int> > latts;
#if 1
latts.push_back(std::vector<int> ({24,24,24,24}) );
latts.push_back(std::vector<int> ({48,24,24,24}) );
latts.push_back(std::vector<int> ({96,24,24,24}) );
latts.push_back(std::vector<int> ({96,48,24,24}) );
// latts.push_back(std::vector<int> ({96,48,48,24}) );
// latts.push_back(std::vector<int> ({96,48,48,48}) );
#else
// latts.push_back(std::vector<int> ({96,48,48,48}) );
latts.push_back(std::vector<int> ({96,96,96,192}) );
#endif
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne "<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
for (int l=0;l<latts.size();l++){
std::vector<int> latt4 = latts[l];
std::cout << GridLogMessage <<"\t";
for(int d=0;d<Nd;d++){
std::cout<<latt4[d]<<"x";
}
std::cout <<Ls<<"\t" ;
benchDw (latt4,Ls);
}
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
Grid_finalize();
}
void benchDw(std::vector<int> & latt4, int Ls)
{
/////////////////////////////////////////////////////////////////////////////////////
// for Nc=3
/////////////////////////////////////////////////////////////////////////////////////
// Dw : Ls*24*(7+48)= Ls*1320
//
// M5D: Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
// Meo: Ls*24*(7+48) + Ls*72 = Ls*1392
//
// Mee: 3*Ns*2*Nc*Ls // Chroma 6*N5*Nc*Ns
//
// LeemInv : 2*2*Nc*madd*Ls
// LeeInv : 2*2*Nc*madd*Ls
// DeeInv : 4*2*Nc*mul *Ls
// UeeInv : 2*2*Nc*madd*Ls
// UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
// QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
// Mpc => 1452*cbvol*2*Ls flops //
// => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
/////////////////////////////////////////////////////////////////////////////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// long unsigned int single_site_flops = 8*Nc*(7+16*Nc)*Ls;
long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;
long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls;
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
ColourMatrixF cm = ComplexF(1.0,0.0);
int ncall=300;
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
LatticeGaugeFieldF Umu(UGrid); Umu=Zero();
MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5);
LatticeFermionF src_o (FrbGrid); src_o=1.0;
LatticeFermionF r_o (FrbGrid); r_o=Zero();
int order =151;
SchurDiagOneOperator<MobiusFermionF,LatticeFermionF> Mpc(Dw);
Chebyshev<LatticeFermionF> Cheby(0.0,60.0,order);
{
Mpc.Mpc(src_o,r_o);
Mpc.Mpc(src_o,r_o);
Mpc.Mpc(src_o,r_o);
double t0=usecond();
for(int i=0;i<ncall;i++){
Mpc.Mpc(src_o,r_o);
}
double t1=usecond();
double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo so CB cancels.
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0);
flops=(single_site_quda_flops*volume*ncall);
std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";
// Cheby uses MpcDagMpc so 2x flops
for(int i=0;i<1;i++){
Cheby(Mpc,src_o,r_o);
t0=usecond();
Cheby(Mpc,src_o,r_o);
t1=usecond();
flops=(single_site_mpc_flops*volume*2*order);
std::cout <<"\t"<<flops/(t1-t0);
flops=(single_site_quda_flops*volume*2*order);
std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
std::cout <<std::endl;
}
}
// Dw.Report();
}
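
The comment block at the top of benchDw derives the per-site flop counts that feed single_site_mpc_flops and single_site_quda_flops. A standalone arithmetic check of that derivation for Nc=3, Ls=12 (a verification sketch, not part of the commit):

#include <cassert>
#include <iostream>

int main ()
{
  const long Nc = 3, Ls = 12;
  long dw   = 8*Nc*(7+16*Nc);                    // Deo: 24*(7+48) = 1320 flops per 4d site
  long mpc  = dw*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;  // two Deo applies plus the Mee^-1 pieces
  long quda = dw*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls; // QUDA-style dense Ls x Ls count for Mee^-1
  assert(dw  == 1320);                           // matches "Dw : Ls*24*(7+48)= Ls*1320"
  assert(mpc == 1452*2*Ls);                      // matches "Mpc => 1452*cbvol*2*Ls flops"
  std::cout << "Mpc flops/site  = " << mpc  << std::endl;  // 34848
  std::cout << "QUDA flops/site = " << quda << std::endl;  // 38880
  return 0;
}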

View File

@@ -88,25 +88,6 @@ int main (int argc, char ** argv)
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
ref = Zero();
/*
{ // Naive wilson implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
tmp = U[mu]*Cshift(src,mu,1);
for(int i=0;i<ref._odata.size();i++){
ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu,-1);
for(int i=0;i<ref._odata.size();i++){
ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
}
}
}
ref = -0.5*ref;
*/
RealD mass=0.1;
RealD c1=9.0/8.0;
@@ -125,10 +106,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Grid_finalize();
}

View File

@@ -0,0 +1,114 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_staggered.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid);
pRNG.SeedFixedIntegers(seeds);
// pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
typedef typename ImprovedStaggeredFermionF::FermionField FermionField;
typename ImprovedStaggeredFermionF::ImplParams params;
FermionField src (&Grid); random(pRNG,src);
FermionField result(&Grid); result=Zero();
FermionField ref(&Grid); ref=Zero();
FermionField tmp(&Grid); tmp=Zero();
FermionField err(&Grid); err=Zero();
LatticeGaugeFieldF Umu(&Grid); random(pRNG,Umu);
std::vector<LatticeColourMatrixF> U(4,&Grid);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
// Only one non-zero (y)
#if 0
Umu=Zero();
Complex cone(1.0,0.0);
for(int nn=0;nn<Nd;nn++){
random(pRNG,U[nn]);
if(1) {
if (nn!=2) { U[nn]=Zero(); std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
// else { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
else { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; }
}
PokeIndex<LorentzIndex>(Umu,U[nn],nn);
}
#endif
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionF Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t0=usecond();
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t1=usecond();
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146 per site
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
Grid_finalize();
}
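
The flop count in the timing loop above packs its derivation into one expression. Unpacking it (a verification sketch, not part of the commit): each improved-staggered site appears to do 16 SU(3) matrix-vector multiplies (8 one-hop and 8 three-hop directions) at 3*(6+8+8)=66 flops each, plus 15 colour-vector additions at 3*2=6 flops each:

#include <cassert>

int main ()
{
  const int matvec = 3*(6+8+8);                   // one SU(3) matrix-vector multiply: 66 flops
  const int flops_per_site = 16*matvec + 15*3*2;  // 1056 + 90
  assert(flops_per_site == 1146);                 // matches the 1146 quoted in the code comment
  return 0;
}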

View File

@@ -41,7 +41,7 @@ int main (int argc, char ** argv)
#define LADD (8)
int64_t Nwarm=20;
int64_t Nloop=500;
int64_t Nloop=50;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@@ -66,9 +66,9 @@ int main (int argc, char ** argv)
LatticeColourMatrix x(&Grid);// random(pRNG,x);
LatticeColourMatrix y(&Grid);// random(pRNG,y);
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v , x, AcceleratorRead);
autoView( y_v , y, AcceleratorRead);
autoView( z_v , z, AcceleratorWrite);
const uint64_t Nsite = x_v.size();
const uint64_t nsimd = vComplex::Nsimd();
for(int64_t i=0;i<Nwarm;i++){
@@ -116,9 +116,9 @@ int main (int argc, char ** argv)
LatticeColourMatrix x(&Grid);// random(pRNG,x);
LatticeColourMatrix y(&Grid);// random(pRNG,y);
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v , x, AcceleratorWrite);
autoView( y_v , y, AcceleratorRead);
autoView( z_v , z, AcceleratorRead);
const uint64_t Nsite = x_v.size();
const uint64_t nsimd = vComplex::Nsimd();
for(int64_t i=0;i<Nwarm;i++){
@@ -167,9 +167,9 @@ int main (int argc, char ** argv)
LatticeColourMatrix x(&Grid);// random(pRNG,x);
LatticeColourMatrix y(&Grid);// random(pRNG,y);
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
autoView( x_v , x, AcceleratorRead);
autoView( y_v , y, AcceleratorRead);
autoView( z_v , z, AcceleratorWrite);
const uint64_t Nsite = x_v.size();
const uint64_t nsimd = vComplex::Nsimd();
for(int64_t i=0;i<Nwarm;i++){
@@ -220,10 +220,10 @@ int main (int argc, char ** argv)
LatticeColourMatrix y(&Grid);// random(pRNG,y);
LatticeColourMatrix w(&Grid);// random(pRNG,y);
auto x_v = x.View();
auto y_v = y.View();
auto z_v = z.View();
auto w_v = z.View();
autoView( x_v , x, AcceleratorRead);
autoView( y_v , y, AcceleratorRead);
autoView( z_v , z, AcceleratorRead);
autoView( w_v , w, AcceleratorWrite);
const uint64_t Nsite = x_v.size();
const uint64_t nsimd = vComplex::Nsimd();
for(int64_t i=0;i<Nwarm;i++){
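
Two things happen in these su3 hunks. First, the access modes become explicit: AcceleratorRead for inputs, AcceleratorWrite for the output of each kernel. Second, the last hunk fixes a latent aliasing bug, since w_v was previously constructed from z rather than w. As a sketch of the read/write discipline, here is the same pattern in the host-threaded form used elsewhere in this diff (thread_for appears in the memory benchmark above); x, y, z stand for the benchmark's LatticeColourMatrix fields, and the loop body is an illustration:

{
  autoView( x_v , x, CpuRead );   // input: read-only view
  autoView( y_v , y, CpuRead );   // input: read-only view
  autoView( z_v , z, CpuWrite);   // output: declared writable up front
  thread_for(ss, x_v.size(), {
    z_v[ss] = x_v[ss]*y_v[ss];    // per-site SU(3) matrix multiply
  });
} // mismatching the mode (e.g. reading through a write-only view) risks stale data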

View File

@@ -125,8 +125,8 @@ int main (int argc, char ** argv)
// ref = src + Gamma(Gamma::Algebra::GammaX)* src ; // 1-gamma_x
tmp = U[mu]*Cshift(src,mu,1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuWrite);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}
@@ -135,8 +135,8 @@ int main (int argc, char ** argv)
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu,-1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuWrite);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
}
@@ -209,8 +209,8 @@ int main (int argc, char ** argv)
for(int ss=0;ss<0;ss++ ){
for(int i=0;i<Ns;i++){
for(int j=0;j<Nc;j++){
auto ref_v = ref.View();
auto result_v = result.View();
autoView( ref_v, ref, CpuWrite);
autoView( result_v, result, CpuWrite);
ComplexF * ref_p = (ComplexF *)&ref_v[ss]()(i)(j);
ComplexF * res_p = (ComplexF *)&result_v[ss]()(i)(j);
std::cout<<GridLogMessage << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
@@ -226,8 +226,8 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu,1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuWrite);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
}
@@ -236,8 +236,8 @@ int main (int argc, char ** argv)
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu,-1);
{
auto ref_v = ref.View();
auto tmp_v = tmp.View();
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuWrite);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}