mirror of https://github.com/paboyle/Grid.git synced 2025-06-17 23:37:06 +01:00

fixed conflicts after merging paboyle develop

Mohammad Atif
2023-07-03 11:46:37 -04:00
408 changed files with 28450 additions and 4699 deletions

View File

@@ -420,7 +420,6 @@ public:
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
@@ -589,7 +588,6 @@ public:
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Ds.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);

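The two hunks above remove the per-operator Dw.ZeroCounters()/Ds.ZeroCounters() bookkeeping while keeping the time_statistics sampling that follows. A minimal sketch of the counter-free pattern, assuming Grid's usecond() timer and the statistics(std::vector<double>&) entry point that time_statistics exposes elsewhere in the benchmark suite:

// Counter-free timing, as left by the hunks above (sketch).
time_statistics timestat;
std::vector<double> t_time(ncall);
for(int i=0;i<ncall;i++){
  double t0=usecond();
  Dw.Dhop(src,result,0);       // the kernel being timed
  t_time[i] = usecond()-t0;    // one sample per call
}
timestat.statistics(t_time);   // mean/err/min/max replace the old counters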
View File

@@ -217,10 +217,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes,mu);
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
@@ -228,10 +228,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes,mu+4);
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
@@ -309,10 +309,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes,mu);
recv_from_rank,1,
bytes,bytes,mu);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
@@ -322,10 +322,10 @@ int main (int argc, char ** argv)
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes,mu+4);
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
@@ -411,8 +411,8 @@ int main (int argc, char ** argv)
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
int tid = omp_get_thread_num();
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes,tid);
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
thread_critical { dbytes+=tbytes; }
}

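The only change in this file is the call shape of StencilSendToRecvFrom and StencilSendToRecvFromBegin: each rank argument gains a trailing 1 and the byte count is passed twice. Read from the hunks alone, the new arguments look like send/receive enable flags plus separate transmit and receive byte counts; that reading is an assumption, not something this page confirms. A sketch of one updated call:

// Sketch of the new call shape, inferred from the hunks above.
std::vector<CommsRequest_t> requests;
double dbytes = 0;
dbytes += Grid.StencilSendToRecvFromBegin(requests,
              (void *)&xbuf[mu][0], xmit_to_rank, 1,    // assumed: do-send flag
              (void *)&rbuf[mu][0], recv_from_rank, 1,  // assumed: do-recv flag
              bytes, bytes,                             // xmit bytes, recv bytes
              mu);
Grid.StencilSendToRecvFromComplete(requests,mu);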
View File

@@ -169,7 +169,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@@ -183,19 +183,16 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@@ -239,7 +236,6 @@ int main (int argc, char ** argv)
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
Dw.Report();
}
if (1)
@@ -301,7 +297,7 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
@@ -314,7 +310,6 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
@@ -336,7 +331,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);

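Two changes run through this file: DomainWallFermionR is replaced by the explicitly double-precision DomainWallFermionD, and the __SSC_START/__SSC_STOP markers together with the ZeroCounters()/Report() calls disappear from the timing loop. Assembled from the hunks, the resulting loop is simply (a sketch, not the full file):

DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
Dw.Dhop(src,result,0);            // warmup
FGrid->Barrier();
double t0=usecond();
for(int i=0;i<ncall;i++){
  Dw.Dhop(src,result,0);          // no __SSC_* markers around the call any more
}
double t1=usecond();
FGrid->Barrier();
err = ref-result;                 // verify against the Cshift reference
assert(norm2(err) < 1.0e-4);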
View File

@@ -32,18 +32,18 @@
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet);
int main (int argc, char ** argv)
{
@@ -52,39 +52,115 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=16;
for(int i=0;i<argc;i++)
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet);
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet);
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
@@ -100,37 +176,38 @@ int main (int argc, char ** argv)
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
std::vector<ColourMatrixField> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
@@ -177,10 +254,8 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -190,19 +265,21 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@@ -210,8 +287,8 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -220,29 +297,19 @@ int main (int argc, char ** argv)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
Dw.Report();
assert (n2e< 1.0e-4 );
}
if (1)
@@ -286,26 +353,27 @@ int main (int argc, char ** argv)
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
@@ -317,10 +385,8 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -330,18 +396,11 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
@@ -352,7 +411,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
@@ -366,14 +424,9 @@ int main (int argc, char ** argv)
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
@@ -382,6 +435,4 @@ int main (int argc, char ** argv)
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

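The three Benchmark(Ls,Dirichlet) passes above differ only in how Dirichlet is filled: all zeros for full communication, the per-node block for the no-internode pass, and the per-rank block for the no-intranode pass. A worked example, under an assumed geometry that is not taken from this commit:

// Assumed geometry (illustrative only): 32^4 global lattice,
// mpi = {2,2,2,2}, shm = {2,2,1,1}  ->  each rank owns a 16^4 block.
//
// No-internode pass:  CommDim[d] = (mpi[d]/shm[d])>1 ? 1 : 0 = {0,0,1,1}
//   Dirichlet[d+1] = CommDim[d]*latt4[d]/mpi[d]*shm[d]
//                  -> Dirichlet = {0, 0, 0, 16, 16}
//   (only the directions that cross node boundaries are cut).
//
// No-intranode pass:  CommDim[d] = mpi[d]>1 ? 1 : 0 = {1,1,1,1}
//   Dirichlet[d+1] = CommDim[d]*latt4[d]/mpi[d]
//                  -> Dirichlet = {0, 16, 16, 16, 16}
//   (every rank boundary becomes a Dirichlet wall).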
View File

@@ -0,0 +1,387 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
LatticeFermionF src1 (FGrid); random(RNG5,src1);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =100;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src1,result,0);
Dw.Dhop(src,result,0);
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-4 );
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
}
}
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}
}
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

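The data_rf and data_mem expressions in this file encode a simple traffic model: a spinor carries Nd*Nc = 12 complex numbers, the stencil reads 2*Nd+1 = 9 spinors per site, and 2*Nd = 8 gauge links of Nc*Nc complex each; the memory model counts the gauge field once per 4d site (the volume/Ls term) while the register-file model replicates it across Ls. Restated as a standalone sketch (same formulas as in the file, nothing new):

double bytes_per_complex = (double)simdwidth / nsimd;   // bytes per complex number
double data_rf  = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc)
                * bytes_per_complex * ncall / (1024.*1024.*1024.);
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) * 2*Nd*Nc*Nc)
                * bytes_per_complex * ncall / (1024.*1024.*1024.);
// t1-t0 is in microseconds, hence the 1.0e6 factor to reach GiB/s:
std::cout << "RF  GiB/s (base 2) = " << 1.0e6 * data_rf /(t1-t0) << std::endl;
std::cout << "mem GiB/s (base 2) = " << 1.0e6 * data_mem/(t1-t0) << std::endl;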
View File

@@ -0,0 +1,465 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}

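The new reference implementation in this file splits every s-column at dwf_compressor_depth: Umu is always Dirichlet-filtered, while UmuCopy is filtered only when partial==0, so with partial Dirichlet the outermost depth s-slices keep the full links and only the interior slices see the cut. The core of that pattern, lifted from the hunks with added comments:

// Partial-Dirichlet link selection (from the reference loop above).
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
  for(int s=0;s<Ls;s++){
    if ( (s<depth) || (s>=Ls-depth) ){
      tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s]; // boundary s-slices: unfiltered links when partial
    } else {
      tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];     // interior s-slices: Dirichlet-cut links
    }
  }
}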
View File

@@ -168,7 +168,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
double t0=usecond();
Dw.Dhop(src,result,0);

View File

@@ -93,14 +93,11 @@ int main (int argc, char ** argv)
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
@@ -114,7 +111,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
@@ -136,14 +132,11 @@ int main (int argc, char ** argv)
GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
if (1) {
FGrid_d->Barrier();
DwD.ZeroCounters();
DwD.Dhop(src_d,result_d,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwD.Dhop(src_d,result_d,0);
__SSC_STOP;
}
double t1=usecond();
FGrid_d->Barrier();
@@ -157,7 +150,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
DwD.Report();
}
#endif
Grid_finalize();

View File

@@ -67,17 +67,17 @@ int main (int argc, char ** argv)
const int ncall=1000;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src(FGrid); random(RNG5,src);
LatticeFermion result(FGrid);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
double t0,t1;
typedef typename DomainWallFermionR::Coeff_t Coeff_t;
typedef typename DomainWallFermionD::Coeff_t Coeff_t;
Vector<Coeff_t> diag = Dw.bs;
Vector<Coeff_t> upper= Dw.cs;
Vector<Coeff_t> lower= Dw.cs;
@@ -103,35 +103,30 @@ int main (int argc, char ** argv)
#define BENCH_DW(A,...) \
Dw. A (__VA_ARGS__); \
FGrid->Barrier(); \
Dw.CayleyZeroCounters(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
Dw. A (__VA_ARGS__); \
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_ZDW(A,in,out) \
zDw. A (in,out); \
FGrid->Barrier(); \
zDw.CayleyZeroCounters(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
zDw. A (in,out); \
} \
t1=usecond(); \
FGrid->Barrier(); \
zDw.CayleyReport(); \
std::cout<<GridLogMessage << "Called ZDw " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_DW_SSC(A,in,out) \
Dw. A (in,out); \
FGrid->Barrier(); \
Dw.CayleyZeroCounters(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
__SSC_START ; \
@@ -140,7 +135,6 @@ int main (int argc, char ** argv)
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;

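With CayleyZeroCounters()/CayleyReport() removed, the BENCH_DW macro reduces to warm up, barrier, time ncall repetitions, barrier, report. Assembled from the hunks (BENCH_ZDW is the same with zDw in place of Dw):

#define BENCH_DW(A,...)                         \
  Dw. A (__VA_ARGS__);                          \
  FGrid->Barrier();                             \
  t0=usecond();                                 \
  for(int i=0;i<ncall;i++){                     \
    Dw. A (__VA_ARGS__);                        \
  }                                             \
  t1=usecond();                                 \
  FGrid->Barrier();                             \
  std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl; \
  std::cout<<GridLogMessage << "******************"<<std::endl;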
View File

@@ -0,0 +1,189 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_prec_change.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int Ls = 12;
Coordinate latt4 = GridDefaultLatt();
GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionD field_d(FGridD), tmp_d(FGridD);
random(RNG5,field_d); tmp_d = field_d;
LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
precisionChange(field_d2, field_d); tmp_d2 = field_d2;
LatticeFermionF field_f(FGridF), tmp_f(FGridF);
precisionChange(field_f, field_d); tmp_f = field_f;
int N = 500;
double time_ds = 0, time_sd = 0;
std::cout<<GridLogMessage << "Benchmarking single<->double original implementation (fields initially device-resident)" << std::endl;
for(int i=0;i<N;i++){
//We want to benchmark the typical scenario of both fields being device resident
//To do this, invoke an operation that will open a device view and touch all sites
//with a write operation that invalidates the CPU copy
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChangeOrig(field_d,field_f);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChangeOrig(field_f,field_d);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
std::cout<<GridLogMessage << "Benchmarking single<->double with pregenerated workspace(fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d,field_f, wk_sp_to_dp);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d, wk_dp_to_sp);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d,field_f);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double2 (fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d2 = tmp_d2;
field_f = tmp_f;
double start=usecond();
precisionChangeFast(field_d2,field_f);
double stop=usecond();
time_sd += stop - start;
field_d2 = tmp_d2;
field_f = tmp_f;
start=usecond();
precisionChangeFast(field_f,field_d2);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d2 = tmp_d2;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d2,field_f);
double stop=usecond();
time_sd += stop - start;
field_d2 = tmp_d2;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d2);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
Grid_finalize();
}

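This benchmark compares three conversion paths: precisionChangeOrig, precisionChange with a pregenerated precisionChangeWorkspace, and precisionChangeFast between layout-matched types. A short usage sketch of the workspace path, with the argument order as in the file (destination field first, so each workspace is built from the destination and source grids in that order):

// Build once, reuse for every conversion between the same pair of grids.
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(), field_f.Grid()); // single -> double
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(), field_d.Grid()); // double -> single
for(int i=0;i<N;i++){
  precisionChange(field_d, field_f, wk_sp_to_dp); // up-convert
  precisionChange(field_f, field_d, wk_dp_to_sp); // down-convert
}
// For layout-matched pairs (LatticeFermionD2 <-> LatticeFermionF) the file
// also times precisionChangeFast, which the standard precisionChange call
// is expected to match (see the final test above).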
View File

@@ -53,8 +53,8 @@ int main (int argc, char ** argv)
pRNG.SeedFixedIntegers(seeds);
// pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
typename ImprovedStaggeredFermionR::ImplParams params;
typedef typename ImprovedStaggeredFermionD::FermionField FermionField;
typename ImprovedStaggeredFermionD::ImplParams params;
FermionField src (&Grid); random(pRNG,src);
FermionField result(&Grid); result=Zero();
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;

View File

@@ -146,16 +146,15 @@ int main (int argc, char ** argv)
ref = -0.5*ref;
RealD mass=0.1;
typename WilsonFermionR::ImplParams params;
typename WilsonFermionD::ImplParams params;
WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
WilsonFermionD Dw(Umu,Grid,RBGrid,mass,params);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=1000;
//int ncall=1;
// Counters
Dw.ZeroCounters();
Grid.Barrier();
double t0=usecond();
@@ -201,7 +200,6 @@ int main (int argc, char ** argv)
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
// guard
double err0 = norm2(err);

View File

@@ -40,21 +40,21 @@ Gamma::Algebra Gmu [] = {
void bench_wilson (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag );
void bench_wilson_eo (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag );
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
typename WilsonFermionR::ImplParams params;
typename WilsonFermionD::ImplParams params;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@@ -66,7 +66,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Number of colours "<< Nc <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
@@ -110,7 +110,7 @@ int main (int argc, char ** argv)
double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
WilsonFermionD Dw(Umu,Grid,RBGrid,mass,params);
// Full operator
bench_wilson(src,result,Dw,volume,DaggerNo);
@@ -130,7 +130,7 @@ int main (int argc, char ** argv)
void bench_wilson (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag )
{
@@ -149,7 +149,7 @@ void bench_wilson (
void bench_wilson_eo (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
WilsonFermionD & Dw,
double const volume,
int const dag )
{