mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Benchmark prep

Peter Boyle 2017-08-25 09:25:54 +01:00
parent b49bec0cec
commit c3b1263e75
10 changed files with 494 additions and 71 deletions

View File

@ -32,6 +32,19 @@ using namespace std;
using namespace Grid; using namespace Grid;
using namespace Grid::QCD; using namespace Grid::QCD;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
struct time_statistics{ struct time_statistics{
double mean; double mean;
@ -95,13 +108,15 @@ public:
static void Comms(void) static void Comms(void)
{ {
int Nloop=100; int Nloop=1000;
int nmu=0; int nmu=0;
int maxlat=32; int maxlat=32;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi(); std::vector<int> mpi_layout = GridDefaultMpi();
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::vector<double> t_time(Nloop); std::vector<double> t_time(Nloop);
time_statistics timestat; time_statistics timestat;
@ -133,13 +148,14 @@ public:
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
} }
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
int ncomm;
double dbytes; double dbytes;
std::vector<double> times(Nloop);
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
double start=usecond(); double start=usecond();
std::vector<CartesianCommunicator::CommsRequest_t> requests;
dbytes=0; dbytes=0;
ncomm=0; ncomm=0;
@ -150,7 +166,6 @@ public:
if (mpi_layout[mu]>1 ) { if (mpi_layout[mu]>1 ) {
ncomm++;
int xmit_to_rank; int xmit_to_rank;
int recv_from_rank; int recv_from_rank;
if ( dir == mu ) { if ( dir == mu ) {
@ -160,18 +175,18 @@ public:
int comm_proc = mpi_layout[mu]-1; int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} }
Removed:
#if 1
	  tbytes= Grid.StencilSendToRecvFromBegin(requests,
						  (void *)&xbuf[dir][0],
						  xmit_to_rank,
						  (void *)&rbuf[dir][0],
						  recv_from_rank,
						  bytes,dir);
	  Grid.StencilSendToRecvFromComplete(requests,dir);
#endif
	  requests.resize(0);
#pragma omp atomic
Added:
	  tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
					     (void *)&rbuf[dir][0], recv_from_rank,
					     bytes,dir);
#ifdef GRID_OMP
#pragma omp atomic
#endif
	  ncomm++;
#ifdef GRID_OMP
#pragma omp atomic
#endif
dbytes+=tbytes; dbytes+=tbytes;
} }
} }
@ -181,13 +196,15 @@ public:
} }
timestat.statistics(t_time); timestat.statistics(t_time);
// for(int i=0;i<t_time.size();i++){
// std::cout << i<<" "<<t_time[i]<<std::endl;
// }
dbytes=dbytes*ppn; dbytes=dbytes*ppn;
double xbytes = dbytes*0.5; double xbytes = dbytes*0.5;
double rbytes = dbytes*0.5; double rbytes = dbytes*0.5;
double bidibytes = dbytes; double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
@ -196,7 +213,8 @@ public:
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
} }
return; return;
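For reference, the figures in the table above come from per-iteration wall-clock samples. The following standalone sketch (not part of the commit; the helper name reduce() and the sample values are invented) shows the mean/error/min/max reduction and the bytes-per-microsecond bandwidth arithmetic the report uses.

// Standalone sketch of the bandwidth bookkeeping used by the Comms() table.
// xbytes counts half of the measured traffic (sends only); rates are bytes
// per microsecond, i.e. MB/s.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

struct time_stats { double mean, err, min, max; };

time_stats reduce(const std::vector<double> &t_usec) {
  time_stats s{0.0, 0.0, t_usec[0], t_usec[0]};
  for (double t : t_usec) { s.mean += t; s.min = std::min(s.min, t); s.max = std::max(s.max, t); }
  s.mean /= t_usec.size();
  double var = 0.0;
  for (double t : t_usec) var += (t - s.mean) * (t - s.mean);
  s.err = std::sqrt(var) / t_usec.size();   // error on the mean
  return s;
}

int main() {
  std::vector<double> t_usec = {105.0, 98.0, 101.0, 99.0}; // per-iteration times (hypothetical)
  double dbytes = 4.0 * 1048576.0;                         // bytes moved per iteration (hypothetical)
  time_stats s = reduce(t_usec);
  double xbytes = 0.5 * dbytes, bidibytes = dbytes;        // send-only and send+receive traffic
  std::cout << "send  " << xbytes / s.mean << " MB/s +/- "
            << xbytes * s.err / (s.mean * s.mean) << "\n";
  std::cout << "bidir " << bidibytes / s.mean << " MB/s ("
            << bidibytes / s.max << " - " << bidibytes / s.min << ")\n";
}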
@ -218,7 +236,7 @@ public:
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t lmax=48; uint64_t lmax=48;
#define NLOOP (50*lmax*lmax*lmax*lmax/lat/lat/lat/lat) #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=4){ for(int lat=8;lat<=lmax;lat+=4){
@ -253,8 +271,7 @@ public:
} }
}; };
static double DWF5(int Ls,int L)
static void DWF(int Ls,int L)
{ {
RealD mass=0.1; RealD mass=0.1;
RealD M5 =1.8; RealD M5 =1.8;
@ -262,6 +279,7 @@ public:
double mflops; double mflops;
double mflops_best = 0; double mflops_best = 0;
double mflops_worst= 0; double mflops_worst= 0;
std::vector<double> mflops_all;
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// Set/Get the layout & grid size // Set/Get the layout & grid size
@ -274,6 +292,189 @@ public:
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount(); uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount(); uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
std::vector<int> internal;
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
else assert(0);
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
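The block above splits the MPI grid into an intranode part, driven by ranks-per-node (SHM = NP/NN), and a node-level part, then rebuilds the global lattice from the requested local volume. A standalone sketch of that arithmetic, assuming the same power-of-two intranode splittings as the diff (all concrete values are hypothetical):

// Rank decomposition: ranks-per-node -> intranode layout, then
// nodes = mpi/internal and global extent = local * nodes per dimension.
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> mpi   = {2, 2, 2, 2};     // MPI ranks per dimension (hypothetical)
  std::vector<int> local = {16, 16, 16, 16}; // requested local volume L^4
  int NP = 16, NN = 4;                       // ranks and nodes (hypothetical)
  int SHM = NP / NN;                         // ranks per node

  std::vector<int> internal;
  if      (SHM == 1) internal = {1, 1, 1, 1};
  else if (SHM == 2) internal = {2, 1, 1, 1};
  else if (SHM == 4) internal = {2, 2, 1, 1};
  else if (SHM == 8) internal = {2, 2, 2, 1};
  else assert(0 && "unsupported ranks-per-node");

  for (int d = 0; d < 4; d++) {
    int nodes = mpi[d] / internal[d];        // node grid in this dimension
    int latt  = local[d] * nodes;            // global extent in this dimension
    std::cout << "dim " << d << ": nodes " << nodes << " global " << latt << "\n";
  }
}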
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (sFGrid); random(RNG5,src);
LatticeFermion tmp (sFGrid);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
LatticeFermion src_e (sFrbGrid);
LatticeFermion src_o (sFrbGrid);
LatticeFermion r_e (sFrbGrid);
LatticeFermion r_o (sFrbGrid);
LatticeFermion r_eo (sFGrid);
LatticeFermion err (sFGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
controls Cases [] = {
#ifdef AVX512
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
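Each entry of Cases pairs a kernel variant with a comms strategy and a communicator policy, and the loop below installs those static policy knobs before timing. A minimal standalone sketch of that table-driven pattern, not part of the commit (the enum and field names are invented for illustration):

// Table-driven benchmarking: one struct per configuration, one loop that
// installs the configuration and runs the same timed kernel.
#include <iostream>
#include <vector>

enum class Kernel { Generic, HandUnroll, InlineAsm };
enum class Comms  { Sequential, Overlapped };

struct Case { Kernel opt; Comms comms; const char *tag; };

int main() {
  std::vector<Case> cases = {
    {Kernel::HandUnroll, Comms::Sequential, "U/S"},
    {Kernel::HandUnroll, Comms::Overlapped, "U/O"},
    {Kernel::Generic,    Comms::Sequential, "G/S"},
    {Kernel::Generic,    Comms::Overlapped, "G/O"},
  };
  for (const Case &c : cases) {
    // in Grid this sets WilsonKernelsStatic::Opt / Comms and the communicator
    // policy, then runs the warm-up and timed loop for this case
    std::cout << "running case " << c.tag << "\n";
  }
}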
for(int c=0;c<num_cases;c++) {
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 200;
double t0=usecond();
sFGrid->Barrier();
for(int i=0;i<nwarm;i++){
sDw.DhopEO(src_o,r_e,DaggerNo);
}
sFGrid->Barrier();
double t1=usecond();
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
// if (ncall < 500) ncall = 500;
uint64_t ncall = 1000;
sFGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
sDw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
sDw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
sFGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
sDw.Report();
}
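The mflop/s figures above come from a fixed flop count per Dslash application: 1344 flops per 4-d site times Ls, halved for the even-odd (Deo) operator, divided by the measured time in microseconds. A worked standalone sketch of that conversion (the 1344 count is taken from the benchmark above; lattice sizes, rank counts and the timing are hypothetical):

// Convert a Deo timing into mflop/s, plus per-rank and per-node figures.
#include <iostream>

int main() {
  double Ls = 16;                         // fifth dimension
  double NP = 16, NN = 4;                 // ranks and nodes (hypothetical)
  double latt[4] = {32, 32, 32, 32};      // global lattice (hypothetical)

  double volume = Ls;
  for (int mu = 0; mu < 4; mu++) volume *= latt[mu];
  double flops = 1344.0 * volume / 2.0;   // Deo touches half the sites

  double t_mean_usec = 7500.0;            // mean time per call (hypothetical)
  double mflops = flops / t_mean_usec;    // flops per microsecond == Mflop/s
  std::cout << "Deo mflop/s          " << mflops      << "\n"
            << "Deo mflop/s per rank " << mflops / NP << "\n"
            << "Deo mflop/s per node " << mflops / NN << "\n";
}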
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
static double DWF(int Ls,int L)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
std::vector<int> local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN; uint64_t SHM=NP/NN;
std::vector<int> internal; std::vector<int> internal;
@ -364,13 +565,15 @@ public:
#if defined(AVX512) #if defined(AVX512)
const int num_cases = 6; const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else #else
const int num_cases = 4; const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif #endif
controls Cases [] = { controls Cases [] = {
#ifdef AVX512 #ifdef AVX512
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif #endif
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
@ -394,7 +597,7 @@ public:
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10; int nwarm = 200;
double t0=usecond(); double t0=usecond();
FGrid->Barrier(); FGrid->Barrier();
for(int i=0;i<nwarm;i++){ for(int i=0;i<nwarm;i++){
@ -402,7 +605,10 @@ public:
} }
FGrid->Barrier(); FGrid->Barrier();
double t1=usecond(); double t1=usecond();
uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
// if (ncall < 500) ncall = 500;
uint64_t ncall = 1000;
FGrid->Broadcast(0,&ncall,sizeof(ncall)); FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
@ -428,7 +634,7 @@ public:
mf_err= flops/timestat.min * timestat.err/timestat.mean; mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean; mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops>mflops_best ) mflops_best = mflops;
@ -450,12 +656,20 @@ public:
} }
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best <<std::endl; std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<<std::endl; std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl; std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
} }
return mflops_best;
} }
}; };
@ -493,26 +707,66 @@ int main (int argc, char ** argv)
// empty for now // empty for now
} }
int sel=2;
std::vector<int> L_list({8,12,16,24});
std::vector<double> wilson;
std::vector<double> dwf4;
std::vector<double> dwf5;
if ( do_wilson ) { if ( do_wilson ) {
int Ls=1; int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl; std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::DWF(Ls,16); for(int l=0;l<L_list.size();l++){
Benchmark::DWF(Ls,24); wilson.push_back(Benchmark::DWF(1,L_list[l]));
Benchmark::DWF(Ls,32); }
} }
int Ls=16;
if ( do_dwf ) { if ( do_dwf ) {
int Ls=16;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl; std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::DWF(Ls,8); for(int l=0;l<L_list.size();l++){
Benchmark::DWF(Ls,12); dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
Benchmark::DWF(Ls,16); }
Benchmark::DWF(Ls,24);
} }
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int NN=NN_global;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 \t\t DWF5 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << dwf4[sel]/NN <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
Grid_finalize(); Grid_finalize();
} }

View File

@ -92,11 +92,16 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount(); RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode; RealD ppn = Nrank/Nnode;
Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
int ncomm; int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
}
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
double start=usecond(); double start=usecond();
@ -112,7 +117,6 @@ int main (int argc, char ** argv)
int comm_proc=1; int comm_proc=1;
int xmit_to_rank; int xmit_to_rank;
int recv_from_rank; int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests, Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu][0], (void *)&xbuf[mu][0],
@ -172,9 +176,14 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount(); RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode; RealD ppn = Nrank/Nnode;
Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
}
int ncomm; int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@ -493,14 +502,9 @@ int main (int argc, char ** argv)
int comm_proc = mpi_layout[mu]-1; int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} }
Removed:
	tbytes= Grid.StencilSendToRecvFromBegin(requests,
						(void *)&xbuf[dir][0],
						xmit_to_rank,
						(void *)&rbuf[dir][0],
						recv_from_rank,
						bytes,dir);
	Grid.StencilSendToRecvFromComplete(requests,dir);
	requests.resize(0);
Added:
	tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
					   (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
#pragma omp atomic #pragma omp atomic
dbytes+=tbytes; dbytes+=tbytes;

View File

@ -92,6 +92,9 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
////////////////// //////////////////
// Hack 2MB align; could make option probably doesn't need configurability // Hack 2MB align; could make option probably doesn't need configurability
////////////////// //////////////////
@ -102,6 +105,7 @@ public:
#else #else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif #endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop // First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr; uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP #ifdef GRID_OMP
@ -115,6 +119,7 @@ public:
void deallocate(pointer __p, size_type __n) { void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
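The allocator above looks the request up in a pointer cache, falls back to an aligned allocation, and then writes the pages once inside an OpenMP loop so they are first-touched by the threads that will use them. A freestanding sketch of the aligned-alloc plus first-touch part only (the cache is omitted; posix_memalign stands in for memalign/_mm_malloc, and the 2 MB alignment mirrors Grid's hugepage hack):

// Aligned allocation with an OpenMP "first touch" pass. Compile with -fopenmp.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#ifdef _OPENMP
#include <omp.h>
#endif

static void *alloc_first_touch(std::size_t bytes, std::size_t align = 2 * 1024 * 1024) {
  void *ptr = nullptr;
  if (posix_memalign(&ptr, align, bytes) != 0) return nullptr;
  uint8_t *cp = static_cast<uint8_t *>(ptr);
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
  for (std::size_t n = 0; n < bytes; n += 4096)   // touch each page from the owning thread
    cp[n] = 0;
  return ptr;
}

int main() {
  void *p = alloc_first_touch(1 << 24);           // 16 MB test allocation
  std::printf("allocated at %p\n", p);
  std::free(p);
}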

View File

@ -153,8 +153,10 @@ void CartesianCommunicator::ShmInitGeneric(void){
if ( Hugepages ) mmap_flag |= MAP_HUGETLB; if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif #endif
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE); if (ShmCommBuf == (void *)MAP_FAILED) {
std::cout << "ShmCommBuf "<<ShmCommBuf<<std::endl; perror("mmap failed ");
exit(EXIT_FAILURE);
}
#else #else
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0]; ShmCommBuf=(void *)&ShmBufStorageVector[0];

View File

@ -221,8 +221,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
// Try to force numa domain on the shm segment if we have numaif.h // Experiments; Try to force numa domain on the shm segment if we have numaif.h
#ifdef HAVE_NUMAIF_H #if 0
//#ifdef HAVE_NUMAIF_H
int status; int status;
int flags=MPOL_MF_MOVE; int flags=MPOL_MF_MOVE;
#ifdef KNL #ifdef KNL

View File

@ -242,11 +242,24 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank, int recv_from_rank,
int bytes,int dir) int bytes,int dir)
{ {
StencilSendToRecvFrom(xmit,xmit_to_rank,recv,recv_from_rank,bytes,dir); int myrank = _processor;
int ierr;
assert(dir < communicator_halo.size());
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
list.push_back(req[0]);
list.push_back(req[1]);
return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {
// Do nothing int nreq=waitall.size();
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
}; };
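StencilSendToRecvFromBegin now posts the non-blocking pair itself and parks the requests on the caller's list, and Complete becomes an MPI_Waitall over that list. A self-contained MPI sketch of the same split-phase pattern, not part of the commit (buffer sizes and neighbour ranks are made up; the tags mirror the diff's convention of tagging receives by the sender's rank):

// Split-phase halo exchange: Begin posts MPI_Irecv/MPI_Isend and records the
// requests; Complete waits on everything recorded so far.
// Compile with mpicxx; run with mpirun -n 2 (or more).
#include <mpi.h>
#include <vector>

static double exchange_begin(std::vector<MPI_Request> &list, MPI_Comm comm,
                             void *xmit, int to, void *recv, int from, int bytes, int myrank) {
  MPI_Request req[2];
  MPI_Irecv(recv, bytes, MPI_CHAR, from, from,   comm, &req[1]); // tag = sender's rank
  MPI_Isend(xmit, bytes, MPI_CHAR, to,   myrank, comm, &req[0]);
  list.push_back(req[0]);
  list.push_back(req[1]);
  return 2.0 * bytes;                        // bytes in flight (send + receive)
}

static void exchange_complete(std::vector<MPI_Request> &list) {
  MPI_Waitall((int)list.size(), list.data(), MPI_STATUSES_IGNORE);
  list.clear();
}

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  std::vector<char> xbuf(1 << 20, 1), rbuf(1 << 20, 0);
  int to = (rank + 1) % size, from = (rank - 1 + size) % size;
  std::vector<MPI_Request> reqs;
  exchange_begin(reqs, MPI_COMM_WORLD, xbuf.data(), to, rbuf.data(), from, (int)xbuf.size(), rank);
  // ... compute could overlap here ...
  exchange_complete(reqs);
  MPI_Finalize();
}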
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
int xmit_to_rank, int xmit_to_rank,
@ -262,7 +275,7 @@ double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
// Give the CPU to MPI immediately; can use threads to overlap optionally // Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2]; MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank, communicator_halo[dir], &req[0]); MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE); MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
return 2.0*bytes; return 2.0*bytes;
} }

View File

@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
for(int i=0;i<Ls;i++){ for(int i=0;i<Ls;i++){
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0); bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
// assert(fabs(bee[i])>0.0); assert(fabs(bee[i])>0.0);
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
beo[i]=as[i]*bs[i]; beo[i]=as[i]*bs[i];
ceo[i]=-as[i]*cs[i]; ceo[i]=-as[i]*cs[i];
@ -455,11 +455,17 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
dee[i] = bee[i]; dee[i] = bee[i];
if ( i < Ls-1 ) { if ( i < Ls-1 ) {
assert(fabs(bee[i])>0.0);
assert(fabs(bee[0])>0.0);
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0]; leem[i]=mass*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1]; for(int j=0;j<i;j++) {
assert(fabs(bee[j+1])>0.0);
leem[i]*= aee[j]/bee[j+1];
}
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
{ {
Coeff_t delta_d=mass*cee[Ls-1]; Coeff_t delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) { for(int j=0;j<Ls-1;j++) {
// assert(fabs(bee[j])>0.0); assert(fabs(bee[j])>0.0);
delta_d *= cee[j]/bee[j]; delta_d *= cee[j]/bee[j];
} }
dee[Ls-1] += delta_d; dee[Ls-1] += delta_d;
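The asserts added above guard every division by a bee[] pivot in the tridiagonal setup of the Cayley 5-d operator (they were previously commented out). A small standalone sketch of the guarded recurrence for the leem[] column, with hypothetical coefficient values, showing where the zero-pivot checks sit:

// Guarded build of leem[i] = mass * cee[Ls-1]/bee[0] * prod_{j<i} aee[j]/bee[j+1],
// with an assert before every division as in the patched SetCoefficientsInternal.
#include <cassert>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const int Ls = 8;
  double mass = 0.1;
  std::vector<double> bee(Ls, 1.5), cee(Ls, 0.5), aee(Ls, 1.0); // hypothetical coefficients
  std::vector<double> leem(Ls, 0.0);

  for (int i = 0; i < Ls - 1; i++) {
    assert(std::fabs(bee[i]) > 0.0);
    assert(std::fabs(bee[0]) > 0.0);
    leem[i] = mass * cee[Ls - 1] / bee[0];
    for (int j = 0; j < i; j++) {
      assert(std::fabs(bee[j + 1]) > 0.0);
      leem[i] *= aee[j] / bee[j + 1];
    }
    std::cout << "leem[" << i << "] = " << leem[i] << "\n";
  }
}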

View File

@ -238,7 +238,35 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
template<class vobj,class cobj> template<class vobj,class cobj>
class WilsonStencil : public CartesianStencil<vobj,cobj> { class WilsonStencil : public CartesianStencil<vobj,cobj> {
public: public:
double timer0;
double timer1;
double timer2;
double timer3;
double timer4;
double timer5;
double timer6;
uint64_t callsi;
void ZeroCountersi(void)
{
std::cout << GridLogMessage << " ZeroCountersi()"<<std::endl;
timer0=0;
timer1=0;
timer2=0;
timer3=0;
timer4=0;
timer5=0;
timer6=0;
callsi=0;
}
void Reporti(int calls)
{
std::cout << GridLogMessage << " Reporti() calls " <<callsi << calls<<std::endl;
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
}
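ZeroCountersi()/Reporti() add a set of plain accumulating phase timers to the Wilson stencil (halo gather, communicate, merge, shared-memory merge), reported as an average per call. A minimal standalone sketch of that pattern, with a usecond() helper assumed to behave like Grid's microsecond wall clock:

// Accumulating phase timers: record usecond() around each phase and report
// the average per call at the end (mirrors ZeroCountersi/Reporti).
#include <sys/time.h>
#include <iostream>

static double usecond(void) {             // microsecond wall clock
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return 1.0e6 * tv.tv_sec + tv.tv_usec;
}

struct PhaseTimers {
  double gather = 0, communicate = 0, merge = 0;
  unsigned long calls = 0;
  void report() const {
    if (!calls) return;
    std::cout << " gather      " << gather      / calls << " us/call\n"
              << " communicate " << communicate / calls << " us/call\n"
              << " merge       " << merge       / calls << " us/call\n";
  }
};

int main() {
  PhaseTimers t;
  for (int i = 0; i < 100; i++) {
    double t0 = usecond();
    /* gather phase here */      double t1 = usecond(); t.gather      += t1 - t0;
    /* communicate phase here */ double t2 = usecond(); t.communicate += t2 - t1;
    /* merge phase here */       double t3 = usecond(); t.merge       += t3 - t2;
    t.calls++;
  }
  t.report();
}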
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
std::vector<int> same_node; std::vector<int> same_node;
@ -252,6 +280,7 @@ public:
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
same_node(npoints) same_node(npoints)
{ {
ZeroCountersi();
surface_list.resize(0); surface_list.resize(0);
}; };
@ -282,17 +311,25 @@ public:
{ {
std::vector<std::vector<CommsRequest_t> > reqs; std::vector<std::vector<CommsRequest_t> > reqs;
this->HaloExchangeOptGather(source,compress); this->HaloExchangeOptGather(source,compress);
double t1=usecond();
this->CommunicateBegin(reqs); this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs); this->CommunicateComplete(reqs);
double t2=usecond(); timer1 += t2-t1;
this->CommsMerge(compress); this->CommsMerge(compress);
double t3=usecond(); timer2 += t3-t2;
this->CommsMergeSHM(compress); this->CommsMergeSHM(compress);
double t4=usecond(); timer3 += t4-t3;
} }
template <class compressor> template <class compressor>
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
{ {
this->Prepare(); this->Prepare();
double t0=usecond();
this->HaloGatherOpt(source,compress); this->HaloGatherOpt(source,compress);
double t1=usecond();
timer0 += t1-t0;
callsi++;
} }
template <class compressor> template <class compressor>
@ -304,7 +341,9 @@ public:
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
this->mpi3synctime_g-=usecond();
this->_grid->StencilBarrier(); this->_grid->StencilBarrier();
this->mpi3synctime_g+=usecond();
assert(source._grid==this->_grid); assert(source._grid==this->_grid);
this->halogtime-=usecond(); this->halogtime-=usecond();

View File

@ -185,6 +185,11 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
} }
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
} }
template<class Impl> template<class Impl>
@ -204,6 +209,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
Stencil.ZeroCounters(); Stencil.ZeroCounters();
StencilEven.ZeroCounters(); StencilEven.ZeroCounters();
StencilOdd.ZeroCounters(); StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
} }
@ -445,6 +453,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
DhopCommTime += ctime; DhopCommTime += ctime;
DhopComputeTime+=ptime; DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();

View File

@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
// Timing info; ugly; possibly temporary // Timing info; ugly; possibly temporary
///////////////////////////////////////// /////////////////////////////////////////
double commtime; double commtime;
double mpi3synctime;
double mpi3synctime_g;
double shmmergetime;
double gathertime; double gathertime;
double gathermtime; double gathermtime;
double halogtime; double halogtime;
@ -185,8 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
double splicetime; double splicetime;
double nosplicetime; double nosplicetime;
double calls; double calls;
std::vector<double> comms_bytesthr; std::vector<double> comm_bytes_thr;
std::vector<double> commtimethr; std::vector<double> comm_time_thr;
std::vector<double> comm_enter_thr;
std::vector<double> comm_leave_thr;
//////////////////////////////////////// ////////////////////////////////////////
// Stencil query // Stencil query
@ -262,18 +267,45 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
#endif #endif
if (nthreads == -1) nthreads = 1; if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) { if (mythread < nthreads) {
comm_enter_thr[mythread] = usecond();
for (int i = mythread; i < Packets.size(); i += nthreads) { for (int i = mythread; i < Packets.size(); i += nthreads) {
double start = usecond();
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank, Packets[i].to_rank,
Packets[i].recv_buf, Packets[i].recv_buf,
Packets[i].from_rank, Packets[i].from_rank,
Packets[i].bytes,i); Packets[i].bytes,i);
comms_bytesthr[mythread] += bytes; comm_bytes_thr[mythread] += bytes;
commtimethr[mythread] += usecond() - start;
} }
comm_leave_thr[mythread]= usecond();
comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
} }
} }
void CollateThreads(void)
{
int nthreads = CartesianCommunicator::nCommThreads;
double first=0.0;
double last =0.0;
for(int t=0;t<nthreads;t++) {
double t0 = comm_enter_thr[t];
double t1 = comm_leave_thr[t];
comms_bytes+=comm_bytes_thr[t];
comm_enter_thr[t] = 0.0;
comm_leave_thr[t] = 0.0;
comm_time_thr[t] = 0.0;
comm_bytes_thr[t]=0;
if ( first == 0.0 ) first = t0; // first is t0
if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
if ( t1 > last ) last = t1; // max time seen
}
commtime+= last-first;
}
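CollateThreads folds the per-thread communication windows into a single wall-clock figure: it sums the bytes, then takes the earliest entry and latest exit over the threads that actually communicated, so overlapping threads are not double counted. A standalone sketch of that reduction (thread counts and times are made up):

// First-to-enter / last-to-leave reduction over per-thread comms windows.
// Threads whose enter time is 0.0 never communicated and are skipped.
#include <iostream>
#include <vector>

int main() {
  std::vector<double> enter = {100.0, 0.0, 105.0, 102.0};  // usec, per comm thread (hypothetical)
  std::vector<double> leave = {180.0, 0.0, 170.0, 190.0};

  double first = 0.0, last = 0.0;
  for (std::size_t t = 0; t < enter.size(); t++) {
    double t0 = enter[t], t1 = leave[t];
    if (t0 == 0.0) continue;                     // idle thread
    if (first == 0.0 || t0 < first) first = t0;  // earliest entry seen
    if (t1 > last) last = t1;                    // latest exit seen
  }
  std::cout << "comms wall-clock window: " << last - first << " us\n";
}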
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
reqs.resize(Packets.size()); reqs.resize(Packets.size());
@ -295,14 +327,48 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
} }
commtime+=usecond(); commtime+=usecond();
} }
void Communicate(void)
{
#ifdef GRID_OMP
#pragma omp parallel
{
// must be called in parallel region
int mythread = omp_get_thread_num();
int maxthreads= omp_get_max_threads();
int nthreads = CartesianCommunicator::nCommThreads;
assert(nthreads <= maxthreads);
if (nthreads == -1) nthreads = 1;
#else
int mythread = 0;
int nthreads = 1;
#endif
if (mythread < nthreads) {
for (int i = mythread; i < Packets.size(); i += nthreads) {
double start = usecond();
comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comm_time_thr[mythread] += usecond() - start;
}
}
#ifdef GRID_OMP
}
#endif
}
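Communicate() drives all packets from inside an existing OpenMP parallel region: the first nCommThreads threads each take a strided share of the packet list and accumulate their own byte and time counters. A freestanding sketch of that work division (the per-packet send is stubbed out; nCommThreads is a stand-in for CartesianCommunicator::nCommThreads):

// Strided division of communication packets over a fixed number of comm
// threads, inside an OpenMP parallel region (compile with -fopenmp).
#include <cstdio>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
  const int nCommThreads = 2;                   // stand-in for CartesianCommunicator::nCommThreads
  std::vector<int> packet_bytes = {4096, 4096, 8192, 8192, 16384, 16384};
  std::vector<double> bytes_thr(nCommThreads, 0.0);

#ifdef _OPENMP
#pragma omp parallel num_threads(4)
#endif
  {
#ifdef _OPENMP
    int mythread = omp_get_thread_num();
#else
    int mythread = 0;
#endif
    if (mythread < nCommThreads) {
      for (std::size_t i = mythread; i < packet_bytes.size(); i += nCommThreads) {
        // the real code calls StencilSendToRecvFrom for Packets[i] here
        bytes_thr[mythread] += packet_bytes[i];
      }
    }
  }
  for (int t = 0; t < nCommThreads; t++)
    std::printf("comm thread %d moved %.0f bytes\n", t, bytes_thr[t]);
}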
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{ {
std::vector<std::vector<CommsRequest_t> > reqs; std::vector<std::vector<CommsRequest_t> > reqs;
Prepare(); Prepare();
HaloGather(source,compress); HaloGather(source,compress);
// Concurrent
CommunicateBegin(reqs); CommunicateBegin(reqs);
CommunicateComplete(reqs); CommunicateComplete(reqs);
// Sequential
// Communicate();
CommsMergeSHM(compress); CommsMergeSHM(compress);
CommsMerge(compress); CommsMerge(compress);
} }
@ -363,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
template<class compressor> template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress) void HaloGather(const Lattice<vobj> &source,compressor &compress)
{ {
mpi3synctime_g-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single nodes _grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime_g+=usecond();
// conformable(source._grid,_grid); // conformable(source._grid,_grid);
assert(source._grid==_grid); assert(source._grid==_grid);
@ -423,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
CommsMerge(decompress,Mergers,Decompressions); CommsMerge(decompress,Mergers,Decompressions);
} }
template<class decompressor> void CommsMergeSHM(decompressor decompress) { template<class decompressor> void CommsMergeSHM(decompressor decompress) {
mpi3synctime-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single nodes _grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime+=usecond();
shmmergetime-=usecond();
CommsMerge(decompress,MergersSHM,DecompressionsSHM); CommsMerge(decompress,MergersSHM,DecompressionsSHM);
shmmergetime+=usecond();
} }
template<class decompressor> template<class decompressor>
@ -470,8 +542,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
const std::vector<int> &distances) const std::vector<int> &distances)
: _permute_type(npoints), : _permute_type(npoints),
_comm_buf_size(npoints), _comm_buf_size(npoints),
comms_bytesthr(npoints), comm_bytes_thr(npoints),
commtimethr(npoints) comm_enter_thr(npoints),
comm_leave_thr(npoints),
comm_time_thr(npoints)
{ {
face_table_computed=0; face_table_computed=0;
_npoints = npoints; _npoints = npoints;
@ -1025,8 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
void ZeroCounters(void) { void ZeroCounters(void) {
gathertime = 0.; gathertime = 0.;
commtime = 0.; commtime = 0.;
memset(&commtimethr[0], 0, sizeof(commtimethr)); mpi3synctime=0.;
memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr)); mpi3synctime_g=0.;
shmmergetime=0.;
for(int i=0;i<_npoints;i++){
comm_time_thr[i]=0;
comm_bytes_thr[i]=0;
comm_enter_thr[i]=0;
comm_leave_thr[i]=0;
}
halogtime = 0.; halogtime = 0.;
mergetime = 0.; mergetime = 0.;
decompresstime = 0.; decompresstime = 0.;
@ -1043,13 +1124,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
RealD NP = _grid->_Nprocessors; RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount(); RealD NN = _grid->NodeCount();
double t = 0; double t = 0;
// if commtimethr is set they were all done in parallel so take the max // if comm_time_thr is set they were all done in parallel so take the max
// but add up the bytes // but add up the bytes
int threaded = 0 ;
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
comms_bytes += comms_bytesthr[i]; if ( comm_time_thr[i]>0.0 ) {
if (t < commtimethr[i]) t = commtimethr[i]; threaded = 1;
comms_bytes += comm_bytes_thr[i];
if (t < comm_time_thr[i]) t = comm_time_thr[i];
}
} }
commtime += t; if (threaded) commtime += t;
_grid->GlobalSum(commtime); commtime/=NP; _grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) { if ( calls > 0. ) {
@ -1065,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl; std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl; std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
} }
PRINTIT(mpi3synctime);
PRINTIT(mpi3synctime_g);
PRINTIT(shmmergetime);
PRINTIT(splicetime); PRINTIT(splicetime);
PRINTIT(nosplicetime); PRINTIT(nosplicetime);
} }