Grid (mirror of https://github.com/paboyle/Grid.git)
Commit c3b1263e75 (parent b49bec0cec): Benchmark prep

The hunks below span the benchmark drivers, the aligned allocator, the MPI communicator, CayleyFermion5D, WilsonFermion5D, the Wilson stencil compressor, and the generic CartesianStencil. File names were not preserved in this view; each hunk header carries its enclosing context.
@@ -32,6 +32,19 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
+
 struct time_statistics{
   double mean;
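The benchmarks below read timestat.mean, timestat.err, timestat.min and timestat.max after calling time_statistics::statistics(). Only the first fields of the struct survive in this view, so the following is a minimal self-contained sketch of what such a helper plausibly computes; the member names come from their usage in this diff, and defining err as the standard error of the mean is an assumption.

    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <vector>

    // Minimal stand-in for the time_statistics helper used by the benchmarks.
    // Field names follow their usage in this diff; the err definition
    // (standard error of the mean) is an assumption.
    struct time_statistics {
      double mean, err, min, max;

      void statistics(std::vector<double> v) {
        double sum = 0;
        for (double x : v) sum += x;
        mean = sum / v.size();

        double var = 0;                   // sample variance
        for (double x : v) var += (x - mean) * (x - mean);
        var /= (v.size() - 1);
        err = std::sqrt(var / v.size());  // standard error of the mean

        auto mm = std::minmax_element(v.begin(), v.end());
        min = *mm.first;
        max = *mm.second;
      }
    };

    int main() {
      std::vector<double> t_time({10.0, 11.0, 9.5, 10.5});  // usec per call
      time_statistics timestat;
      timestat.statistics(t_time);
      std::cout << timestat.mean << " +/- " << timestat.err
                << " [" << timestat.min << "," << timestat.max << "]\n";
    }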
@@ -95,13 +108,15 @@ public:
 
   static void Comms(void)
   {
-    int Nloop=100;
+    int Nloop=1000;
     int nmu=0;
     int maxlat=32;
 
    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();
 
+    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
    std::vector<double> t_time(Nloop);
    time_statistics timestat;
 
@@ -133,13 +148,14 @@ public:
      bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
    }
 
-    int ncomm;
    int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+    int ncomm;
    double dbytes;
+    std::vector<double> times(Nloop);
    for(int i=0;i<Nloop;i++){
 
      double start=usecond();
 
-      std::vector<CartesianCommunicator::CommsRequest_t> requests;
      dbytes=0;
      ncomm=0;
 
@@ -150,7 +166,6 @@ public:
 
      if (mpi_layout[mu]>1 ) {
 
-        ncomm++;
        int xmit_to_rank;
        int recv_from_rank;
        if ( dir == mu ) {
@@ -160,18 +175,18 @@ public:
          int comm_proc = mpi_layout[mu]-1;
          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
        }
-#if 1
-        tbytes= Grid.StencilSendToRecvFromBegin(requests,
-                                                (void *)&xbuf[dir][0],
-                                                xmit_to_rank,
-                                                (void *)&rbuf[dir][0],
-                                                recv_from_rank,
-                                                bytes,dir);
-        Grid.StencilSendToRecvFromComplete(requests,dir);
-#endif
-        requests.resize(0);
-
-#pragma omp atomic
+        tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+                                           (void *)&rbuf[dir][0], recv_from_rank,
+                                           bytes,dir);
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
+        ncomm++;
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
        dbytes+=tbytes;
      }
    }
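The rewritten loop above lets whichever OpenMP thread executes an iteration perform a blocking StencilSendToRecvFrom, so the shared tallies ncomm and dbytes must be updated under #pragma omp atomic. A stripped-down, runnable sketch of that accumulation pattern (a dummy payload stands in for the MPI transfer; all names here are illustrative):

    #include <cstdio>

    // Pattern used above: each thread performs its own transfer and atomically
    // folds the byte count into shared totals. Compile with -fopenmp.
    int main() {
      int    ncomm  = 0;
      double dbytes = 0;
      const int ndir = 8;

    #pragma omp parallel for
      for (int dir = 0; dir < ndir; dir++) {
        double tbytes = 1024.0 * (dir + 1);  // stand-in for a real transfer
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
        ncomm++;
    #ifdef _OPENMP
    #pragma omp atomic
    #endif
        dbytes += tbytes;
      }
      std::printf("ncomm=%d dbytes=%.0f\n", ncomm, dbytes);
    }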
@@ -181,13 +196,15 @@ public:
      }
 
      timestat.statistics(t_time);
+      // for(int i=0;i<t_time.size();i++){
+      //   std::cout << i<<" "<<t_time[i]<<std::endl;
+      // }
 
      dbytes=dbytes*ppn;
      double xbytes    = dbytes*0.5;
      double rbytes    = dbytes*0.5;
      double bidibytes = dbytes;
 
      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
@@ -196,7 +213,8 @@ public:
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 
-      }
+    }
    }
 
    return;
@@ -218,7 +236,7 @@ public:
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
    uint64_t lmax=48;
-#define NLOOP (50*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
    GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){
@@ -253,8 +271,7 @@ public:
    }
  };
 
-  static void DWF(int Ls,int L)
+  static double DWF5(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
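NLOOP scales the repeat count inversely with lat^4, so the total data volume moved per lattice size stays roughly constant; this commit doubles the prefactor from 50 to 100, doubling every measurement. A quick illustration, evaluating the same expression:

    #include <cstdint>
    #include <cstdio>

    // NLOOP = 100*lmax^4/lat^4 keeps (iterations x lat^4) constant, so each
    // lattice size runs a comparable amount of total work.
    int main() {
      const uint64_t lmax = 48;
      for (uint64_t lat = 8; lat <= lmax; lat += 4) {
        uint64_t nloop = 100 * lmax * lmax * lmax * lmax / lat / lat / lat / lat;
        std::printf("lat=%2llu  NLOOP=%llu\n",
                    (unsigned long long)lat, (unsigned long long)nloop);
      }
    }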
@@ -262,6 +279,7 @@ public:
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
+    std::vector<double> mflops_all;
 
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
@@ -274,6 +292,189 @@ public:
                       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+    std::vector<int> internal;
+    if      ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
+    else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
+    else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
+    else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
+    else assert(0);
+
+    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
+    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    std::vector<int> seeds5({5,6,7,8});
+    GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+    GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    ///////// Source preparation ////////////
+    LatticeFermion src   (sFGrid); random(RNG5,src);
+    LatticeFermion tmp   (sFGrid);
+
+    RealD N2 = 1.0/::sqrt(norm2(src));
+    src = src*N2;
+
+    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);
+
+    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+    LatticeFermion src_e (sFrbGrid);
+    LatticeFermion src_o (sFrbGrid);
+    LatticeFermion r_e   (sFrbGrid);
+    LatticeFermion r_o   (sFrbGrid);
+    LatticeFermion r_eo  (sFGrid);
+    LatticeFermion err   (sFGrid);
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512)
+      const int num_cases = 6;
+      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
+#else
+      const int num_cases = 4;
+      std::string fmt("U/S ; U/O ; G/S ; G/O ");
+#endif
+      controls Cases [] = {
+#ifdef AVX512
+        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
+#endif
+        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential }
+      };
+
+      for(int c=0;c<num_cases;c++) {
+
+        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute  ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute ) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+        if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+        if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+        int nwarm = 200;
+        double t0=usecond();
+        sFGrid->Barrier();
+        for(int i=0;i<nwarm;i++){
+          sDw.DhopEO(src_o,r_e,DaggerNo);
+        }
+        sFGrid->Barrier();
+        double t1=usecond();
+        // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+        // if (ncall < 500) ncall = 500;
+        uint64_t ncall = 1000;
+
+        sFGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+        sDw.ZeroCounters();
+
+        time_statistics timestat;
+        std::vector<double> t_time(ncall);
+        for(uint64_t i=0;i<ncall;i++){
+          t0=usecond();
+          sDw.DhopEO(src_o,r_e,DaggerNo);
+          t1=usecond();
+          t_time[i] = t1-t0;
+        }
+        sFGrid->Barrier();
+
+        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+        double flops=(1344.0*volume)/2;
+        double mf_hi, mf_lo, mf_err;
+
+        timestat.statistics(t_time);
+        mf_hi = flops/timestat.min;
+        mf_lo = flops/timestat.max;
+        mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+        mflops = flops/timestat.mean;
+        mflops_all.push_back(mflops);
+        if ( mflops_best == 0   ) mflops_best = mflops;
+        if ( mflops_worst== 0   ) mflops_worst= mflops;
+        if ( mflops>mflops_best ) mflops_best = mflops;
+        if ( mflops<mflops_worst) mflops_worst= mflops;
+
+        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
+        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
+
+        sDw.Report();
+
+      }
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+        std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    }
+    return mflops_best;
+  }
+
+  static double DWF(int Ls,int L)
+  {
+    RealD mass=0.1;
+    RealD M5  =1.8;
+
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    std::vector<int> local({L,L,L,L});
+
+    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
+                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
    uint64_t SHM=NP/NN;
 
    std::vector<int> internal;
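The new DWF5 derives the intranode decomposition purely from SHM = NP/NN, the ranks-per-node count, via a fixed lookup ladder. The same logic in isolation (IntranodeLayout is a hypothetical helper name; the assert-on-unknown behaviour is kept from the diff):

    #include <cassert>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Mirror of the SHM -> intranode split ladder used in DWF5 above:
    // 1, 2, 4 or 8 ranks per node map onto a 4d factorisation of the node.
    std::vector<int> IntranodeLayout(uint64_t SHM) {
      if (SHM == 1) return {1, 1, 1, 1};
      if (SHM == 2) return {2, 1, 1, 1};
      if (SHM == 4) return {2, 2, 1, 1};
      if (SHM == 8) return {2, 2, 2, 1};
      assert(0 && "unsupported ranks-per-node count");
      return {};
    }

    int main() {
      uint64_t NP = 16, NN = 4;          // ranks, nodes (example values)
      std::vector<int> internal = IntranodeLayout(NP / NN);
      for (int d : internal) std::cout << d << " ";
      std::cout << "\n";                 // prints: 2 2 1 1
    }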
@@ -364,13 +565,15 @@ public:
 
 #if defined(AVX512)
    const int num_cases = 6;
+    std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
    const int num_cases = 4;
+    std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
    controls Cases [] = {
 #ifdef AVX512
-      { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
      { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+      { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
 #endif
      { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
      { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
@@ -394,7 +597,7 @@ public:
      if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-      int nwarm = 10;
+      int nwarm = 200;
      double t0=usecond();
      FGrid->Barrier();
      for(int i=0;i<nwarm;i++){
@@ -402,7 +605,10 @@ public:
      }
      FGrid->Barrier();
      double t1=usecond();
-      uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+      // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+      // if (ncall < 500) ncall = 500;
+      uint64_t ncall = 1000;
+
      FGrid->Broadcast(0,&ncall,sizeof(ncall));
 
      // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
@@ -428,7 +634,7 @@ public:
      mf_err= flops/timestat.min * timestat.err/timestat.mean;
 
      mflops = flops/timestat.mean;
+      mflops_all.push_back(mflops);
      if ( mflops_best == 0   ) mflops_best = mflops;
      if ( mflops_worst== 0   ) mflops_worst= mflops;
      if ( mflops>mflops_best ) mflops_best = mflops;
@@ -450,12 +656,20 @@ public:
 
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s = "<< mflops_best <<std::endl;
-    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<<std::endl;
+    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl;
+    std::cout<<GridLogMessage <<fmt << std::endl;
+    std::cout<<GridLogMessage ;
+
+    for(int i=0;i<mflops_all.size();i++){
+      std::cout<<mflops_all[i]/NN<<" ; " ;
+    }
+    std::cout<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
  }
+  return mflops_best;
  }
 
 };
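Both DWF and DWF5 turn the per-call timing distribution into flop rates with an error bar: the mean gives the headline Mflop/s, min and max give the best and worst cases, and the fractional standard error of the mean is propagated onto the best case. A standalone rendition using the commit's flop count (1344 flops per site, halved because DhopEO touches one checkerboard; the input timings are made up):

    #include <cstdio>

    // Convert Dhop timings (usec) into the Mflop/s figures the benchmark
    // prints: flop/usec equals Mflop/s, so no unit factor is needed.
    int main() {
      double Ls = 16, L = 16;
      double volume = Ls * L * L * L * L;
      double flops  = (1344.0 * volume) / 2;   // even-odd Dhop

      double mean = 950.0, err = 12.0, min = 900.0, max = 1100.0;  // usec
      double mflops = flops / mean;
      double mf_hi  = flops / min;             // best observed call
      double mf_lo  = flops / max;             // worst observed call
      double mf_err = flops / min * err / mean;

      std::printf("mflop/s = %.1f (+/- %.1f)  range %.1f-%.1f\n",
                  mflops, mf_err, mf_lo, mf_hi);
    }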
@@ -493,26 +707,66 @@ int main (int argc, char ** argv)
    // empty for now
  }
 
+  int sel=2;
+  std::vector<int> L_list({8,12,16,24});
+  std::vector<double> wilson;
+  std::vector<double> dwf4;
+  std::vector<double> dwf5;
+
  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
-    Benchmark::DWF(Ls,32);
+    for(int l=0;l<L_list.size();l++){
+      wilson.push_back(Benchmark::DWF(1,L_list[l]));
+    }
  }
 
+  int Ls=16;
  if ( do_dwf ) {
-    int Ls=16;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,8);
-    Benchmark::DWF(Ls,12);
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
+    for(int l=0;l<L_list.size();l++){
+      dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
+    }
  }
 
+  if ( do_dwf ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
+    }
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    int NN=NN_global;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 \t\t DWF5 " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l]/NN<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point result: " << dwf4[sel]/NN <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+  }
+
  Grid_finalize();
 }
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
 
-      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int mu=0;mu<8;mu++){
+        xbuf[mu].resize(lat*lat*lat*Ls);
+        rbuf[mu].resize(lat*lat*lat*Ls);
+        // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }
 
      for(int i=0;i<Nloop;i++){
        double start=usecond();
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
          int comm_proc=1;
          int xmit_to_rank;
          int recv_from_rank;
-
          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
          Grid.SendToRecvFromBegin(requests,
                                   (void *)&xbuf[mu][0],
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
 
-      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
+
+      for(int mu=0;mu<8;mu++){
+        xbuf[mu].resize(lat*lat*lat*Ls);
+        rbuf[mu].resize(lat*lat*lat*Ls);
+        // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }
 
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -493,14 +502,9 @@ int main (int argc, char ** argv)
            int comm_proc = mpi_layout[mu]-1;
            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
          }
-          tbytes= Grid.StencilSendToRecvFromBegin(requests,
-                                                  (void *)&xbuf[dir][0],
-                                                  xmit_to_rank,
-                                                  (void *)&rbuf[dir][0],
-                                                  recv_from_rank,
-                                                  bytes,dir);
-          Grid.StencilSendToRecvFromComplete(requests,dir);
-          requests.resize(0);
+          tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+                                             (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
 
 #pragma omp atomic
          dbytes+=tbytes;
@@ -92,6 +92,9 @@ public:
    size_type bytes = __n*sizeof(_Tp);
 
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
+    // if ( ptr != NULL )
+    //   std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
+
    //////////////////
    // Hack 2MB align; could make option probably doesn't need configurability
    //////////////////
@@ -102,6 +105,7 @@ public:
 #else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
+    // std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP
@@ -115,6 +119,7 @@ public:
 
  void deallocate(pointer __p, size_type __n) {
    size_type bytes = __n * sizeof(_Tp);
 
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
+
 #ifdef HAVE_MM_MALLOC_H
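The allocator hunks only add commented-out tracing around the PointerCache fast path, but the surrounding code relies on the first-touch idiom: pages are physically placed on a NUMA node only when first written, so the allocator writes the buffer from the threads that will use it. Here is that idiom in isolation (a sketch; 4096 is an assumed stand-in for GRID_ALLOC_ALIGN, and the real allocator uses memalign or _mm_malloc):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // First-touch idiom behind the allocator above: fault each page in from
    // the parallel loop so it lands on the touching thread's NUMA domain.
    // Compile with -fopenmp to make the first touch actually threaded.
    int main() {
      const long long bytes = 1 << 20;
      uint8_t *cp = (uint8_t *)aligned_alloc(4096, bytes);
      if (!cp) return 1;

    #pragma omp parallel for
      for (long long n = 0; n < bytes; n += 4096) cp[n] = 0;

      std::printf("allocated and first-touched %lld bytes\n", bytes);
      std::free(cp);
    }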
@@ -153,8 +153,10 @@ void CartesianCommunicator::ShmInitGeneric(void){
  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
 #endif
  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
-  if (ShmCommBuf == (void *)MAP_FAILED) exit(EXIT_FAILURE);
-  std::cout << "ShmCommBuf "<<ShmCommBuf<<std::endl;
+  if (ShmCommBuf == (void *)MAP_FAILED) {
+    perror("mmap failed ");
+    exit(EXIT_FAILURE);
+  }
 #else
  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
  ShmCommBuf=(void *)&ShmBufStorageVector[0];
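The ShmInitGeneric change swaps a silent exit for a perror before exiting, which reports errno and so tells you, for instance, that MAP_HUGETLB failed because no hugepages were reserved. The same failure-reporting shape in miniature (Linux-specific; buffer size arbitrary):

    #include <cstdio>
    #include <cstdlib>
    #include <sys/mman.h>

    // Same shape as the ShmInitGeneric hunk: report errno via perror before
    // exiting, instead of dying silently.
    int main() {
      long bytes = 1 << 26;
      int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
      // mmap_flag |= MAP_HUGETLB;  // as above, only when hugepages are reserved
      void *buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
      if (buf == MAP_FAILED) {
        perror("mmap failed ");
        exit(EXIT_FAILURE);
      }
      std::printf("mapped %ld bytes at %p\n", bytes, buf);
      munmap(buf, bytes);
    }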
@@ -221,8 +221,9 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
  if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
  assert(((uint64_t)ptr&0x3F)==0);
 
-  // Try to force numa domain on the shm segment if we have numaif.h
-#ifdef HAVE_NUMAIF_H
+  // Experiments; Try to force numa domain on the shm segment if we have numaif.h
+#if 0
+  //#ifdef HAVE_NUMAIF_H
  int status;
  int flags=MPOL_MF_MOVE;
 #ifdef KNL
@@ -242,11 +242,24 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                          int recv_from_rank,
                                                          int bytes,int dir)
 {
-  StencilSendToRecvFrom(xmit,xmit_to_rank,recv,recv_from_rank,bytes,dir);
+  int myrank = _processor;
+  int ierr;
+  assert(dir < communicator_halo.size());
+
+  // std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  MPI_Request req[2];
+  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
+
+  list.push_back(req[0]);
+  list.push_back(req[1]);
+  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
-  // Do nothing
+  int nreq=waitall.size();
+  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
 };
 double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
                                                     int xmit_to_rank,
|
|||||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
MPI_Request req[2];
|
MPI_Request req[2];
|
||||||
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
|
||||||
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank,myrank, communicator_halo[dir], &req[0]);
|
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
|
||||||
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
|
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
|
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 
  for(int i=0;i<Ls;i++){
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
-    // assert(fabs(bee[i])>0.0);
+    assert(fabs(bee[i])>0.0);
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
    beo[i]=as[i]*bs[i];
    ceo[i]=-as[i]*cs[i];
|
|||||||
dee[i] = bee[i];
|
dee[i] = bee[i];
|
||||||
|
|
||||||
if ( i < Ls-1 ) {
|
if ( i < Ls-1 ) {
|
||||||
|
|
||||||
|
assert(fabs(bee[i])>0.0);
|
||||||
|
assert(fabs(bee[0])>0.0);
|
||||||
|
|
||||||
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
||||||
|
|
||||||
leem[i]=mass*cee[Ls-1]/bee[0];
|
leem[i]=mass*cee[Ls-1]/bee[0];
|
||||||
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
|
for(int j=0;j<i;j++) {
|
||||||
|
assert(fabs(bee[j+1])>0.0);
|
||||||
|
leem[i]*= aee[j]/bee[j+1];
|
||||||
|
}
|
||||||
|
|
||||||
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
||||||
|
|
||||||
@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
|||||||
{
|
{
|
||||||
Coeff_t delta_d=mass*cee[Ls-1];
|
Coeff_t delta_d=mass*cee[Ls-1];
|
||||||
for(int j=0;j<Ls-1;j++) {
|
for(int j=0;j<Ls-1;j++) {
|
||||||
// assert(fabs(bee[j])>0.0);
|
assert(fabs(bee[j])>0.0);
|
||||||
delta_d *= cee[j]/bee[j];
|
delta_d *= cee[j]/bee[j];
|
||||||
}
|
}
|
||||||
dee[Ls-1] += delta_d;
|
dee[Ls-1] += delta_d;
|
||||||
|
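The CayleyFermion5D hunks promote commented-out asserts to live ones at each point where the LDU decomposition divides by a bee[] pivot; a vanishing pivot would otherwise fill the coefficients with infs silently. Reduced to its essence (the coefficients below are illustrative placeholders, not the real Cayley values):

    #include <cassert>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Guarded divisions as in SetCoefficientsInternal above: assert each
    // pivot bee[i] is nonzero before sub/super-diagonal entries divide by it.
    int main() {
      int Ls = 8;
      double M5 = 1.8, b = 1.0;                  // illustrative inputs
      std::vector<double> bee(Ls), cee(Ls), lee(Ls);
      for (int i = 0; i < Ls; i++) {
        bee[i] = b * (4.0 - M5) + 1.0;
        cee[i] = 1.0 - (4.0 - M5);               // placeholder coefficient
      }
      for (int i = 0; i < Ls - 1; i++) {
        assert(std::fabs(bee[i]) > 0.0);         // fail loudly, not with infs
        lee[i] = -cee[i + 1] / bee[i];
      }
      std::printf("lee[0]=%f\n", lee[0]);
    }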
@@ -238,7 +238,35 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
 template<class vobj,class cobj>
 class WilsonStencil : public CartesianStencil<vobj,cobj> {
 public:
+  double timer0;
+  double timer1;
+  double timer2;
+  double timer3;
+  double timer4;
+  double timer5;
+  double timer6;
+  uint64_t callsi;
+  void ZeroCountersi(void)
+  {
+    std::cout << GridLogMessage << " ZeroCountersi()"<<std::endl;
+    timer0=0;
+    timer1=0;
+    timer2=0;
+    timer3=0;
+    timer4=0;
+    timer5=0;
+    timer6=0;
+    callsi=0;
+  }
+  void Reporti(int calls)
+  {
+    std::cout << GridLogMessage << " Reporti() calls " <<callsi << calls<<std::endl;
+    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
+    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
+    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
+    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
+    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
+  }
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 
  std::vector<int> same_node;
@@ -252,6 +280,7 @@ public:
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
      same_node(npoints)
  {
+    ZeroCountersi();
    surface_list.resize(0);
  };
 
@@ -282,17 +311,25 @@ public:
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    this->HaloExchangeOptGather(source,compress);
+    double t1=usecond();
    this->CommunicateBegin(reqs);
    this->CommunicateComplete(reqs);
+    double t2=usecond(); timer1 += t2-t1;
    this->CommsMerge(compress);
+    double t3=usecond(); timer2 += t3-t2;
    this->CommsMergeSHM(compress);
+    double t4=usecond(); timer3 += t4-t3;
  }
 
  template <class compressor>
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
  {
    this->Prepare();
+    double t0=usecond();
    this->HaloGatherOpt(source,compress);
+    double t1=usecond();
+    timer0 += t1-t0;
+    callsi++;
  }
 
  template <class compressor>
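The WilsonStencil additions are plain bracket timers: read usecond() either side of each phase, accumulate the difference into a named counter, and divide by the call count at report time. The skeleton of that pattern, with a chrono-based stand-in for Grid's usecond():

    #include <chrono>
    #include <cstdint>
    #include <cstdio>

    // Bracket-timer skeleton matching the timer0..timer3 instrumentation
    // above; usecond() here is a chrono-based stand-in for Grid's version.
    static double usecond() {
      using namespace std::chrono;
      return duration<double, std::micro>(
          steady_clock::now().time_since_epoch()).count();
    }

    struct PhaseTimers {
      double timer0 = 0, timer1 = 0;
      uint64_t callsi = 0;
      void Reporti(int calls) {
        if (timer0) std::printf("gather      %f us/call\n", timer0 / calls);
        if (timer1) std::printf("communicate %f us/call\n", timer1 / calls);
      }
    };

    int main() {
      PhaseTimers t;
      for (int call = 0; call < 100; call++) {
        double t0 = usecond();
        /* gather phase: the work being timed goes here */
        double t1 = usecond();  t.timer0 += t1 - t0;
        /* communicate phase */
        double t2 = usecond();  t.timer1 += t2 - t1;
        t.callsi++;
      }
      t.Reporti(100);
    }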
@@ -304,7 +341,9 @@ public:
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 
+    this->mpi3synctime_g-=usecond();
    this->_grid->StencilBarrier();
+    this->mpi3synctime_g+=usecond();
 
    assert(source._grid==this->_grid);
    this->halogtime-=usecond();
@@ -185,6 +185,11 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
  }
+  if ( DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
+  }
 }
 
 template<class Impl>
@@ -204,6 +209,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
+  Stencil.ZeroCountersi();
+  StencilEven.ZeroCountersi();
+  StencilOdd.ZeroCountersi();
 }
 
 
@@ -445,6 +453,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  DhopCommTime += ctime;
  DhopComputeTime+=ptime;
 
+  // First to enter, last to leave timing
+  st.CollateThreads();
+
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  // Timing info; ugly; possibly temporary
  /////////////////////////////////////////
  double commtime;
+  double mpi3synctime;
+  double mpi3synctime_g;
+  double shmmergetime;
  double gathertime;
  double gathermtime;
  double halogtime;
@@ -185,8 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  double splicetime;
  double nosplicetime;
  double calls;
-  std::vector<double> comms_bytesthr;
-  std::vector<double> commtimethr;
+  std::vector<double> comm_bytes_thr;
+  std::vector<double> comm_time_thr;
+  std::vector<double> comm_enter_thr;
+  std::vector<double> comm_leave_thr;
 
  ////////////////////////////////////////
  // Stencil query
@@ -262,18 +267,45 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 #endif
    if (nthreads == -1) nthreads = 1;
    if (mythread < nthreads) {
+      comm_enter_thr[mythread] = usecond();
      for (int i = mythread; i < Packets.size(); i += nthreads) {
-        double start = usecond();
        uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
                                                      Packets[i].to_rank,
                                                      Packets[i].recv_buf,
                                                      Packets[i].from_rank,
                                                      Packets[i].bytes,i);
-        comms_bytesthr[mythread] += bytes;
-        commtimethr[mythread] += usecond() - start;
+        comm_bytes_thr[mythread] += bytes;
      }
+      comm_leave_thr[mythread]= usecond();
+      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
    }
  }
 
+  void CollateThreads(void)
+  {
+    int nthreads = CartesianCommunicator::nCommThreads;
+    double first=0.0;
+    double last =0.0;
+
+    for(int t=0;t<nthreads;t++) {
+
+      double t0 = comm_enter_thr[t];
+      double t1 = comm_leave_thr[t];
+      comms_bytes+=comm_bytes_thr[t];
+
+      comm_enter_thr[t] = 0.0;
+      comm_leave_thr[t] = 0.0;
+      comm_time_thr[t]  = 0.0;
+      comm_bytes_thr[t] = 0;
+
+      if ( first == 0.0 ) first = t0; // first is t0
+      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
+
+      if ( t1 > last ) last = t1; // max time seen
+
+    }
+    commtime+= last-first;
+  }
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    reqs.resize(Packets.size());
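CollateThreads implements first-to-enter, last-to-leave accounting: when several threads drive communications concurrently, the honest wall-clock cost of the phase is max(leave) - min(enter) across the threads, not the sum of per-thread durations. The window computation in isolation (inputs are made-up microsecond stamps; a zeroed pair marks an idle thread, as in the diff):

    #include <cstdio>
    #include <vector>

    // First-to-enter / last-to-leave window, as CollateThreads computes it:
    // the concurrent comms phase costs max(leave)-min(enter) wall-clock time.
    int main() {
      std::vector<double> comm_enter_thr({100.0, 102.0, 0.0, 101.0});  // usec
      std::vector<double> comm_leave_thr({150.0, 160.0, 0.0, 155.0});  // 0 = idle

      double first = 0.0, last = 0.0;
      for (size_t t = 0; t < comm_enter_thr.size(); t++) {
        double t0 = comm_enter_thr[t];
        double t1 = comm_leave_thr[t];
        if (first == 0.0) first = t0;                    // seed
        if (t0 > 0.0 && t0 < first) first = t0;          // min enter seen
        if (t1 > last) last = t1;                        // max leave seen
      }
      std::printf("comm window = %.1f us\n", last - first);  // 60.0
    }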
@@ -295,14 +327,48 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
    }
    commtime+=usecond();
  }
+  void Communicate(void)
+  {
+#ifdef GRID_OMP
+#pragma omp parallel
+    {
+      // must be called in parallel region
+      int mythread   = omp_get_thread_num();
+      int maxthreads = omp_get_max_threads();
+      int nthreads   = CartesianCommunicator::nCommThreads;
+      assert(nthreads <= maxthreads);
+
+      if (nthreads == -1) nthreads = 1;
+#else
+      int mythread = 0;
+      int nthreads = 1;
+#endif
+      if (mythread < nthreads) {
+        for (int i = mythread; i < Packets.size(); i += nthreads) {
+          double start = usecond();
+          comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+                                                                   Packets[i].to_rank,
+                                                                   Packets[i].recv_buf,
+                                                                   Packets[i].from_rank,
+                                                                   Packets[i].bytes,i);
+          comm_time_thr[mythread] += usecond() - start;
+        }
+      }
+#ifdef GRID_OMP
+    }
+#endif
+  }
 
  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    Prepare();
    HaloGather(source,compress);
+    // Concurrent
    CommunicateBegin(reqs);
    CommunicateComplete(reqs);
+    // Sequential
+    // Communicate();
    CommsMergeSHM(compress);
    CommsMerge(compress);
  }
@@ -363,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
+    mpi3synctime_g-=usecond();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
+    mpi3synctime_g+=usecond();
 
    // conformable(source._grid,_grid);
    assert(source._grid==_grid);
@@ -423,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
    CommsMerge(decompress,Mergers,Decompressions);
  }
  template<class decompressor> void CommsMergeSHM(decompressor decompress) {
+    mpi3synctime-=usecond();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
+    mpi3synctime+=usecond();
+    shmmergetime-=usecond();
    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+    shmmergetime+=usecond();
  }
 
  template<class decompressor>
@@ -470,8 +542,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
                   const std::vector<int> &distances)
    : _permute_type(npoints),
      _comm_buf_size(npoints),
-      comms_bytesthr(npoints),
-      commtimethr(npoints)
+      comm_bytes_thr(npoints),
+      comm_enter_thr(npoints),
+      comm_leave_thr(npoints),
+      comm_time_thr(npoints)
  {
    face_table_computed=0;
    _npoints = npoints;
@@ -1025,8 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  void ZeroCounters(void) {
    gathertime = 0.;
    commtime = 0.;
-    memset(&commtimethr[0], 0, sizeof(commtimethr));
-    memset(&comms_bytesthr[0], 0, sizeof(comms_bytesthr));
+    mpi3synctime=0.;
+    mpi3synctime_g=0.;
+    shmmergetime=0.;
+    for(int i=0;i<_npoints;i++){
+      comm_time_thr[i]=0;
+      comm_bytes_thr[i]=0;
+      comm_enter_thr[i]=0;
+      comm_leave_thr[i]=0;
+    }
    halogtime = 0.;
    mergetime = 0.;
    decompresstime = 0.;
@@ -1043,13 +1124,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
    double t = 0;
-    // if commtimethr is set they were all done in parallel so take the max
+    // if comm_time_thr is set they were all done in parallel so take the max
    // but add up the bytes
+    int threaded = 0 ;
    for (int i = 0; i < 8; ++i) {
-      comms_bytes += comms_bytesthr[i];
-      if (t < commtimethr[i]) t = commtimethr[i];
+      if ( comm_time_thr[i]>0.0 ) {
+        threaded = 1;
+        comms_bytes += comm_bytes_thr[i];
+        if (t < comm_time_thr[i]) t = comm_time_thr[i];
+      }
    }
-    commtime += t;
+    if (threaded) commtime += t;
 
    _grid->GlobalSum(commtime); commtime/=NP;
    if ( calls > 0. ) {
@@ -1065,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
      std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
      std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
    }
+    PRINTIT(mpi3synctime);
+    PRINTIT(mpi3synctime_g);
+    PRINTIT(shmmergetime);
    PRINTIT(splicetime);
    PRINTIT(nosplicetime);
  }