Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-14 01:35:36 +00:00)

Commit a5fe07c077: Merge branch 'develop' of https://github.com/paboyle/Grid into develop

.travis.yml (68 lines changed)

@@ -9,68 +9,6 @@ matrix:
     - os: osx
       osx_image: xcode8.3
       compiler: clang
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.9
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-4.9
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-5
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-5
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-
 before_install:
     - export GRIDDIR=`pwd`

@@ -106,9 +44,3 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check
-    - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-

README.md (16 lines changed)

@@ -1,18 +1,4 @@
-# Grid
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
-<table>
-<tr>
-    <td>Last stable release</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
-    </td>
-</tr>
-<tr>
-    <td>Development branch</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
-    </td>
-</tr>
-</table>
-
 **Data parallel C++ mathematical object library.**

TODO (4 lines changed)

@@ -6,6 +6,7 @@ Large item work list:
 1)- BG/Q port and check
 2)- Christoph's local basis expansion Lanczos
 3)- Precision conversion and sort out localConvert <-- partial

 - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
 4)- Physical propagator interface
 5)- Conserved currents

@@ -13,7 +14,8 @@ Large item work list:
 7)- HDCR resume

 Recent DONE
--- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location <-- DONE
 -- Scidac and Ildg metadata handling <-- DONE

@@ -32,6 +32,19 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
+
 struct time_statistics{
   double mean;

@@ -95,13 +108,15 @@ public:

   static void Comms(void)
   {
-    int Nloop=100;
+    int Nloop=200;
     int nmu=0;
     int maxlat=32;

     std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
     std::vector<int> mpi_layout  = GridDefaultMpi();

+    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
     std::vector<double> t_time(Nloop);
     time_statistics timestat;

@@ -133,13 +148,14 @@ public:
       bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
     }

-    int ncomm;
     int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+    int ncomm;
     double dbytes;
+    std::vector<double> times(Nloop);
     for(int i=0;i<Nloop;i++){

       double start=usecond();

-      std::vector<CartesianCommunicator::CommsRequest_t> requests;
       dbytes=0;
       ncomm=0;

@@ -150,7 +166,6 @@ public:

       if (mpi_layout[mu]>1 ) {

-        ncomm++;
         int xmit_to_rank;
         int recv_from_rank;
         if ( dir == mu ) {

@@ -160,18 +175,18 @@ public:
           int comm_proc = mpi_layout[mu]-1;
           Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
         }
-#if 0
-        tbytes= Grid.StencilSendToRecvFromBegin(requests,
-                                                (void *)&xbuf[dir][0],
-                                                xmit_to_rank,
-                                                (void *)&rbuf[dir][0],
-                                                recv_from_rank,
-                                                bytes,dir);
-        Grid.StencilSendToRecvFromComplete(requests,dir);
-#endif
-        requests.resize(0);
-
+        tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+                                           (void *)&rbuf[dir][0], recv_from_rank,
+                                           bytes,dir);
+#ifdef GRID_OMP
 #pragma omp atomic
+#endif
+        ncomm++;
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
         dbytes+=tbytes;
       }
     }

@@ -181,13 +196,15 @@ public:
     }

     timestat.statistics(t_time);
+    // for(int i=0;i<t_time.size();i++){
+    //   std::cout << i<<" "<<t_time[i]<<std::endl;
+    // }

     dbytes=dbytes*ppn;
     double xbytes    = dbytes*0.5;
     double rbytes    = dbytes*0.5;
     double bidibytes = dbytes;

     std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
              <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
              <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "

@@ -196,7 +213,8 @@ public:
              << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;


-      }
+      }
+    }
     }

     return;

@@ -218,7 +236,7 @@ public:
     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

     uint64_t lmax=48;
-#define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

     GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
     for(int lat=8;lat<=lmax;lat+=4){

@@ -253,8 +271,7 @@ public:
     }
   };

-  static void DWF(int Ls,int L)
+  static double DWF5(int Ls,int L)
   {
     RealD mass=0.1;
     RealD M5  =1.8;

@@ -262,6 +279,7 @@ public:
     double mflops;
     double mflops_best = 0;
     double mflops_worst= 0;
+    std::vector<double> mflops_all;

     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size

@@ -274,6 +292,189 @@ public:
                                                              GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+    std::vector<int> internal;
+    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
+    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
+    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
+    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
+    else assert(0);
+
+    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
+    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    std::vector<int> seeds5({5,6,7,8});
+    GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+    GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    ///////// Source preparation ////////////
+    LatticeFermion src (sFGrid); random(RNG5,src);
+    LatticeFermion tmp (sFGrid);
+
+    RealD N2 = 1.0/::sqrt(norm2(src));
+    src = src*N2;
+
+    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);
+
+    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+    LatticeFermion src_e (sFrbGrid);
+    LatticeFermion src_o (sFrbGrid);
+    LatticeFermion r_e   (sFrbGrid);
+    LatticeFermion r_o   (sFrbGrid);
+    LatticeFermion r_eo  (sFGrid);
+    LatticeFermion err   (sFGrid);
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512)
+      const int num_cases = 6;
+      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
+#else
+      const int num_cases = 4;
+      std::string fmt("U/S ; U/O ; G/S ; G/O ");
+#endif
+      controls Cases [] = {
+#ifdef AVX512
+        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
+#endif
+        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential }
+      };
+
+      for(int c=0;c<num_cases;c++) {
+
+        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+        if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+        if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+        int nwarm = 100;
+        double t0=usecond();
+        sFGrid->Barrier();
+        for(int i=0;i<nwarm;i++){
+          sDw.DhopEO(src_o,r_e,DaggerNo);
+        }
+        sFGrid->Barrier();
+        double t1=usecond();
+        // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+        // if (ncall < 500) ncall = 500;
+        uint64_t ncall = 500;
+
+        sFGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+        // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+        sDw.ZeroCounters();
+
+        time_statistics timestat;
+        std::vector<double> t_time(ncall);
+        for(uint64_t i=0;i<ncall;i++){
+          t0=usecond();
+          sDw.DhopEO(src_o,r_e,DaggerNo);
+          t1=usecond();
+          t_time[i] = t1-t0;
+        }
+        sFGrid->Barrier();
+
+        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+        double flops=(1344.0*volume)/2;
+        double mf_hi, mf_lo, mf_err;
+
+        timestat.statistics(t_time);
+        mf_hi = flops/timestat.min;
+        mf_lo = flops/timestat.max;
+        mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+        mflops = flops/timestat.mean;
+        mflops_all.push_back(mflops);
+        if ( mflops_best == 0   ) mflops_best = mflops;
+        if ( mflops_worst== 0   ) mflops_worst= mflops;
+        if ( mflops>mflops_best ) mflops_best = mflops;
+        if ( mflops<mflops_worst) mflops_worst= mflops;
+
+        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
+        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
+
+        sDw.Report();
+
+      }
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness    =   "<< mflops_worst/mflops_best <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+        std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    }
+    return mflops_best;
+  }
+
+  static double DWF(int Ls,int L)
+  {
+    RealD mass=0.1;
+    RealD M5  =1.8;
+
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    std::vector<int> local({L,L,L,L});
+
+    GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
+                                                             GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
     uint64_t SHM=NP/NN;

     std::vector<int> internal;

@@ -364,13 +565,15 @@ public:

 #if defined(AVX512)
     const int num_cases = 6;
+    std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
     const int num_cases = 4;
+    std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
     controls Cases [] = {
-#if defined(AVX512)
-      { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
+#ifdef AVX512
       { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+      { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },
 #endif
       { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
       { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential },

@@ -380,6 +583,10 @@ public:

     for(int c=0;c<num_cases;c++) {

+      QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+      QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+      CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;

@@ -390,11 +597,7 @@ public:
       if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

-      QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-      QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
-      CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-      int nwarm = 10;
+      int nwarm = 200;
       double t0=usecond();
       FGrid->Barrier();
       for(int i=0;i<nwarm;i++){

@@ -402,7 +605,10 @@ public:
       }
       FGrid->Barrier();
       double t1=usecond();
-      uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+      // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+      // if (ncall < 500) ncall = 500;
+      uint64_t ncall = 1000;
+
       FGrid->Broadcast(0,&ncall,sizeof(ncall));

       // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

@@ -428,7 +634,7 @@ public:
       mf_err= flops/timestat.min * timestat.err/timestat.mean;

       mflops = flops/timestat.mean;
+      mflops_all.push_back(mflops);
       if ( mflops_best == 0   ) mflops_best = mflops;
       if ( mflops_worst== 0   ) mflops_worst= mflops;
       if ( mflops>mflops_best ) mflops_best = mflops;

@@ -450,12 +656,20 @@ public:

     }
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best <<std::endl;
-    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<<std::endl;
+    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+    std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
     std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
+    std::cout<<GridLogMessage <<fmt << std::endl;
+    std::cout<<GridLogMessage ;
+
+    for(int i=0;i<mflops_all.size();i++){
+      std::cout<<mflops_all[i]/NN<<" ; " ;
+    }
+    std::cout<<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

     }
+    return mflops_best;
   }

   };

@@ -465,8 +679,11 @@ int main (int argc, char ** argv)
   Grid_init(&argc,&argv);

   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
+#ifdef KNL
+  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
+#else
   LebesgueOrder::Block = std::vector<int>({2,2,2,2});
+#endif
   Benchmark::Decomposition();

   int do_memory=1;

@@ -493,26 +710,66 @@ int main (int argc, char ** argv)
     // empty for now
   }

+  int sel=2;
+  std::vector<int> L_list({8,12,16,24});
+  std::vector<double> wilson;
+  std::vector<double> dwf4;
+  std::vector<double> dwf5;
+
   if ( do_wilson ) {
     int Ls=1;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
-    Benchmark::DWF(Ls,32);
+    for(int l=0;l<L_list.size();l++){
+      wilson.push_back(Benchmark::DWF(1,L_list[l]));
+    }
   }

+  int Ls=16;
   if ( do_dwf ) {
-    int Ls=16;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,8);
-    Benchmark::DWF(Ls,12);
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
+    for(int l=0;l<L_list.size();l++){
+      dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
+    }
   }

+  if ( do_dwf ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
+    }
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    int NN=NN_global;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point result: " << dwf4[sel]/NN <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+  }
+
   Grid_finalize();
 }

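Taken together, the driver above now sweeps the benchmarks over L_list = {8,12,16,24}, records the best mflop/s returned by the Wilson, 4D-vectorised DWF and Ls-vectorised DWF5 kernels, and prints absolute and per-node summary tables. With sel = 2, the final "Comparison point result" line is the per-node DWF4 figure at L = 16.
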
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)

   int Nloop=100;
   int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;

@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   header();
   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size  ({lat*mpi_layout[0],
                                     lat*mpi_layout[1],

@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;

-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);

       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int mu=0;mu<8;mu++){
+        xbuf[mu].resize(lat*lat*lat*Ls);
+        rbuf[mu].resize(lat*lat*lat*Ls);
+        //  std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }

       for(int i=0;i<Nloop;i++){
         double start=usecond();

@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
             int comm_proc=1;
             int xmit_to_rank;
             int recv_from_rank;
-
             Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
             Grid.SendToRecvFromBegin(requests,
                                      (void *)&xbuf[mu][0],

@@ -163,7 +167,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size  ({lat,lat,lat,lat});

@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
       RealD Nnode = Grid.NodeCount();
       RealD ppn = Nrank/Nnode;

-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
+
+      for(int mu=0;mu<8;mu++){
+        xbuf[mu].resize(lat*lat*lat*Ls);
+        rbuf[mu].resize(lat*lat*lat*Ls);
+        //  std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }

       int ncomm;
       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

@@ -249,7 +258,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size  ({lat*mpi_layout[0],
                                     lat*mpi_layout[1],

@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
                                          xmit_to_rank,
                                          (void *)&rbuf[mu][0],
                                          recv_from_rank,
-                                         bytes);
+                                         bytes,mu);

           comm_proc = mpi_layout[mu]-1;

@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
                                          xmit_to_rank,
                                          (void *)&rbuf[mu+4][0],
                                          recv_from_rank,
-                                         bytes);
+                                         bytes,mu+4);

         }
       }
-      Grid.StencilSendToRecvFromComplete(requests);
+      Grid.StencilSendToRecvFromComplete(requests,0);
       Grid.Barrier();
       double stop=usecond();
       t_time[i] = stop-start; // microseconds

@@ -346,7 +355,7 @@ int main (int argc, char ** argv)
   header();

   for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){

       std::vector<int> latt_size  ({lat*mpi_layout[0],
                                     lat*mpi_layout[1],

@@ -393,8 +402,8 @@ int main (int argc, char ** argv)
                                          xmit_to_rank,
                                          (void *)&rbuf[mu][0],
                                          recv_from_rank,
-                                         bytes);
-          Grid.StencilSendToRecvFromComplete(requests);
+                                         bytes,mu);
+          Grid.StencilSendToRecvFromComplete(requests,mu);
           requests.resize(0);

           comm_proc = mpi_layout[mu]-1;

@@ -406,8 +415,8 @@ int main (int argc, char ** argv)
                                          xmit_to_rank,
                                          (void *)&rbuf[mu+4][0],
                                          recv_from_rank,
-                                         bytes);
-          Grid.StencilSendToRecvFromComplete(requests);
+                                         bytes,mu+4);
+          Grid.StencilSendToRecvFromComplete(requests,mu+4);
           requests.resize(0);

         }

@@ -436,5 +445,97 @@ int main (int argc, char ** argv)
     }
   }

+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  header();
+
+  for(int lat=4;lat<=maxlat;lat+=4){
+    for(int Ls=8;Ls<=8;Ls*=2){
+
+      std::vector<int> latt_size  ({lat*mpi_layout[0],
+                                    lat*mpi_layout[1],
+                                    lat*mpi_layout[2],
+                                    lat*mpi_layout[3]});
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      RealD Nrank = Grid._Nprocessors;
+      RealD Nnode = Grid.NodeCount();
+      RealD ppn = Nrank/Nnode;
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+      Grid.ShmBufferFreeAll();
+      for(int d=0;d<8;d++){
+        xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+        bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      }
+
+      int ncomm;
+      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      double dbytes;
+      for(int i=0;i<Nloop;i++){
+        double start=usecond();
+
+        std::vector<CartesianCommunicator::CommsRequest_t> requests;
+        dbytes=0;
+        ncomm=0;
+
+        parallel_for(int dir=0;dir<8;dir++){
+
+          double tbytes;
+          int mu =dir % 4;
+
+          if (mpi_layout[mu]>1 ) {
+
+            ncomm++;
+            int xmit_to_rank;
+            int recv_from_rank;
+            if ( dir == mu ) {
+              int comm_proc=1;
+              Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            } else {
+              int comm_proc = mpi_layout[mu]-1;
+              Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            }
+
+            tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+                                               (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+
+#pragma omp atomic
+            dbytes+=tbytes;
+          }
+        }
+        Grid.Barrier();
+        double stop=usecond();
+        t_time[i] = stop-start; // microseconds
+      }
+
+      timestat.statistics(t_time);
+
+      dbytes=dbytes*ppn;
+      double xbytes    = dbytes*0.5;
+      double rbytes    = dbytes*0.5;
+      double bidibytes = dbytes;
+
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+
+    }
+  }
+
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+
   Grid_finalize();
 }

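The comms benchmark changes mirror the communicator API change later in this commit: every StencilSendToRecvFrom / ...Begin / ...Complete call now carries a direction index, and the new "threaded STENCIL halo exchange" case issues the blocking StencilSendToRecvFrom from a parallel_for over all eight directions. In the plain-MPI builds the base implementation simply discards the extra argument (see the "Discard the dir" comment further down); presumably the new mpit communicator uses it to give each direction its own communicator so the threaded exchanges can proceed concurrently.
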
@@ -503,9 +503,9 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;

-  //assert(norm2(src_e)<1.0e-4);
-  //assert(norm2(src_o)<1.0e-4);
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);

   Grid_finalize();
+  exit(0);
 }

configure.ac (10 lines changed)

@@ -342,14 +342,14 @@ case ${ac_COMMS} in
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
        comms_type='none'
     ;;
-    mpi3l*)
-       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
-       comms_type='mpi3l'
-    ;;
     mpi3*)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
        comms_type='mpi3'
     ;;
+    mpit)
+       AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
+       comms_type='mpit'
+    ;;
     mpi*)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        comms_type='mpi'

@@ -377,7 +377,7 @@ esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])

 ############### RNG selection

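In short, the experimental mpi3l communications target is retired and an mpit target takes its place. Judging from the `case ${ac_COMMS}` pattern it would be selected through the same comms option as the other backends, i.e. something like `./configure --enable-comms=mpit ...`; the exact flag spelling is an assumption based on how mpi, mpi3 and shmem are already chosen.
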
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
   extra_sources+=communicator/Communicator_base.cc
 endif

-if BUILD_COMMS_MPI3L
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+  extra_sources+=communicator/Communicator_mpit.cc
   extra_sources+=communicator/Communicator_base.cc
 endif

@@ -1,7 +1,5 @@
-
-
-
 #include <Grid/GridCore.h>
+#include <fcntl.h>

 namespace Grid {

@@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
   return NULL;
 }

+void check_huge_pages(void *Buf,uint64_t BYTES)
+{
+#ifdef __linux__
+  int fd = open("/proc/self/pagemap", O_RDONLY);
+  assert(fd >= 0);
+  const int page_size = 4096;
+  uint64_t virt_pfn = (uint64_t)Buf / page_size;
+  off_t offset = sizeof(uint64_t) * virt_pfn;
+  uint64_t npages = (BYTES + page_size-1) / page_size;
+  uint64_t pagedata[npages];
+  uint64_t ret = lseek(fd, offset, SEEK_SET);
+  assert(ret == offset);
+  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
+  assert(ret == sizeof(uint64_t) * npages);
+  int nhugepages = npages / 512;
+  int n4ktotal, nnothuge;
+  n4ktotal = 0;
+  nnothuge = 0;
+  for (int i = 0; i < nhugepages; ++i) {
+    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
+    for (int j = 0; j < 512; ++j) {
+      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
+      ++n4ktotal;
+      if (pageaddr != baseaddr + j * page_size)
+        ++nnothuge;
+    }
+  }
+  int rank = CartesianCommunicator::RankWorld();
+  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
+#endif
+}
+
 }

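The new check_huge_pages helper reads /proc/self/pagemap and, for each 512-page (2 MB) group of the buffer, counts how many 4 kB pages are not physically contiguous with the first page of the group, i.e. not served from a single huge page. A minimal usage sketch, assuming the Grid headers, the Vector alias over the aligned allocator, and the declaration added in the header change below; the buffer size is arbitrary and purely illustrative:

    #include <Grid/Grid.h>

    int main(int argc, char **argv) {
      Grid::Grid_init(&argc, &argv);

      // ~128 MB allocated through the (now 2MB-aligned) Grid allocator.
      Grid::Vector<double> buf(16*1024*1024);

      // Report how many of the buffer's 4k pages are not backed by huge pages.
      Grid::check_huge_pages((void *)&buf[0], buf.size()*sizeof(double));

      Grid::Grid_finalize();
      return 0;
    }
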
@@ -64,6 +64,8 @@ namespace Grid {

 };

+  void check_huge_pages(void *Buf,uint64_t BYTES);
+
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////

@@ -92,12 +94,20 @@ public:
     size_type bytes = __n*sizeof(_Tp);

     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
+    //    if ( ptr != NULL )
+    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
+
+    //////////////////
+    // Hack 2MB align; could make option probably doesn't need configurability
+    //////////////////
+    //define GRID_ALLOC_ALIGN (128)
+#define GRID_ALLOC_ALIGN (2*1024*1024)
 #ifdef HAVE_MM_MALLOC_H
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 #else
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
+    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
     // First touch optimise in threaded loop
     uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP

@@ -111,6 +121,7 @@ public:

   void deallocate(pointer __p, size_type __n) {
     size_type bytes = __n * sizeof(_Tp);
+
     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);

 #ifdef HAVE_MM_MALLOC_H

@@ -189,9 +200,9 @@ public:
   pointer allocate(size_type __n, const void* _p= 0)
   {
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
 #endif
     size_type bytes = __n*sizeof(_Tp);
     uint8_t *cp = (uint8_t *)ptr;

|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
@ -34,7 +38,10 @@ namespace Grid {
|
|||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * CartesianCommunicator::ShmCommBuf;
|
void * CartesianCommunicator::ShmCommBuf;
|
||||||
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
|
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
|
||||||
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
CartesianCommunicator::CommunicatorPolicy_t
|
||||||
|
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
|
||||||
|
int CartesianCommunicator::nCommThreads = -1;
|
||||||
|
int CartesianCommunicator::Hugepages = 0;
|
||||||
|
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
// Alloc, free shmem region
|
// Alloc, free shmem region
|
||||||
@@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N);
}

-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3)

int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
+#endif
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
-int bytes)
+int bytes, int dir)
{
+std::vector<CommsRequest_t> list;
+// Discard the "dir"
+SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+SendToRecvFromComplete(list);
+return 2.0*bytes;
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void *xmit,
+int xmit_to_rank,
+void *recv,
+int recv_from_rank,
+int bytes, int dir)
+{
+// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
+#endif

+#if !defined( GRID_COMMS_MPI3)

void CartesianCommunicator::StencilBarrier(void){};

commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@@ -121,8 +146,22 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void){
+#if 1
+
+int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
+#ifdef MAP_HUGETLB
+if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
+#endif
+ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
+if (ShmCommBuf == (void *)MAP_FAILED) {
+perror("mmap failed ");
+exit(EXIT_FAILURE);
+}
+#else
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];
+#endif
+bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
}

#endif
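The ShmInitGeneric change above allocates the generic comms buffer with mmap and adds MAP_HUGETLB when the new Hugepages flag is set. A minimal standalone sketch of the same pattern, with a hypothetical buffer size and a fallback to ordinary pages if the huge-page request is refused (the fallback is an assumption for illustration; the patched code simply exits on failure):

  #include <sys/mman.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <strings.h>

  int main(void) {
    size_t bytes = 128*1024*1024;              // hypothetical buffer size
    int flags = MAP_SHARED | MAP_ANONYMOUS;
  #ifdef MAP_HUGETLB
    flags |= MAP_HUGETLB;                      // ask for explicit huge pages when available
  #endif
    void *buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE, flags, -1, 0);
  #ifdef MAP_HUGETLB
    if (buf == MAP_FAILED) {                   // fall back to normal pages (assumption, not in the patch)
      flags &= ~MAP_HUGETLB;
      buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE, flags, -1, 0);
    }
  #endif
    if (buf == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); }
    bzero(buf, bytes);                         // zero the region, as the patch does
    munmap(buf, bytes);
    return 0;
  }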
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
@@ -50,12 +50,24 @@ namespace Grid {
class CartesianCommunicator {
public:

-// 65536 ranks per node adequate for now
+////////////////////////////////////////////
+// Isend/Irecv/Wait, or Sendrecv blocking
+////////////////////////////////////////////
+enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
+static CommunicatorPolicy_t CommunicatorPolicy;
+static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
+
+///////////////////////////////////////////
+// Up to 65536 ranks per node adequate for now
// 128MB shared memory for comms enought for 48^4 local vol comms
// Give external control (command line override?) of this
+///////////////////////////////////////////
static const int MAXLOG2RANKSPERNODE = 16;
static uint64_t MAX_MPI_SHM_BYTES;
+static int nCommThreads;
+// use explicit huge pages
+static int Hugepages;

// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
@@ -64,14 +76,18 @@ class CartesianCommunicator {
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;

-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
static MPI_Comm communicator_world;
-MPI_Comm communicator;
+
+MPI_Comm communicator;
+std::vector<MPI_Comm> communicator_halo;
+
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
#endif


////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
@@ -117,10 +133,6 @@ class CartesianCommunicator {
/////////////////////////////////
static void * ShmCommBuf;

-// Isend/Irecv/Wait, or Sendrecv blocking
-enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
-static CommunicatorPolicy_t CommunicatorPolicy;
-static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }

size_t heap_top;
size_t heap_bytes;
@@ -211,14 +223,21 @@ class CartesianCommunicator {

void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

-double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
-int xmit_to_rank,
-void *recv,
-int recv_from_rank,
-int bytes);
+double StencilSendToRecvFrom(void *xmit,
+int xmit_to_rank,
+void *recv,
+int recv_from_rank,
+int bytes,int dir);

-void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void *xmit,
+int xmit_to_rank,
+void *recv,
+int recv_from_rank,
+int bytes,int dir);


+void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void);

////////////////////////////////////////////////////////////
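The header now exposes three run-time knobs on CartesianCommunicator: the send/receive policy, the number of communication threads, and the explicit huge-page switch. A minimal sketch of setting them from application code before grids are built (values are illustrative; normally Grid_init sets these from command-line flags, and the include path is an assumption):

  #include <Grid/GridCore.h>   // assumed to pull in the CartesianCommunicator declaration

  void configure_comms(void) {
    using namespace Grid;
    // one direction at a time, blocking MPI_Sendrecv under the hood
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
    CartesianCommunicator::nCommThreads = 2;   // illustrative; -1 leaves comm threading disabled
    CartesianCommunicator::Hugepages    = 1;   // back the shm comms buffer with MAP_HUGETLB
  }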
@@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif

namespace Grid {

@@ -214,12 +212,18 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);

-void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+int mmap_flag = MAP_SHARED;
+#ifdef MAP_HUGETLB
+if (Hugepages) mmap_flag |= MAP_HUGETLB;
+#endif
+void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
+
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);

-// Try to force numa domain on the shm segment if we have numaif.h
-#ifdef HAVE_NUMAIF_H
+// Experiments; Try to force numa domain on the shm segment if we have numaif.h
+#if 0
+//#ifdef HAVE_NUMAIF_H
int status;
int flags=MPOL_MF_MOVE;
#ifdef KNL
@@ -266,7 +270,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
key_t key = 0x4545 + r;
-if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+int flags = IPC_CREAT | SHM_R | SHM_W;
+#ifdef SHM_HUGETLB
+flags|=SHM_HUGETLB;
+#endif
+if ((shmids[r]= shmget(key,size, flags)) < 0) {
int errsv = errno;
printf("Errno %d\n",errsv);
perror("shmget");
@@ -397,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
int ierr;
communicator=communicator_world;

_ndimension = processors.size();

+communicator_halo.resize (2*_ndimension);
+for(int i=0;i<_ndimension*2;i++){
+MPI_Comm_dup(communicator,&communicator_halo[i]);
+}
+
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
@@ -621,13 +635,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
}
}

-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
-int dest,
-void *recv,
-int from,
-int bytes)
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+int dest,
+void *recv,
+int from,
+int bytes,int dir)
{
+std::vector<CommsRequest_t> list;
+double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
+StencilSendToRecvFromComplete(list,dir);
+return offbytes;
+}
+
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void *xmit,
+int dest,
+void *recv,
+int from,
+int bytes,int dir)
+{
+assert(dir < communicator_halo.size());
+
MPI_Request xrq;
MPI_Request rrq;

@@ -646,26 +674,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
gfrom = MPI_UNDEFINED;
#endif
if ( gfrom ==MPI_UNDEFINED) {
-ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}

if ( gdest == MPI_UNDEFINED ) {
-ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
}

if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-this->StencilSendToRecvFromComplete(list);
+this->StencilSendToRecvFromComplete(list,dir);
}

return off_node_bytes;
}
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
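Both the changes above and the new MPIT communicator below lean on one idea: duplicate the Cartesian communicator once per halo direction, so each direction's Isend/Irecv pair matches only traffic tagged for that direction and can be progressed independently. A minimal MPI-only sketch of that setup, independent of Grid (names are illustrative):

  #include <mpi.h>
  #include <vector>

  // Build one duplicated communicator per halo direction (2*ndimension of them),
  // mirroring the communicator_halo vector introduced in this commit.
  std::vector<MPI_Comm> make_halo_comms(MPI_Comm cart, int ndimension) {
    std::vector<MPI_Comm> halo(2*ndimension);
    for (int i = 0; i < 2*ndimension; i++) {
      MPI_Comm_dup(cart, &halo[i]);   // independent message matching per direction
    }
    return halo;
  }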
lib/communicator/Communicator_mpit.cc (new file, 286 lines)
@@ -0,0 +1,286 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/communicator/Communicator_mpi.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#include <Grid/GridCore.h>
+#include <Grid/GridQCDcore.h>
+#include <Grid/qcd/action/ActionCore.h>
+#include <mpi.h>
+
+namespace Grid {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Info that is setup once and indept of cartesian layout
+///////////////////////////////////////////////////////////////////////////////////////////////////
+MPI_Comm CartesianCommunicator::communicator_world;
+
+// Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+int flag;
+int provided;
+MPI_Initialized(&flag); // needed to coexist with other libs apparently
+if ( !flag ) {
+MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
+if ( provided != MPI_THREAD_MULTIPLE ) {
+QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
+}
+}
+MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
+ShmInitGeneric();
+}
+
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
+{
+_ndimension = processors.size();
+std::vector<int> periodic(_ndimension,1);
+
+_Nprocessors=1;
+_processors = processors;
+_processor_coor.resize(_ndimension);
+
+MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
+MPI_Comm_rank(communicator,&_processor);
+MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
+
+for(int i=0;i<_ndimension;i++){
+_Nprocessors*=_processors[i];
+}
+
+communicator_halo.resize (2*_ndimension);
+for(int i=0;i<_ndimension*2;i++){
+MPI_Comm_dup(communicator,&communicator_halo[i]);
+}
+
+int Size;
+MPI_Comm_size(communicator,&Size);
+
+assert(Size==_Nprocessors);
+}
+void CartesianCommunicator::GlobalSum(uint32_t &u){
+int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(float &f){
+int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
+{
+int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(double *d,int N)
+{
+int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+assert(ierr==0);
+}
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
+{
+int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
+assert(ierr==0);
+}
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
+{
+int rank;
+int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
+assert(ierr==0);
+return rank;
+}
+void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{
+coor.resize(_ndimension);
+int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
+assert(ierr==0);
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+int dest,
+void *recv,
+int from,
+int bytes)
+{
+std::vector<CommsRequest_t> reqs(0);
+SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
+SendToRecvFromComplete(reqs);
+}
+
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+void *recv,
+int sender,
+int receiver,
+int bytes)
+{
+MPI_Status stat;
+assert(sender != receiver);
+int tag = sender;
+if ( _processor == sender ) {
+MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
+}
+if ( _processor == receiver ) {
+MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
+}
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void *xmit,
+int dest,
+void *recv,
+int from,
+int bytes)
+{
+int myrank = _processor;
+int ierr;
+if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
+MPI_Request xrq;
+MPI_Request rrq;
+
+ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+
+assert(ierr==0);
+list.push_back(xrq);
+list.push_back(rrq);
+} else {
+// Give the CPU to MPI immediately; can use threads to overlap optionally
+ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
+recv,bytes,MPI_CHAR,from, from,
+communicator,MPI_STATUS_IGNORE);
+assert(ierr==0);
+}
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
+int nreq=list.size();
+std::vector<MPI_Status> status(nreq);
+int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+assert(ierr==0);
+}
+}
+
+void CartesianCommunicator::Barrier(void)
+{
+int ierr = MPI_Barrier(communicator);
+assert(ierr==0);
+}
+
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+int ierr=MPI_Bcast(data,
+bytes,
+MPI_BYTE,
+root,
+communicator);
+assert(ierr==0);
+}
+///////////////////////////////////////////////////////
+// Should only be used prior to Grid Init finished.
+// Check for this?
+///////////////////////////////////////////////////////
+int CartesianCommunicator::RankWorld(void){
+int r;
+MPI_Comm_rank(communicator_world,&r);
+return r;
+}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+{
+int ierr= MPI_Bcast(data,
+bytes,
+MPI_BYTE,
+root,
+communicator_world);
+assert(ierr==0);
+}
+
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void *xmit,
+int xmit_to_rank,
+void *recv,
+int recv_from_rank,
+int bytes,int dir)
+{
+int myrank = _processor;
+int ierr;
+assert(dir < communicator_halo.size());
+
+//    std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
+// Give the CPU to MPI immediately; can use threads to overlap optionally
+MPI_Request req[2];
+MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
+
+list.push_back(req[0]);
+list.push_back(req[1]);
+return 2.0*bytes;
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{
+int nreq=waitall.size();
+MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
+};
+double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
+int xmit_to_rank,
+void *recv,
+int recv_from_rank,
+int bytes,int dir)
+{
+int myrank = _processor;
+int ierr;
+assert(dir < communicator_halo.size());
+
+//    std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
+// Give the CPU to MPI immediately; can use threads to overlap optionally
+MPI_Request req[2];
+MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
+MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
+return 2.0*bytes;
+}
+
+
+
+}
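The new communicator asks for MPI_THREAD_MULTIPLE and quietly downgrades the Wilson kernels to CommsThenCompute if the library does not grant it, since several OpenMP threads may call StencilSendToRecvFrom concurrently. A minimal, Grid-independent check of the granted thread level (an illustrative helper, not part of the patch; MPI must already be initialised when it is called):

  #include <mpi.h>

  // Returns true only if the MPI library really granted MPI_THREAD_MULTIPLE,
  // i.e. it is safe to drive communications from several threads at once.
  bool comms_may_be_threaded(void) {
    int provided = MPI_THREAD_SINGLE;
    MPI_Query_thread(&provided);
    return provided == MPI_THREAD_MULTIPLE;
  }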
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif

-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
#include <Grid/cshift/Cshift_mpi.h>
#endif

@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) {
int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM
@@ -29,7 +29,7 @@
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H

-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO
#else
#undef USE_MPI_IO
@@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
for(int i=0; i < Ls; i++){
as[i] = 1.0;
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
-// assert(fabs(omega[i])>0.0);
+assert(omega[i]!=Coeff_t(0.0));
bs[i] = 0.5*(bpc/omega[i] + bmc);
cs[i] = 0.5*(bpc/omega[i] - bmc);
}
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co

for(int i=0;i<Ls;i++){
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
-// assert(fabs(bee[i])>0.0);
+assert(bee[i]!=Coeff_t(0.0));
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
beo[i]=as[i]*bs[i];
ceo[i]=-as[i]*cs[i];
@@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co

if ( i < Ls-1 ) {
+
+assert(bee[i]!=Coeff_t(0.0));
+assert(bee[0]!=Coeff_t(0.0));
+
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column

leem[i]=mass*cee[Ls-1]/bee[0];
-for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
+for(int j=0;j<i;j++) {
+assert(bee[j+1]!=Coeff_t(0.0));
+leem[i]*= aee[j]/bee[j+1];
+}

uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row

@@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
{
Coeff_t delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) {
-// assert(fabs(bee[j])>0.0);
+assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j];
}
dee[Ls-1] += delta_d;
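The Cayley coefficient hunks above replace the commented-out fabs() checks with exact comparisons against Coeff_t(0.0) and guard every division by a bee[] entry. A small illustration of why the exact-zero form is preferred when the coefficient type may be complex (Coeff_t here stands in for Grid's real or complex coefficient type; the helper is illustrative only):

  #include <complex>
  #include <cassert>

  template<class Coeff_t>
  Coeff_t safe_ratio(Coeff_t num, Coeff_t den) {
    // fabs() does not apply to std::complex; comparing against Coeff_t(0.0)
    // works for both real and complex coefficient types.
    assert(den != Coeff_t(0.0));
    return num / den;
  }

  // usage: safe_ratio(std::complex<double>(1,2), std::complex<double>(0,3));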
@@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
template<class vobj,class cobj>
class WilsonStencil : public CartesianStencil<vobj,cobj> {
public:
+double timer0;
+double timer1;
+double timer2;
+double timer3;
+double timer4;
+double timer5;
+double timer6;
+uint64_t callsi;
+void ZeroCountersi(void)
+{
+timer0=0;
+timer1=0;
+timer2=0;
+timer3=0;
+timer4=0;
+timer5=0;
+timer6=0;
+callsi=0;
+}
+void Reporti(int calls)
+{
+if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
+if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
+if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
+if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
+if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
+}
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

std::vector<int> same_node;
@@ -252,6 +278,7 @@ public:
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
same_node(npoints)
{
+ZeroCountersi();
surface_list.resize(0);
};

@@ -261,7 +288,6 @@ public:
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
same_node[point] = this->SameNode(point);
-// std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
}

for(int site = 0 ;site< vol4;site++){
@@ -282,17 +308,28 @@ public:
{
std::vector<std::vector<CommsRequest_t> > reqs;
this->HaloExchangeOptGather(source,compress);
-this->CommunicateBegin(reqs);
-this->CommunicateComplete(reqs);
+double t1=usecond();
+// Asynchronous MPI calls multidirectional, Isend etc...
+// this->CommunicateBegin(reqs);
+// this->CommunicateComplete(reqs);
+// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
+this->Communicate();
+double t2=usecond(); timer1 += t2-t1;
this->CommsMerge(compress);
+double t3=usecond(); timer2 += t3-t2;
this->CommsMergeSHM(compress);
+double t4=usecond();  timer3 += t4-t3;
}

template <class compressor>
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
{
this->Prepare();
+double t0=usecond();
this->HaloGatherOpt(source,compress);
+double t1=usecond();
+timer0 += t1-t0;
+callsi++;
}

template <class compressor>
@@ -304,7 +341,9 @@ public:
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;

+this->mpi3synctime_g-=usecond();
this->_grid->StencilBarrier();
+this->mpi3synctime_g+=usecond();

assert(source._grid==this->_grid);
this->halogtime-=usecond();
@@ -323,7 +362,6 @@ public:
int dag = compress.dag;
int face_idx=0;
if ( dag ) {
-// std::cout << " Optimised Dagger compress " <<std::endl;
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
@@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
int vol4;
vol4=FourDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol4);

vol4=FourDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);

-std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-<<" " << StencilEven.surface_list.size()<<std::endl;
+// std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+// <<" " << StencilEven.surface_list.size()<<std::endl;

}

template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
-std::vector<int> latt = GridDefaultLatt();
-RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-RealD NP = _FourDimGrid->_Nprocessors;
-RealD NN = _FourDimGrid->NodeCount();
+RealD NP = _FourDimGrid->_Nprocessors;
+RealD NN = _FourDimGrid->NodeCount();
+RealD volume = Ls;
+std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];

if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
}
+if ( DhopCalls > 0){
+std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()" <<std::endl;  Stencil.Reporti(DhopCalls);
+std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
+std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
+}
}

template<class Impl>
@@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
+Stencil.ZeroCountersi();
+StencilEven.ZeroCountersi();
+StencilOdd.ZeroCountersi();
}

@@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
{
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes));
-typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

Compressor compressor(dag);

@@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg

DhopFaceTime-=usecond();
st.HaloExchangeOptGather(in,compressor);
-DhopFaceTime+=usecond();
-std::vector<std::vector<CommsRequest_t> > reqs;
-
-// Rely on async comms; start comms before merge of local data
-DhopCommTime-=usecond();
-st.CommunicateBegin(reqs);
-
-DhopFaceTime-=usecond();
-st.CommsMergeSHM(compressor);
+st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();

-// Perhaps use omp task and region
-#pragma omp parallel
+double ctime=0;
+double ptime=0;
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// Ugly explicit thread mapping introduced for OPA reasons.
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
{
+int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
-int me = omp_get_thread_num();
-int myoff, mywork;
+int ncomms = CartesianCommunicator::nCommThreads;
+if (ncomms == -1) ncomms = 1;
+assert(nthreads > ncomms);
+if (tid >= ncomms) {
+double start = usecond();
+nthreads -= ncomms;
+int ttid = tid - ncomms;
+int n = U._grid->oSites();
+int chunk = n / nthreads;
+int rem = n % nthreads;
+int myblock, myn;
+if (ttid < rem) {
+myblock = ttid * chunk + ttid;
+myn = chunk+1;
+} else {
+myblock = ttid*chunk + rem;
+myn = chunk;
+}

-GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-int sF = LLs * myoff;
-
-if ( me == 0 ) {
-st.CommunicateComplete(reqs);
-DhopCommTime+=usecond();
-} else {
-// Interior links in stencil
-if ( me==1 ) DhopComputeTime-=usecond();
-if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-if ( me==1 ) DhopComputeTime+=usecond();
+// do the compute
+if (dag == DaggerYes) {
+for (int ss = myblock; ss < myblock+myn; ++ss) {
+int sU = ss;
+int sF = LLs * sU;
+Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+}
+} else {
+for (int ss = myblock; ss < myblock+myn; ++ss) {
+int sU = ss;
+int sF = LLs * sU;
+Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+}
+}
+ptime = usecond() - start;
+}
+{
+double start = usecond();
+st.CommunicateThreaded();
+ctime = usecond() - start;
}
}
+DhopCommTime += ctime;
+DhopComputeTime+=ptime;
+
+// First to enter, last to leave timing
+st.CollateThreads();

DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();

-// Load imbalance alert. Should use dynamic schedule OMP for loop
-// Perhaps create a list of only those sites with face work, and
-// load balance process the list.
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
@@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
#else
assert(0);
#endif

}


template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
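In the reworked DhopInternalOverlappedComms above, the first nCommThreads OpenMP threads drive communications while the remaining threads split the 4d volume into near-equal contiguous blocks. A standalone sketch of the block partition used there (the first rem threads each take one extra site), so the arithmetic can be checked in isolation:

  // Compute the contiguous [myblock, myblock+myn) range of sites owned by
  // compute thread ttid out of nthreads, for n sites in total.
  void my_block(int n, int nthreads, int ttid, int &myblock, int &myn) {
    int chunk = n / nthreads;
    int rem   = n % nthreads;
    if (ttid < rem) { myblock = ttid * chunk + ttid; myn = chunk + 1; }
    else            { myblock = ttid * chunk + rem;  myn = chunk;     }
  }

  // e.g. n=10 sites over 3 compute threads gives blocks [0,4), [4,7), [7,10).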
@@ -26,6 +26,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
//#include <Grid/Grid.h>

+#ifndef GRID_QCD_GAUGE_FIX_H
+#define GRID_QCD_GAUGE_FIX_H
namespace Grid {
namespace QCD {

@@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer : public Gimpl {

}
}
+#endif
@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
// Timing info; ugly; possibly temporary
/////////////////////////////////////////
double commtime;
+double mpi3synctime;
+double mpi3synctime_g;
+double shmmergetime;
double gathertime;
double gathermtime;
double halogtime;
@@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
double splicetime;
double nosplicetime;
double calls;
+std::vector<double> comm_bytes_thr;
+std::vector<double> comm_time_thr;
+std::vector<double> comm_enter_thr;
+std::vector<double> comm_leave_thr;

////////////////////////////////////////
// Stencil query
@@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
//////////////////////////////////////////
// Comms packet queue for asynch thread
//////////////////////////////////////////
+void CommunicateThreaded()
+{
+#ifdef GRID_OMP
+// must be called in parallel region
+int mythread = omp_get_thread_num();
+int nthreads = CartesianCommunicator::nCommThreads;
+#else
+int mythread = 0;
+int nthreads = 1;
+#endif
+if (nthreads == -1) nthreads = 1;
+if (mythread < nthreads) {
+comm_enter_thr[mythread] = usecond();
+for (int i = mythread; i < Packets.size(); i += nthreads) {
+uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+Packets[i].to_rank,
+Packets[i].recv_buf,
+Packets[i].from_rank,
+Packets[i].bytes,i);
+comm_bytes_thr[mythread] += bytes;
+}
+comm_leave_thr[mythread]= usecond();
+comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
+}
+}
+
+void CollateThreads(void)
+{
+int nthreads = CartesianCommunicator::nCommThreads;
+double first=0.0;
+double last =0.0;
+
+for(int t=0;t<nthreads;t++) {
+
+double t0 = comm_enter_thr[t];
+double t1 = comm_leave_thr[t];
+comms_bytes+=comm_bytes_thr[t];
+
+comm_enter_thr[t] = 0.0;
+comm_leave_thr[t] = 0.0;
+comm_time_thr[t] = 0.0;
+comm_bytes_thr[t]=0;
+
+if ( first == 0.0 ) first = t0; // first is t0
+if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
+
+if ( t1 > last ) last = t1; // max time seen
+
+}
+commtime+= last-first;
+}
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
reqs.resize(Packets.size());
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
-Packets[i].bytes);
+Packets[i].bytes,i);
}
}

void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
for(int i=0;i<Packets.size();i++){
-_grid->StencilSendToRecvFromComplete(reqs[i]);
+_grid->StencilSendToRecvFromComplete(reqs[i],i);
}
commtime+=usecond();
}
+void Communicate(void)
+{
+#ifdef GRID_OMP
+#pragma omp parallel
+{
+// must be called in parallel region
+int mythread = omp_get_thread_num();
+int maxthreads= omp_get_max_threads();
+int nthreads = CartesianCommunicator::nCommThreads;
+assert(nthreads <= maxthreads);
+
+if (nthreads == -1) nthreads = 1;
+#else
+int mythread = 0;
+int nthreads = 1;
+#endif
+if (mythread < nthreads) {
+for (int i = mythread; i < Packets.size(); i += nthreads) {
+double start = usecond();
+comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+Packets[i].to_rank,
+Packets[i].recv_buf,
+Packets[i].from_rank,
+Packets[i].bytes,i);
+comm_time_thr[mythread] += usecond() - start;
+}
+}
+#ifdef GRID_OMP
+}
+#endif
+}

template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
std::vector<std::vector<CommsRequest_t> > reqs;
Prepare();
HaloGather(source,compress);
-CommunicateBegin(reqs);
-CommunicateComplete(reqs);
+// Concurrent
+//CommunicateBegin(reqs);
+//CommunicateComplete(reqs);
+// Sequential, possibly threaded
+Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
}
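CommunicateThreaded above stripes the packet list over the comm threads (thread t handles packets t, t+nthreads, ...), and CollateThreads then charges the communication phase with the window from the earliest thread entry to the latest thread exit rather than summing per-thread times. A small standalone sketch of that min-enter/max-leave reduction (illustrative only, not the library routine):

  #include <vector>
  #include <algorithm>

  // Collapse per-thread [enter, leave] time stamps into one wall-clock window,
  // mirroring the first/last logic of CollateThreads.
  double comms_window(const std::vector<double> &enter, const std::vector<double> &leave) {
    double first = 0.0, last = 0.0;
    for (size_t t = 0; t < enter.size(); t++) {
      if (first == 0.0 || (enter[t] > 0.0 && enter[t] < first)) first = enter[t];
      last = std::max(last, leave[t]);
    }
    return last - first;   // elapsed window covered by any comm thread
  }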
@@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
+mpi3synctime_g-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single nodes
+mpi3synctime_g+=usecond();

// conformable(source._grid,_grid);
assert(source._grid==_grid);
@@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
CommsMerge(decompress,Mergers,Decompressions);
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
+mpi3synctime-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single nodes
+mpi3synctime+=usecond();
+shmmergetime-=usecond();
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+shmmergetime+=usecond();
}

template<class decompressor>
@@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
-: _permute_type(npoints), _comm_buf_size(npoints)
+: _permute_type(npoints),
+_comm_buf_size(npoints),
+comm_bytes_thr(npoints),
+comm_enter_thr(npoints),
+comm_leave_thr(npoints),
+comm_time_thr(npoints)
{
face_table_computed=0;
_npoints = npoints;
@@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
void ZeroCounters(void) {
gathertime = 0.;
commtime = 0.;
+mpi3synctime=0.;
+mpi3synctime_g=0.;
+shmmergetime=0.;
+for(int i=0;i<_npoints;i++){
+comm_time_thr[i]=0;
+comm_bytes_thr[i]=0;
+comm_enter_thr[i]=0;
+comm_leave_thr[i]=0;
+}
halogtime = 0.;
mergetime = 0.;
decompresstime = 0.;
@@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
+double t = 0;
+// if comm_time_thr is set they were all done in parallel so take the max
+// but add up the bytes
+int threaded = 0 ;
+for (int i = 0; i < 8; ++i) {
+if ( comm_time_thr[i]>0.0 ) {
+threaded = 1;
+comms_bytes += comm_bytes_thr[i];
+if (t < comm_time_thr[i]) t = comm_time_thr[i];
+}
+}
+if (threaded) commtime += t;

_grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) {
@@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
}
+PRINTIT(mpi3synctime);
+PRINTIT(mpi3synctime_g);
+PRINTIT(shmmergetime);
PRINTIT(splicetime);
PRINTIT(nosplicetime);
}
@@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv)
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
}

+if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
+CartesianCommunicator::Hugepages = 1;
+}
+
+
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init();
}
@@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<"  --threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<"  --grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<"  --shm  M : allocate M megabytes of shared memory for comms"<<std::endl;
+std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
@@ -317,7 +323,7 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
@@ -356,10 +362,15 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
}

if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
}
+CartesianCommunicator::nCommThreads = -1;
+if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
+arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
+GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
+}
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
@@ -374,7 +385,10 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt,
Grid_default_mpi);

-std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+if ( CartesianCommunicator::Hugepages) {
+std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
+}
+
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
@@ -393,7 +407,7 @@ void Grid_init(int *argc,char ***argv)

void Grid_finalize(void)
{
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
MPI_Finalize();
Grid_unquiesce_nodes();
#endif
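Taken together, Grid_init now understands --shm-hugepages and --comms-threads n alongside the existing --shm, --comms-overlap and --comms-concurrent/--comms-sequential switches. A hedged example invocation (the binary name, rank layout and values are placeholders, not taken from this commit):

  mpirun -n 2 ./your_grid_app --mpi 2.1.1.1 --threads 8 --comms-threads 2 --comms-overlap --shm 1024 --shm-hugepages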
@@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/Grid.h>

+using namespace Grid;
+using namespace Grid::QCD;
+
int main (int argc, char ** argv)
{
std::vector<int> seeds({1,2,3,4});
@@ -82,6 +85,7 @@ int main (int argc, char ** argv)

Uorg = Uorg - Umu;
std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
+std::cout << " Norm "<< norm2(Umu) << std::endl;

std::cout<< "*****************************************************************" <<std::endl;