
Merge branch 'develop' into feature/json-fix

Guido Cossu 2017-09-08 13:42:20 +01:00
commit 13fa70ac1a
40 changed files with 2426 additions and 563 deletions


@ -9,68 +9,6 @@ matrix:
- os: osx
osx_image: xcode8.3
compiler: clang
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-4.9
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-5
- compiler: clang
dist: trusty
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang
dist: trusty
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install:
- export GRIDDIR=`pwd`
@ -106,9 +44,3 @@ script:
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- echo make clean
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi


@ -1,18 +1,4 @@
# Grid
# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
<table>
<tr>
<td>Last stable release</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
</td>
</tr>
<tr>
<td>Development branch</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
</td>
</tr>
</table>
**Data parallel C++ mathematical object library.**
@ -324,6 +310,13 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
```
This is the default.
#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping:
mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
TBA
### Build setup for AMD EPYC / RYZEN
The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
@ -378,6 +371,14 @@ echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
$@
```
Performance:
#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping:
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
TBA
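For orientation only, a minimal sketch of what an `omp_bind.sh`-style NUMA-binding wrapper (whose tail appears in the diff fragment above) might look like is given below. The OpenMPI rank variable, the eight-NUMA-node layout and the cores-per-node count are assumptions, not the repository's actual script.

```bash
#!/bin/bash
# Hypothetical NUMA-binding wrapper (sketch, not the repository's omp_bind.sh).
# Assumes OpenMPI (OMPI_COMM_WORLD_LOCAL_RANK) and 8 NUMA domains of 8 cores
# each on a dual-socket EPYC 7601 node; adjust to the real topology/launcher.
numanode=$(( OMPI_COMM_WORLD_LOCAL_RANK % 8 ))
cores=8
low=$(( numanode * cores ))
high=$(( low + cores - 1 ))

export GOMP_CPU_AFFINITY="${low}-${high}"
echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY

# Bind both execution and memory allocation to the chosen NUMA node,
# then run the wrapped command (e.g. Benchmark_dwf) with its arguments.
exec numactl --cpunodebind=$numanode --membind=$numanode "$@"
```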
### Build setup for BlueGene/Q
To be written...

TODO

@ -2,18 +2,20 @@ TODO:
---------------
Large item work list:
1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
1)- BG/Q port and check
2)- Christoph's local basis expansion Lanczos
3)- BG/Q port and check
3)- Precision conversion and sort out localConvert <-- partial
4)- Precision conversion and sort out localConvert <-- partial
- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
5)- Physical propagator interface
4)- Physical propagator interface
6)- Conserved currents
5)- Conserved currents
7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
8)- HDCR resume
7)- HDCR resume
Recent DONE
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
-- GaugeFix into central location <-- DONE
-- Scidac and Ildg metadata handling <-- DONE

benchmarks/Benchmark_ITT.cc (new file)

@ -0,0 +1,797 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_ITT.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
struct time_statistics{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v){
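// Compute the mean, the standard error of the mean (err), and the min/max of the timing sample.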
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void comms_header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
struct controls {
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
// int HugePages;
};
class Benchmark {
public:
static void Decomposition (void ) {
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
static void Comms(void)
{
int Nloop=200;
int nmu=0;
int maxlat=32;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
comms_header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
int ncomm;
double dbytes;
std::vector<double> times(Nloop);
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
parallel_for(int dir=0;dir<8;dir++){
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
int xmit_to_rank;
int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank,
bytes,dir);
#ifdef GRID_OMP
#pragma omp atomic
#endif
ncomm++;
#ifdef GRID_OMP
#pragma omp atomic
#endif
dbytes+=tbytes;
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
// for(int i=0;i<t_time.size();i++){
// std::cout << i<<" "<<t_time[i]<<std::endl;
// }
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
return;
}
static void Memory(void)
{
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t NP;
uint64_t NN;
uint64_t lmax=48;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
NP= Grid.RankCount();
NN =Grid.NodeCount();
Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=rn;
LatticeVec x(&Grid); x=rn;
LatticeVec y(&Grid); y=rn;
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
y._odata[4]=z._odata[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
}
};
static double DWF5(int Ls,int L)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
std::vector<int> local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
std::vector<int> internal;
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
else assert(0);
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (sFGrid); random(RNG5,src);
LatticeFermion tmp (sFGrid);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
LatticeFermion src_e (sFrbGrid);
LatticeFermion src_o (sFrbGrid);
LatticeFermion r_e (sFrbGrid);
LatticeFermion r_o (sFrbGrid);
LatticeFermion r_eo (sFGrid);
LatticeFermion err (sFGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
controls Cases [] = {
#ifdef AVX512
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 100;
uint64_t ncall = 1000;
double t0=usecond();
sFGrid->Barrier();
for(int i=0;i<nwarm;i++){
sDw.DhopEO(src_o,r_e,DaggerNo);
}
sFGrid->Barrier();
double t1=usecond();
sDw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
sDw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
sFGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
sDw.Report();
}
double robust = mflops_worst/mflops_best;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
static double DWF(int Ls,int L, double & robust)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
std::vector<int> local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
std::vector<int> internal;
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
else assert(0);
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (FGrid); random(RNG5,src);
LatticeFermion ref (FGrid);
LatticeFermion tmp (FGrid);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
{
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
ref = zero;
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
LatticeFermion err (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
controls Cases [] = {
#ifdef AVX512
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 200;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
// if (ncall < 500) ncall = 500;
uint64_t ncall = 1000;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
Dw.Report();
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-ref;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert((norm2(err)<1.0e-4));
}
robust = mflops_worst/mflops_best;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
Benchmark::Decomposition();
int do_memory=1;
int do_comms =1;
int do_su3 =0;
int do_wilson=1;
int do_dwf =1;
if ( do_su3 ) {
// empty for now
}
int sel=2;
std::vector<int> L_list({8,12,16,24});
//int sel=1;
// std::vector<int> L_list({8,12});
std::vector<double> robust_list;
std::vector<double> wilson;
std::vector<double> dwf4;
std::vector<double> dwf5;
if ( do_wilson ) {
int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double robust;
wilson.push_back(Benchmark::DWF(1,L_list[l],robust));
}
}
int Ls=16;
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double robust;
double result = Benchmark::DWF(Ls,L_list[l],robust) ;
dwf4.push_back(result);
robust_list.push_back(robust);
}
}
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
}
}
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
int NN=NN_global;
if ( do_memory ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Memory();
}
if ( do_comms && (NN>1) ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Comms();
}
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 \t\t DWF5 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << dwf4[sel]/NN << " Mflop/s per node"<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
Grid_finalize();
}
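As a usage note (not part of the commit), the new benchmark is driven through the standard Grid command line; a hypothetical single-node launch might look like the sketch below, where the rank count, the `--mpi` decomposition and the thread count are placeholders to be matched to the machine.

```bash
# Hypothetical invocation of the new ITT benchmark: 2 ranks on one node,
# a 2.1.1.1 MPI decomposition and 8 OpenMP threads per rank (all placeholders).
mpirun -n 2 ./benchmarks/Benchmark_ITT --mpi 2.1.1.1 --threads 8 --shm 1024
```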


@ -68,7 +68,7 @@ int main (int argc, char ** argv)
int Nloop=100;
int nmu=0;
int maxlat=24;
int maxlat=32;
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@ -80,7 +80,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -92,11 +92,16 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
}
for(int i=0;i<Nloop;i++){
double start=usecond();
@ -112,7 +117,6 @@ int main (int argc, char ** argv)
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
@ -163,7 +167,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
@ -172,9 +176,14 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@ -249,7 +258,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -299,7 +308,7 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
bytes,mu);
comm_proc = mpi_layout[mu]-1;
@ -310,11 +319,11 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests);
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
@ -346,7 +355,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -393,8 +402,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
bytes,mu);
Grid.StencilSendToRecvFromComplete(requests);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
comm_proc = mpi_layout[mu]-1;
@ -406,8 +415,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
}
@ -436,5 +445,97 @@ int main (int argc, char ** argv)
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
std::vector<CartesianCommunicator::CommsRequest_t> requests;
dbytes=0;
ncomm=0;
parallel_for(int dir=0;dir<8;dir++){
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
ncomm++;
int xmit_to_rank;
int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
#pragma omp atomic
dbytes+=tbytes;
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
Grid_finalize();
}


@ -503,9 +503,9 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl; std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl; std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
//assert(norm2(src_e)<1.0e-4); assert(norm2(src_e)<1.0e-4);
//assert(norm2(src_o)<1.0e-4); assert(norm2(src_o)<1.0e-4);
Grid_finalize(); Grid_finalize();
exit(0);
}


@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
################ Get git info
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
################ Set flags
# do not move!
CXXFLAGS="-O3 $CXXFLAGS"
############### Checks for programs
AC_PROG_CXX
AC_PROG_RANLIB
@ -27,7 +31,6 @@ AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
[version of g++ that will compile the code])
CXXFLAGS="-O3 $CXXFLAGS"
############### Checks for typedefs, structures, and compiler characteristics
@ -55,6 +58,10 @@ AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
############## Standard libraries
AC_CHECK_LIB([m],[cos])
AC_CHECK_LIB([stdc++],[abort])
############### GMP and MPFR
AC_ARG_WITH([gmp],
[AS_HELP_STRING([--with-gmp=prefix],
@ -324,8 +331,41 @@ case ${ac_PRECISION} in
double)
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
;;
*)
AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
;;
esac
###################### Shared memory allocation technique under MPI3
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
case ${ac_SHM} in
shmget)
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
;;
shmopen)
AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
;;
hugetlbfs)
AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
;;
*)
AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
;;
esac
###################### Shared base path for SHMMMAP
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
[Select SHM mmap base path for hugetlbfs])],
[ac_SHMPATH=${enable_shmpath}],
[ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
@ -335,14 +375,14 @@ case ${ac_COMMS} in
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
comms_type='none'
;;
mpi3l*)
AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
comms_type='mpi3l'
;;
mpi3*)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
comms_type='mpi3'
;;
mpit)
AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
comms_type='mpit'
;;
mpi*)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
comms_type='mpi'
@ -370,7 +410,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
AM_CONDITIONAL(BUILD_COMMS_MPIT, [ test "${comms_type}X" == "mpitX" ] )
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
############### RNG selection
@ -475,6 +515,8 @@ compiler version : ${ax_cv_gxx_version}
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
Communications type : ${comms_type}
Shared memory allocator : ${ac_SHM}
Shared memory mmap path : ${ac_SHMPATH}
Default precision : ${ac_PRECISION}
Software FP16 conversion : ${ac_SFW_FP16}
RNG choice : ${ac_RNG}
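To illustrate how the new switches added above combine with the existing ones, a hypothetical configure invocation might look like the sketch below; the SIMD target, compiler and paths are placeholders, not a recommended configuration.

```bash
# Hypothetical configure line exercising the new --enable-shm / --enable-shmpath
# options and the new 'mpit' communications backend (all values are placeholders).
../configure --enable-precision=single \
             --enable-simd=KNL \
             --enable-comms=mpit \
             --enable-shm=hugetlbfs \
             --enable-shmpath=/var/lib/hugetlbfs/pagesize-2MB/ \
             CXX=mpicxx
```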


@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_MPI3L
if BUILD_COMMS_MPIT
extra_sources+=communicator/Communicator_mpi3_leader.cc
extra_sources+=communicator/Communicator_mpit.cc
extra_sources+=communicator/Communicator_base.cc
endif


@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceInnerProductMatrix(m_rr,R,R,Orthog);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Force manifest hermitian to avoid rounding related
// Cholesky from Eigen
m_rr = 0.5*(m_rr+m_rr.adjoint());
// There exists a ldlt that is documented as more stable
////////////////////////////////////////////////////////////////////////////////////////////////////
Eigen::MatrixXcd L = m_rr.llt().matrixL();
#if 0
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
auto D_ldlt = m_rr.ldlt().vectorD();
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
#endif
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
// FIXME:: make a sliceMulMatrix to avoid zero vector
sliceMulMatrix(Q,Cinv,R,Orthog);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@ -221,13 +232,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();


@ -1,7 +1,5 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
namespace Grid {
@ -11,7 +9,7 @@ int PointerCache::victim;
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return NULL;
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
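// A 2MB huge page spans 512 contiguous 4k frames: within each 512-page block every
// page should map to baseaddr + j*page_size; count the 4k pages that break this pattern.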
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
}


@ -64,6 +64,8 @@ namespace Grid {
};
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
@ -92,12 +94,20 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
//////////////////
// Hack 2MB align; could make option probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else #else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif #endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop // First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr; uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP #ifdef GRID_OMP
@ -111,6 +121,7 @@ public:
void deallocate(pointer __p, size_type __n) { void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
@ -189,16 +200,18 @@ public:
pointer allocate(size_type __n, const void* _p= 0) pointer allocate(size_type __n, const void* _p= 0)
{ {
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
#else #else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
#endif #endif
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
uint8_t *cp = (uint8_t *)ptr; uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order // One touch per 4k page, static OMP loop to catch same loop order
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for(size_type n=0;n<bytes;n+=4096){ for(size_type n=0;n<bytes;n+=4096){
cp[n]=0; cp[n]=0;
}
} }
return ptr; return ptr;
} }
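The allocator change raises the alignment to 2MB (GRID_ALLOC_ALIGN) and keeps the first-touch loop. A minimal sketch of the same pattern outside the allocator class, assuming posix_memalign and OpenMP are available:

```
#include <cstdlib>
#include <cstdint>

// Allocate 'bytes' on a 2MB boundary and first-touch one byte per 4k page
// from a static OpenMP loop, so pages land on the NUMA domain of the
// threads that will later use them. Sketch only; no cache layer as in Grid.
void *alloc_2mb_first_touch(std::size_t bytes) {
  void *ptr = nullptr;
  if (posix_memalign(&ptr, 2 * 1024 * 1024, bytes) != 0) return nullptr;
  uint8_t *cp = static_cast<uint8_t *>(ptr);
#pragma omp parallel for schedule(static)
  for (std::size_t n = 0; n < bytes; n += 4096) cp[n] = 0;
  return ptr;
}
```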

View File

@ -185,17 +185,18 @@ public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
void show_decomposition(){ void show_decomposition(){
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl; std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl; std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl; std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl; std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl; std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl; std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "iSites : " << _isites << std::endl; std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "oSites : " << _osites << std::endl; std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl; std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl; std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl; std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
} }
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////

View File

@ -62,77 +62,81 @@ public:
return shift; return shift;
} }
GridCartesian(const std::vector<int> &dimensions, GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout, const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid const std::vector<int> &processor_grid) : GridBase(processor_grid)
) : GridBase(processor_grid)
{ {
/////////////////////// ///////////////////////
// Grid information // Grid information
/////////////////////// ///////////////////////
_ndimension = dimensions.size(); _ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
for(int d=0;d<_ndimension;d++){ _fdimensions.resize(_ndimension);
_fdimensions[d] = dimensions[d]; // Global dimensions _gdimensions.resize(_ndimension);
_gdimensions[d] = _fdimensions[d]; // Global dimensions _ldimensions.resize(_ndimension);
_simd_layout[d] = simd_layout[d]; _rdimensions.resize(_ndimension);
_fsites = _fsites * _fdimensions[d]; _simd_layout.resize(_ndimension);
_gsites = _gsites * _gdimensions[d]; _lstart.resize(_ndimension);
_lend.resize(_ndimension);
//FIXME check for exact division _ostride.resize(_ndimension);
_istride.resize(_ndimension);
// Use a reduced simd grid _fsites = _gsites = _osites = _isites = 1;
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition for (int d = 0; d < _ndimension; d++)
_lstart[d] = _processor_coor[d]*_ldimensions[d]; {
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; _fdimensions[d] = dimensions[d]; // Global dimensions
_osites *= _rdimensions[d]; _gdimensions[d] = _fdimensions[d]; // Global dimensions
_isites *= _simd_layout[d]; _simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
// Addressing support _gsites = _gsites * _gdimensions[d];
if ( d==0 ) {
_ostride[d] = 1; // Use a reduced simd grid
_istride[d] = 1; _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
} else { assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1]; _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
} assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
} }
else
/////////////////////// {
// subplane information _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
/////////////////////// _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
} }
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
}; };
}; };
} }
#endif #endif
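The rewritten constructor now asserts that the processor grid and SIMD layout divide the global dimensions exactly. The arithmetic being checked, as a standalone sketch:

```
#include <cassert>
#include <vector>

// Per dimension: global -> per-rank (local) -> per-SIMD-lane (reduced),
// with the exact-division requirements the constructor now enforces.
void decompose(const std::vector<int> &gdim, const std::vector<int> &procs,
               const std::vector<int> &simd,
               std::vector<int> &ldim, std::vector<int> &rdim) {
  const int nd = (int)gdim.size();
  ldim.resize(nd);
  rdim.resize(nd);
  for (int d = 0; d < nd; d++) {
    ldim[d] = gdim[d] / procs[d];
    assert(ldim[d] * procs[d] == gdim[d]);  // ranks divide global extent
    rdim[d] = ldim[d] / simd[d];
    assert(rdim[d] * simd[d] == ldim[d]);   // SIMD lanes divide local extent
  }
}
```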

View File

@ -131,21 +131,21 @@ public:
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0); Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
} }
void Init(const std::vector<int> &dimensions, void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout, const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid, const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask, const std::vector<int> &checker_dim_mask,
int checker_dim) int checker_dim)
{ {
/////////////////////// ///////////////////////
// Grid information // Grid information
/////////////////////// ///////////////////////
_checker_dim = checker_dim; _checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim]==1); assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size(); _ndimension = dimensions.size();
assert(checker_dim_mask.size()==_ndimension); assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size()==_ndimension); assert(processor_grid.size() == _ndimension);
assert(simd_layout.size()==_ndimension); assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension); _fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension); _gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension); _ldimensions.resize(_ndimension);
@ -153,114 +153,133 @@ public:
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
_ostride.resize(_ndimension); _ostride.resize(_ndimension);
_istride.resize(_ndimension); _istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1; _fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask=checker_dim_mask;
for(int d=0;d<_ndimension;d++){ _checker_dim_mask = checker_dim_mask;
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
if (d==_checker_dim) {
_gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
}
_ldimensions[d] = _gdimensions[d]/_processors[d];
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
// Use a reduced simd grid for (int d = 0; d < _ndimension; d++)
_simd_layout[d] = simd_layout[d]; {
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; _fdimensions[d] = dimensions[d];
assert(_rdimensions[d]>0); _gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// all elements of a simd vector must have same checkerboard. if (d == _checker_dim)
// If Ls vectorised, this must still be the case; e.g. dwf rb5d {
if ( _simd_layout[d]>1 ) { assert((_gdimensions[d] & 0x1) == 0);
if ( checker_dim_mask[d] ) { _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
assert( (_rdimensions[d]&0x1) == 0 ); }
} _ldimensions[d] = _gdimensions[d] / _processors[d];
} assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d]; // Use a reduced simd grid
_isites *= _simd_layout[d]; _simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
// Addressing support assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
if ( d==0 ) { assert(_rdimensions[d] > 0);
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// subplane information // subplane information
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension); _slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension); _slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension); _slice_nblock.resize(_ndimension);
int block =1; int block = 1;
int nblock=1; int nblock = 1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d]; for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d]; for (int d = 0; d < _ndimension; d++)
_slice_block[d] =block; {
_slice_stride[d]=_ostride[d]*_rdimensions[d]; nblock /= _rdimensions[d];
_slice_nblock[d]=nblock; _slice_block[d] = block;
block = block*_rdimensions[d]; _slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
} }
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Create a checkerboard lookup table // Create a checkerboard lookup table
//////////////////////////////////////////////// ////////////////////////////////////////////////
int rvol = 1; int rvol = 1;
for(int d=0;d<_ndimension;d++){ for (int d = 0; d < _ndimension; d++)
rvol=rvol * _rdimensions[d]; {
rvol = rvol * _rdimensions[d];
} }
_checker_board.resize(rvol); _checker_board.resize(rvol);
for(int osite=0;osite<_osites;osite++){ for (int osite = 0; osite < _osites; osite++)
_checker_board[osite] = CheckerBoardFromOindex (osite); {
_checker_board[osite] = CheckerBoardFromOindex(osite);
} }
}; };
protected:
protected:
virtual int oIndex(std::vector<int> &coor) virtual int oIndex(std::vector<int> &coor)
{ {
int idx=0; int idx = 0;
for(int d=0;d<_ndimension;d++) { for (int d = 0; d < _ndimension; d++)
if( d==_checker_dim ) { {
idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]); if (d == _checker_dim)
} else { {
idx+=_ostride[d]*(coor[d]%_rdimensions[d]); idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
} }
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
} }
return idx; return idx;
}; };
virtual int iIndex(std::vector<int> &lcoor) virtual int iIndex(std::vector<int> &lcoor)
{ {
int idx=0; int idx = 0;
for(int d=0;d<_ndimension;d++) { for (int d = 0; d < _ndimension; d++)
if( d==_checker_dim ) { {
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d])); if (d == _checker_dim)
} else { {
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
} }
} else
return idx; {
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
} }
}; };
} }
#endif #endif
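On the red-black grid the checkerboarded dimension keeps only half the sites, which is why oIndex halves that coordinate before the usual stride arithmetic. A compact sketch of that indexing rule:

```
#include <vector>

// Outer (oSite) index of a full coordinate on a red-black grid: the
// checkered dimension is stored at half density, so its coordinate is
// halved before the stride/modulo step applied to every other dimension.
int rb_oindex(const std::vector<int> &coor, const std::vector<int> &ostride,
              const std::vector<int> &rdim, int checker_dim) {
  int idx = 0;
  for (int d = 0; d < (int)coor.size(); d++) {
    const int c = (d == checker_dim) ? coor[d] / 2 : coor[d];
    idx += ostride[d] * (c % rdim[d]);
  }
  return idx;
}
```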

View File

@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>
namespace Grid { namespace Grid {
@ -33,8 +37,11 @@ namespace Grid {
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * CartesianCommunicator::ShmCommBuf; void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
int CartesianCommunicator::Hugepages = 0;
///////////////////////////////// /////////////////////////////////
// Alloc, free shmem region // Alloc, free shmem region
@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);
} }
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) #if !defined( GRID_COMMS_MPI3)
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();}; int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
int CartesianCommunicator::RankCount(void) { return ProcessorCount();}; int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
#endif
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,
int recv_from_rank, int recv_from_rank,
int bytes) int bytes, int dir)
{ {
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {
SendToRecvFromComplete(waitall); SendToRecvFromComplete(waitall);
} }
#endif
#if !defined( GRID_COMMS_MPI3)
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@ -121,8 +146,25 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL; return NULL;
} }
void CartesianCommunicator::ShmInitGeneric(void){ void CartesianCommunicator::ShmInitGeneric(void){
#if 1
int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
#endif
#else
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0]; ShmCommBuf=(void *)&ShmBufStorageVector[0];
#endif
bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
} }
#endif #endif
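ShmInitGeneric now maps the communication buffer anonymously, optionally with explicit huge pages, and otherwise hints transparent huge pages. A hedged standalone sketch of that allocation path (Linux-specific flags guarded by the same #ifdefs):

```
#include <sys/mman.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Anonymous shared mapping for a comms buffer; MAP_HUGETLB requests
// explicit huge pages, MADV_HUGEPAGE asks for transparent huge pages
// when explicit ones were not requested. Sketch only.
void *alloc_shm_buf(size_t bytes, int use_hugepages) {
  int flags = MAP_SHARED | MAP_ANONYMOUS;
#ifdef MAP_HUGETLB
  if (use_hugepages) flags |= MAP_HUGETLB;
#endif
  void *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE, flags, -1, 0);
  if (p == MAP_FAILED) { perror("mmap failed"); exit(EXIT_FAILURE); }
#ifdef MADV_HUGEPAGE
  if (!use_hugepages) madvise(p, bytes, MADV_HUGEPAGE);
#endif
  memset(p, 0, bytes);  // zero the buffer, as bzero does in the diff
  return p;
}
```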

View File

@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI3 #ifdef GRID_COMMS_MPI3
#include <mpi.h> #include <mpi.h>
#endif #endif
#ifdef GRID_COMMS_MPI3L #ifdef GRID_COMMS_MPIT
#include <mpi.h> #include <mpi.h>
#endif #endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
@ -50,12 +50,24 @@ namespace Grid {
class CartesianCommunicator { class CartesianCommunicator {
public: public:
// 65536 ranks per node adequate for now
////////////////////////////////////////////
// Isend/Irecv/Wait, or Sendrecv blocking
////////////////////////////////////////////
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
///////////////////////////////////////////
// Up to 65536 ranks per node adequate for now
// 128MB shared memory for comms enough for 48^4 local vol comms // 128MB shared memory for comms enough for 48^4 local vol comms
// Give external control (command line override?) of this // Give external control (command line override?) of this
///////////////////////////////////////////
static const int MAXLOG2RANKSPERNODE = 16; static const int MAXLOG2RANKSPERNODE = 16;
static uint64_t MAX_MPI_SHM_BYTES; static uint64_t MAX_MPI_SHM_BYTES;
static int nCommThreads;
// use explicit huge pages
static int Hugepages;
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
@ -64,14 +76,18 @@ class CartesianCommunicator {
std::vector<int> _processor_coor; // linear processor coordinate std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension; unsigned long _ndimension;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L) #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
static MPI_Comm communicator_world; static MPI_Comm communicator_world;
MPI_Comm communicator;
MPI_Comm communicator;
std::vector<MPI_Comm> communicator_halo;
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
typedef int CommsRequest_t; typedef int CommsRequest_t;
#endif #endif
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls // Helper functionality for SHM Windows common to all other impls
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
@ -117,11 +133,7 @@ class CartesianCommunicator {
///////////////////////////////// /////////////////////////////////
static void * ShmCommBuf; static void * ShmCommBuf;
// Isend/Irecv/Wait, or Sendrecv blocking
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
size_t heap_top; size_t heap_top;
size_t heap_bytes; size_t heap_bytes;
@ -211,14 +223,21 @@ class CartesianCommunicator {
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,
int recv_from_rank, int recv_from_rank,
int bytes); int bytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void); void StencilBarrier(void);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////

View File

@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef HAVE_NUMAIF_H #ifdef HAVE_NUMAIF_H
#include <numaif.h> #include <numaif.h>
#endif #endif
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
namespace Grid { namespace Grid {
@ -200,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmCommBuf = 0; ShmCommBuf = 0;
ShmCommBufs.resize(ShmSize); ShmCommBufs.resize(ShmSize);
#if 1 ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
char shm_name [NAME_MAX];
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
//sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
// printf("Opening file %s \n",shm_name);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
// the posix shm virtual file system
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMOPEN
char shm_name [NAME_MAX]; char shm_name [NAME_MAX];
if ( ShmRank == 0 ) { if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){ for(int r=0;r<ShmSize;r++){
@ -213,13 +250,22 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); } if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size); ftruncate(fd, size);
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (Hugepages) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
// Try to force numa domain on the shm segment if we have numaif.h // Experiments; Try to force numa domain on the shm segment if we have numaif.h #if 0
#ifdef HAVE_NUMAIF_H #if 0
//#ifdef HAVE_NUMAIF_H
int status; int status;
int flags=MPOL_MF_MOVE; int flags=MPOL_MF_MOVE;
#ifdef KNL #ifdef KNL
@ -236,7 +282,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if (ierr && (page==0)) perror("numa relocate command failed"); if (ierr && (page==0)) perror("numa relocate command failed");
} }
#endif #endif
ShmCommBufs[r] =ptr; ShmCommBufs[r] =ptr;
} }
} }
@ -258,21 +304,32 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmCommBufs[r] =ptr; ShmCommBufs[r] =ptr;
} }
} }
#endif
#else ////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET SHMAT and SHM_HUGETLB flag
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
std::vector<int> shmids(ShmSize); std::vector<int> shmids(ShmSize);
if ( ShmRank == 0 ) { if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){ for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
key_t key = 0x4545 + r; key_t key = IPC_PRIVATE;
if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno; int errsv = errno;
printf("Errno %d\n",errsv); printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("flags %d\n",flags);
perror("shmget"); perror("shmget");
exit(1); exit(1);
} else {
printf("shmid: 0x%x\n", shmids[r]);
} }
printf("shmid: 0x%x\n", shmids[r]);
} }
} }
MPI_Barrier(ShmComm); MPI_Barrier(ShmComm);
@ -397,8 +454,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
int ierr; int ierr;
communicator=communicator_world; communicator=communicator_world;
_ndimension = processors.size(); _ndimension = processors.size();
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Assert power of two shm_size. // Assert power of two shm_size.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@ -621,13 +684,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
} }
} }
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
void *xmit, int dest,
int dest, void *recv,
void *recv, int from,
int from, int bytes,int dir)
int bytes)
{ {
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
assert(dir < communicator_halo.size());
MPI_Request xrq; MPI_Request xrq;
MPI_Request rrq; MPI_Request rrq;
@ -646,26 +723,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
gfrom = MPI_UNDEFINED; gfrom = MPI_UNDEFINED;
#endif #endif
if ( gfrom ==MPI_UNDEFINED) { if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
assert(ierr==0); assert(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} }
if ( gdest == MPI_UNDEFINED ) { if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} }
if ( CommunicatorPolicy == CommunicatorPolicySequential ) { if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list); this->StencilSendToRecvFromComplete(list,dir);
} }
return off_node_bytes; return off_node_bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {
SendToRecvFromComplete(waitall); SendToRecvFromComplete(waitall);
} }
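The SysV branch (GRID_MPI3_SHMGET) now uses IPC_PRIVATE keys and makes SHM_HUGETLB conditional on the new Hugepages flag. A hedged sketch of that segment creation (sharing the id with other ranks and shmctl cleanup are omitted):

```
#include <sys/ipc.h>
#include <sys/shm.h>
#include <cstdio>

// Create and attach one SysV shared-memory segment, optionally backed by
// explicit huge pages. Returns nullptr on failure. Sketch only.
void *alloc_sysv_segment(size_t bytes, int use_hugepages) {
  int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
  if (use_hugepages) flags |= SHM_HUGETLB;
#endif
  int id = shmget(IPC_PRIVATE, bytes, flags);
  if (id == -1) { perror("shmget"); return nullptr; }
  void *p = shmat(id, nullptr, 0);
  if (p == (void *)-1) { perror("shmat"); return nullptr; }
  return p;
}
```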

View File

@ -0,0 +1,286 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
int Size;
MPI_Comm_size(communicator,&Size);
assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
assert(dir < communicator_halo.size());
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
list.push_back(req[0]);
list.push_back(req[1]);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
int nreq=waitall.size();
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
};
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
assert(dir < communicator_halo.size());
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[dir],&req[0]);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
return 2.0*bytes;
}
}
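The new MPIT communicator implements the stencil primitives directly with Isend/Irecv on the per-direction communicator followed by a Waitall. The core pattern, stripped of Grid's classes:

```
#include <mpi.h>

// Post both sides of one halo exchange on a dedicated communicator and
// wait; returns the 2*bytes figure the callers use for bandwidth accounting.
double halo_sendrecv(void *xmit, int dest, void *recv, int from,
                     int bytes, MPI_Comm halo_comm, int myrank) {
  MPI_Request req[2];
  MPI_Irecv(recv, bytes, MPI_CHAR, from, from,   halo_comm, &req[1]);
  MPI_Isend(xmit, bytes, MPI_CHAR, dest, myrank, halo_comm, &req[0]);
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
  return 2.0 * bytes;
}
```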

View File

@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h> #include <Grid/cshift/Cshift_mpi.h>
#endif #endif
#ifdef GRID_COMMS_MPI3L #ifdef GRID_COMMS_MPIT
#include <Grid/cshift/Cshift_mpi.h> #include <Grid/cshift/Cshift_mpi.h>
#endif #endif

View File

@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
} }
}; };
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{ {
int NN = BlockSolverGrid->_ndimension; int NN = BlockSolverGrid->_ndimension;
@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
} }
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
} }
*/
template<class vobj> template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int Nblock = X._grid->GlobalDimensions()[Orthog]; int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid; GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X._grid->GlobalDimensions()[Orthog]; int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid; GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid; GridBase *FullGrid = lhs._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog]; int Nblock = FullGrid->GlobalDimensions()[Orthog];
Lattice<vobj> Lslice(SliceGrid); // Lattice<vobj> Lslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -550,6 +554,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat += mat_thread; mat += mat_thread;
} }
} }
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return; return;
} }
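sliceInnerProductMatrix now finishes with an element-by-element GlobalSum so every rank holds the full Nblock x Nblock inner-product matrix. A hedged MPI sketch of the same reduction done in one call, treating the complex entries as interleaved doubles:

```
#include <mpi.h>
#include <Eigen/Dense>

// All-reduce an Eigen complex matrix in place across 'comm'. Equivalent to
// the per-element GlobalSum loop in the diff, but batched. Sketch only.
void global_sum_matrix(Eigen::MatrixXcd &mat, MPI_Comm comm) {
  MPI_Allreduce(MPI_IN_PLACE, mat.data(),
                (int)(2 * mat.size()),   // re/im pairs as doubles
                MPI_DOUBLE, MPI_SUM, comm);
}
```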

View File

@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) { void Grid_quiesce_nodes(void) {
int me = 0; int me = 0;
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
MPI_Comm_rank(MPI_COMM_WORLD, &me); MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif #endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM

View File

@ -29,7 +29,7 @@
#ifndef GRID_BINARY_IO_H #ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H #define GRID_BINARY_IO_H
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO #define USE_MPI_IO
#else #else
#undef USE_MPI_IO #undef USE_MPI_IO
@ -98,35 +98,39 @@ class BinaryIO {
NerscChecksum(grid,scalardata,nersc_csum); NerscChecksum(grid,scalardata,nersc_csum);
} }
template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum) template <class fobj>
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
{ {
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
uint64_t lsites = grid->lSites();
uint64_t lsites =grid->lSites(); if (fbuf.size() == 1)
if (fbuf.size()==1) { {
lsites=1; lsites = 1;
} }
#pragma omp parallel #pragma omp parallel
{ {
uint32_t nersc_csum_thr=0; uint32_t nersc_csum_thr = 0;
#pragma omp for #pragma omp for
for(uint64_t local_site=0;local_site<lsites;local_site++){ for (uint64_t local_site = 0; local_site < lsites; local_site++)
uint32_t * site_buf = (uint32_t *)&fbuf[local_site]; {
for(uint64_t j=0;j<size32;j++){ uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
nersc_csum_thr=nersc_csum_thr+site_buf[j]; for (uint64_t j = 0; j < size32; j++)
} {
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
} }
#pragma omp critical #pragma omp critical
{ {
nersc_csum += nersc_csum_thr; nersc_csum += nersc_csum_thr;
} }
} }
} }
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{ {
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@ -266,7 +270,7 @@ class BinaryIO {
grid->Barrier(); grid->Barrier();
GridStopWatch timer; GridStopWatch timer;
GridStopWatch bstimer; GridStopWatch bstimer;
nersc_csum=0; nersc_csum=0;
scidac_csuma=0; scidac_csuma=0;
scidac_csumb=0; scidac_csumb=0;
@ -362,18 +366,22 @@ class BinaryIO {
#else #else
assert(0); assert(0);
#endif #endif
} else { } else {
std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : " std::cout << GridLogMessage << "C++ read I/O " << file << " : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl; << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin; std::ifstream fin;
fin.open(file,std::ios::binary|std::ios::in); fin.open(file, std::ios::binary | std::ios::in);
if ( control & BINARYIO_MASTER_APPEND ) { if (control & BINARYIO_MASTER_APPEND)
fin.seekg(-sizeof(fobj),fin.end); {
} else { fin.seekg(-sizeof(fobj), fin.end);
fin.seekg(offset+myrank*lsites*sizeof(fobj)); }
} else
fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0); {
fin.close(); fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
fin.close();
} }
timer.Stop(); timer.Stop();
@ -405,30 +413,78 @@ class BinaryIO {
timer.Start(); timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO #ifdef USE_MPI_IO
std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl; std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); std::cout << GridLogMessage << "Checking for errors" << std::endl;
ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); if (ierr != MPI_SUCCESS)
MPI_File_close(&fh); {
MPI_Type_free(&fileArray); char error_string[BUFSIZ];
MPI_Type_free(&localArray); int length_of_error_string, error_class;
MPI_Error_class(ierr, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Error_string(ierr, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
#else #else
assert(0); assert(0);
#endif #endif
} else { } else {
std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : " std::ofstream fout;
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl; fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
if ( control & BINARYIO_MASTER_APPEND ) { try {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
if ( control & BINARYIO_MASTER_APPEND ) {
fout.seekp(0,fout.end); fout.seekp(0,fout.end);
} else { } else {
fout.seekp(offset+myrank*lsites*sizeof(fobj)); fout.seekp(offset+myrank*lsites*sizeof(fobj));
} }
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
try {
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
}
catch (const std::fstream::failure& exc) {
std::cout << "Exception in writing file " << file << std::endl;
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
fout.close(); fout.close();
} }
timer.Stop(); timer.Stop();
} }
std::cout<<GridLogMessage<<"IOobject: "; std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read "; if ( control & BINARYIO_READ) std::cout << " read ";
@ -442,11 +498,14 @@ class BinaryIO {
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Safety check // Safety check
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
grid->Barrier(); // if the data size is 1 we do not want to sum over the MPI ranks
grid->GlobalSum(nersc_csum); if (iodata.size() != 1){
grid->GlobalXOR(scidac_csuma); grid->Barrier();
grid->GlobalXOR(scidac_csumb); grid->GlobalSum(nersc_csum);
grid->Barrier(); grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
}
} }
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
@ -546,9 +605,9 @@ class BinaryIO {
int gsites = grid->gSites(); int gsites = grid->gSites();
int lsites = grid->lSites(); int lsites = grid->lSites();
uint32_t nersc_csum_tmp; uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp; uint32_t scidac_csuma_tmp = 0;
uint32_t scidac_csumb_tmp; uint32_t scidac_csumb_tmp = 0;
GridStopWatch timer; GridStopWatch timer;
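The write path now checks MPI_File_open and prints both the error class and the specific error string before aborting. That reporting idiom as a standalone helper (MPI_MAX_ERROR_STRING used here in place of BUFSIZ):

```
#include <mpi.h>
#include <cstdio>

// Print the class-level and specific messages for an MPI error code on
// this rank, then abort, as the MPI-IO open path above now does.
void report_and_abort(int ierr, int myrank) {
  if (ierr == MPI_SUCCESS) return;
  char msg[MPI_MAX_ERROR_STRING];
  int len = 0, eclass = 0;
  MPI_Error_class(ierr, &eclass);
  MPI_Error_string(eclass, msg, &len);
  fprintf(stderr, "%3d: %s\n", myrank, msg);
  MPI_Error_string(ierr, msg, &len);
  fprintf(stderr, "%3d: %s\n", myrank, msg);
  MPI_Abort(MPI_COMM_WORLD, 1);
}
```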

View File

@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
for(int i=0; i < Ls; i++){ for(int i=0; i < Ls; i++){
as[i] = 1.0; as[i] = 1.0;
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
// assert(fabs(omega[i])>0.0); assert(omega[i]!=Coeff_t(0.0));
bs[i] = 0.5*(bpc/omega[i] + bmc); bs[i] = 0.5*(bpc/omega[i] + bmc);
cs[i] = 0.5*(bpc/omega[i] - bmc); cs[i] = 0.5*(bpc/omega[i] - bmc);
} }
@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
for(int i=0;i<Ls;i++){ for(int i=0;i<Ls;i++){
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0); bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
// assert(fabs(bee[i])>0.0); assert(bee[i]!=Coeff_t(0.0));
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
beo[i]=as[i]*bs[i]; beo[i]=as[i]*bs[i];
ceo[i]=-as[i]*cs[i]; ceo[i]=-as[i]*cs[i];
@ -455,11 +455,17 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
dee[i] = bee[i]; dee[i] = bee[i];
if ( i < Ls-1 ) { if ( i < Ls-1 ) {
assert(bee[i]!=Coeff_t(0.0));
assert(bee[0]!=Coeff_t(0.0));
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0]; leem[i]=mass*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1]; for(int j=0;j<i;j++) {
assert(bee[j+1]!=Coeff_t(0.0));
leem[i]*= aee[j]/bee[j+1];
}
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
{ {
Coeff_t delta_d=mass*cee[Ls-1]; Coeff_t delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) { for(int j=0;j<Ls-1;j++) {
// assert(fabs(bee[j])>0.0); assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j]; delta_d *= cee[j]/bee[j];
} }
dee[Ls-1] += delta_d; dee[Ls-1] += delta_d;
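The coefficient setup replaces the commented-out fabs checks with hard asserts on every divisor. The guarded product that builds leem[i], as a small sketch with the same recurrence:

```
#include <cassert>
#include <complex>
#include <vector>

using Coeff_t = std::complex<double>;

// leem[i] = mass*cee[Ls-1]/bee[0] * prod_{j<i} aee[j]/bee[j+1],
// asserting each divisor is nonzero as the diff now does.
Coeff_t leem_entry(int i, Coeff_t mass,
                   const std::vector<Coeff_t> &aee,
                   const std::vector<Coeff_t> &bee,
                   const std::vector<Coeff_t> &cee) {
  const int Ls = (int)bee.size();
  assert(bee[0] != Coeff_t(0.0));
  Coeff_t v = mass * cee[Ls - 1] / bee[0];
  for (int j = 0; j < i; j++) {
    assert(bee[j + 1] != Coeff_t(0.0));
    v *= aee[j] / bee[j + 1];
  }
  return v;
}
```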

View File

@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
{ {
Compressor compressor; Compressor compressor;
int LLs = in._grid->_rdimensions[0]; int LLs = in._grid->_rdimensions[0];
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor); st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) { if (dag == DaggerYes) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
} }
} }
DhopComputeTime += usecond();
DhopTotalTime += usecond();
} }
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check conformable(in._grid,out._grid); // drops the cb check
@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check conformable(in._grid,out._grid); // drops the cb check
@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
template<class Impl> template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls+=2;
conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,FermionGrid()); // verifies full grid
conformable(in._grid,out._grid); conformable(in._grid,out._grid);
@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
} }
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors;
RealD NN = _FourDimGrid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_FourDimGrid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
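The new counters are intended to bracket a solve; a minimal usage sketch, mirroring the staggered multi-RHS CG test at the end of this commit (Ds, HermOp, src, result and CG are the names used there; all other setup is assumed):

```
Ds.ZeroCounters();        // reset DhopCalls and the Dhop timers
CG(HermOp, src, result);  // each Dhop/DhopOE/DhopEO call accumulates comm and compute time
Ds.Report();              // per-call breakdown plus mflops per rank and per node
```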
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// Implement the general interface. Here we use SAME mass on all slices // Implement the general interface. Here we use SAME mass on all slices

View File

@ -55,6 +55,16 @@ namespace QCD {
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
template<class vobj,class cobj> template<class vobj,class cobj>
class WilsonStencil : public CartesianStencil<vobj,cobj> { class WilsonStencil : public CartesianStencil<vobj,cobj> {
public: public:
double timer0;
double timer1;
double timer2;
double timer3;
double timer4;
double timer5;
double timer6;
uint64_t callsi;
void ZeroCountersi(void)
{
timer0=0;
timer1=0;
timer2=0;
timer3=0;
timer4=0;
timer5=0;
timer6=0;
callsi=0;
}
void Reporti(int calls)
{
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
}
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
std::vector<int> same_node; std::vector<int> same_node;
@ -252,6 +278,7 @@ public:
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
same_node(npoints) same_node(npoints)
{ {
ZeroCountersi();
surface_list.resize(0); surface_list.resize(0);
}; };
@ -261,7 +288,6 @@ public:
// Here we know the distance is 1 for WilsonStencil // Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){ for(int point=0;point<this->_npoints;point++){
same_node[point] = this->SameNode(point); same_node[point] = this->SameNode(point);
// std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
} }
for(int site = 0 ;site< vol4;site++){ for(int site = 0 ;site< vol4;site++){
@ -282,17 +308,28 @@ public:
{ {
std::vector<std::vector<CommsRequest_t> > reqs; std::vector<std::vector<CommsRequest_t> > reqs;
this->HaloExchangeOptGather(source,compress); this->HaloExchangeOptGather(source,compress);
this->CommunicateBegin(reqs); double t1=usecond();
this->CommunicateComplete(reqs); // Asynchronous MPI calls multidirectional, Isend etc...
// this->CommunicateBegin(reqs);
// this->CommunicateComplete(reqs);
// Non-overlapped directions within a thread. Asynchronous calls except MPI3; threaded over up to nCommThreads ways.
this->Communicate();
double t2=usecond(); timer1 += t2-t1;
this->CommsMerge(compress); this->CommsMerge(compress);
double t3=usecond(); timer2 += t3-t2;
this->CommsMergeSHM(compress); this->CommsMergeSHM(compress);
double t4=usecond(); timer3 += t4-t3;
} }
template <class compressor> template <class compressor>
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
{ {
this->Prepare(); this->Prepare();
double t0=usecond();
this->HaloGatherOpt(source,compress); this->HaloGatherOpt(source,compress);
double t1=usecond();
timer0 += t1-t0;
callsi++;
} }
template <class compressor> template <class compressor>
@ -304,7 +341,9 @@ public:
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
this->mpi3synctime_g-=usecond();
this->_grid->StencilBarrier(); this->_grid->StencilBarrier();
this->mpi3synctime_g+=usecond();
assert(source._grid==this->_grid); assert(source._grid==this->_grid);
this->halogtime-=usecond(); this->halogtime-=usecond();
@ -323,7 +362,6 @@ public:
int dag = compress.dag; int dag = compress.dag;
int face_idx=0; int face_idx=0;
if ( dag ) { if ( dag ) {
// std::cout << " Optimised Dagger compress " <<std::endl;
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));

View File

@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
int vol4; int vol4;
vol4=FourDimGrid.oSites(); vol4=FourDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol4); Stencil.BuildSurfaceList(LLs,vol4);
vol4=FourDimRedBlackGrid.oSites(); vol4=FourDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol4); StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4);
std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() // std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
<<" " << StencilEven.surface_list.size()<<std::endl; // <<" " << StencilEven.surface_list.size()<<std::endl;
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::Report(void) void WilsonFermion5D<Impl>::Report(void)
{ {
std::vector<int> latt = GridDefaultLatt(); RealD NP = _FourDimGrid->_Nprocessors;
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; RealD NN = _FourDimGrid->NodeCount();
RealD NP = _FourDimGrid->_Nprocessors; RealD volume = Ls;
RealD NN = _FourDimGrid->NodeCount(); std::vector<int> latt = _FourDimGrid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
if ( DhopCalls > 0 ) { if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
} }
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
} }
template<class Impl> template<class Impl>
@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
Stencil.ZeroCounters(); Stencil.ZeroCounters();
StencilEven.ZeroCounters(); StencilEven.ZeroCounters();
StencilOdd.ZeroCounters(); StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
} }
@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
{ {
#ifdef GRID_OMP #ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
Compressor compressor(dag); Compressor compressor(dag);
@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.HaloExchangeOptGather(in,compressor); st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond(); st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
std::vector<std::vector<CommsRequest_t> > reqs;
// Rely on async comms; start comms before merge of local data
DhopCommTime-=usecond();
st.CommunicateBegin(reqs);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
// Perhaps use omp task and region double ctime=0;
#pragma omp parallel double ptime=0;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA (Intel Omni-Path) reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
{ {
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads(); int nthreads = omp_get_num_threads();
int me = omp_get_thread_num(); int ncomms = CartesianCommunicator::nCommThreads;
int myoff, mywork; if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1); if (tid >= ncomms) {
int sF = LLs * myoff; double start = usecond();
nthreads -= ncomms;
if ( me == 0 ) { int ttid = tid - ncomms;
st.CommunicateComplete(reqs); int n = U._grid->oSites();
DhopCommTime+=usecond(); int chunk = n / nthreads;
} else { int rem = n % nthreads;
// Interior links in stencil int myblock, myn;
if ( me==1 ) DhopComputeTime-=usecond(); if (ttid < rem) {
if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); myblock = ttid * chunk + ttid;
else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); myn = chunk+1;
if ( me==1 ) DhopComputeTime+=usecond(); } else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
}
}
ptime = usecond() - start;
}
{
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
} }
} }
DhopCommTime += ctime;
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
// Load imbalance alert. Should use dynamic schedule OMP for loop
// Perhaps create a list of only those sites with face work, and
// load balance process the list.
DhopComputeTime2-=usecond(); DhopComputeTime2-=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
int sz=st.surface_list.size(); int sz=st.surface_list.size();
@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
#else #else
assert(0); assert(0);
#endif #endif
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,

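The explicit thread mapping above reserves the first nCommThreads OpenMP threads for communications and splits the interior sites into near-equal contiguous blocks over the remaining compute threads. A standalone sketch of that partitioning convention (compute_block is a hypothetical helper, not part of Grid):

```
#include <cassert>
#include <cstdio>
#include <utility>

// Threads [0, ncomms) do communications; the rest split n sites into blocks.
// Returns {first site, number of sites} for compute thread 'tid'.
std::pair<int,int> compute_block(int n, int nthreads, int ncomms, int tid) {
  assert(nthreads > ncomms && tid >= ncomms);
  int nwork = nthreads - ncomms;
  int ttid  = tid - ncomms;            // compute-thread index
  int chunk = n / nwork;
  int rem   = n % nwork;
  if (ttid < rem) return { ttid * (chunk + 1), chunk + 1 }; // first 'rem' threads take one extra site
  return { ttid * chunk + rem, chunk };
}

int main() {
  // e.g. 10 outer sites, 4 OpenMP threads, 1 communication thread:
  for (int tid = 1; tid < 4; tid++) {
    std::pair<int,int> blk = compute_block(10, 4, 1, tid);
    printf("thread %d -> sites [%d, %d)\n", tid, blk.first, blk.first + blk.second);
  }
  return 0;
}
```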
View File

@ -165,7 +165,7 @@ class HMCResourceManager {
// Grids // Grids
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
void AddGrid(std::string s, GridModule& M) { void AddGrid(const std::string s, GridModule& M) {
// Check for name clashes // Check for name clashes
auto search = Grids.find(s); auto search = Grids.find(s);
if (search != Grids.end()) { if (search != Grids.end()) {
@ -174,14 +174,24 @@ class HMCResourceManager {
exit(1); exit(1);
} }
Grids[s] = std::move(M); Grids[s] = std::move(M);
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
Grids[s].show_full_decomposition();
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
} }
// Add a named grid set, 4d shortcut // Add a named grid set, 4d shortcut
void AddFourDimGrid(std::string s) { void AddFourDimGrid(const std::string s) {
GridFourDimModule<vComplex> Mod; GridFourDimModule<vComplex> Mod;
AddGrid(s, Mod); AddGrid(s, Mod);
} }
// Add a named grid set, 4d shortcut + tweak simd lanes
void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
GridFourDimModule<vComplex> Mod(simd_decomposition);
AddGrid(s, Mod);
}
GridCartesian* GetCartesian(std::string s = "") { GridCartesian* GetCartesian(std::string s = "") {

View File

@ -33,28 +33,29 @@ directory
namespace Grid { namespace Grid {
// Resources // Resources
// Modules for grids // Modules for grids
// Introduce another namespace HMCModules? // Introduce another namespace HMCModules?
class GridModuleParameters: Serializable{ class GridModuleParameters: Serializable{
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters, GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
std::string, lattice, std::string, lattice,
std::string, mpi); std::string, mpi);
std::vector<int> getLattice(){return strToVec<int>(lattice);} std::vector<int> getLattice() const {return strToVec<int>(lattice);}
std::vector<int> getMpi() {return strToVec<int>(mpi);} std::vector<int> getMpi() const {return strToVec<int>(mpi);}
void check(){
if (getLattice().size() != getMpi().size()) { void check() const {
std::cout << GridLogError if (getLattice().size() != getMpi().size() ) {
std::cout << GridLogError
<< "Error in GridModuleParameters: lattice and mpi dimensions " << "Error in GridModuleParameters: lattice and mpi dimensions "
"do not match" "do not match"
<< std::endl; << std::endl;
exit(1); exit(1);
} }
} }
template <class ReaderClass> template <class ReaderClass>
GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) { GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
@ -75,51 +76,94 @@ private:
// Lower level class // Lower level class
class GridModule { class GridModule {
public: public:
GridCartesian* get_full() { GridCartesian* get_full() {
std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl; std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
return grid_.get(); } return grid_.get(); }
GridRedBlackCartesian* get_rb() { GridRedBlackCartesian* get_rb() {
std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl; std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
return rbgrid_.get(); } return rbgrid_.get(); }
void set_full(GridCartesian* grid) { grid_.reset(grid); } void set_full(GridCartesian* grid) { grid_.reset(grid); }
void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); } void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
void show_full_decomposition(){ grid_->show_decomposition(); }
void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
protected: protected:
std::unique_ptr<GridCartesian> grid_; std::unique_ptr<GridCartesian> grid_;
std::unique_ptr<GridRedBlackCartesian> rbgrid_; std::unique_ptr<GridRedBlackCartesian> rbgrid_;
}; };
//////////////////////////////////// ////////////////////////////////////
// Classes for the user // Classes for the user
//////////////////////////////////// ////////////////////////////////////
// Note: the space time grid should be out of the QCD namespace // Note: the space time grid should be out of the QCD namespace
template< class vector_type> template <class vector_type>
class GridFourDimModule : public GridModule { class GridFourDimModule : public GridModule
public: {
GridFourDimModule() { public:
GridFourDimModule()
{
using namespace QCD; using namespace QCD;
set_full(SpaceTimeGrid::makeFourDimGrid( set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()), GridDefaultLatt(),
GridDefaultSimd(4, vector_type::Nsimd()),
GridDefaultMpi())); GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
} }
GridFourDimModule(GridModuleParameters Params) { GridFourDimModule(const std::vector<int> tweak_simd)
{
using namespace QCD;
if (tweak_simd.size() != 4)
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD size different from 4"
<< std::endl;
exit(1);
}
// Checks that the product agrees with the expectation
int simd_product = 1;
for (auto &n : tweak_simd)
simd_product *= n;
std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Product: " << simd_product << std::endl;
if (simd_product == vector_type::Nsimd())
{
set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(),
tweak_simd,
GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
else
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD lanes must sum to "
<< vector_type::Nsimd()
<< std::endl;
}
}
GridFourDimModule(const GridModuleParameters Params)
{
using namespace QCD; using namespace QCD;
Params.check();
std::vector<int> lattice_v = Params.getLattice(); std::vector<int> lattice_v = Params.getLattice();
std::vector<int> mpi_v = Params.getMpi(); std::vector<int> mpi_v = Params.getMpi();
if (lattice_v.size() == 4) { if (lattice_v.size() == 4)
{
set_full(SpaceTimeGrid::makeFourDimGrid( set_full(SpaceTimeGrid::makeFourDimGrid(
lattice_v, GridDefaultSimd(4, vector_type::Nsimd()), lattice_v,
GridDefaultSimd(4, vector_type::Nsimd()),
mpi_v)); mpi_v));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get())); set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
} else { }
std::cout << GridLogError else
<< "Error in GridFourDimModule: lattice dimension different from 4" {
<< std::endl; std::cout << GridLogError
<< "Error in GridFourDimModule: lattice dimension different from 4"
<< std::endl;
exit(1); exit(1);
} }
} }
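The check above requires the per-direction SIMD lanes to multiply up to vector_type::Nsimd(). A usage sketch, taken from the Mobius HMC example later in this commit (TheHMC is the HMC wrapper defined there; the {2,2,1,1} layout is only valid when vComplex::Nsimd() is 4 on the build target):

```
// Request a specific SIMD decomposition for the "gauge" grid set;
// 2*2*1*1 must equal vComplex::Nsimd() or the module reports an error.
std::vector<int> simd_lanes({2, 2, 1, 1});
TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes);
```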

View File

@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase; typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
using ObsBase::ObsBase; // for constructors using ObsBase::ObsBase; // for constructors
// acquire resource // acquire resource
virtual void initialize(){ virtual void initialize(){
this->ObservablePtr.reset(new PlaquetteLogger<Impl>()); this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
PlaquetteMod(): ObsBase(NoParameters()){} PlaquetteMod(): ObsBase(NoParameters()){}
}; };
template < class Impl > template < class Impl >
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{ class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase; typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
using ObsBase::ObsBase; // for constructors using ObsBase::ObsBase; // for constructors
// acquire resource // acquire resource
virtual void initialize(){ virtual void initialize(){
this->ObservablePtr.reset(new TopologicalCharge<Impl>()); this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
} }
public: public:
TopologicalChargeMod(): ObsBase(NoParameters()){} TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
TopologicalChargeMod(): ObsBase(){}
}; };
}// QCD temporarily here }// QCD temporarily here

View File

@ -33,9 +33,45 @@ directory
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
struct TopologySmearingParameters : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
int, steps,
float, step_size,
int, meas_interval,
float, maxTau);
TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
template < class ReaderClass >
TopologySmearingParameters(Reader<ReaderClass>& Reader){
read(Reader, "Smearing", *this);
}
};
struct TopologyObsParameters : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
int, interval,
bool, do_smearing,
TopologySmearingParameters, Smearing);
TopologyObsParameters(int interval = 1, bool smearing = false):
interval(interval), do_smearing(smearing){}
template <class ReaderClass >
TopologyObsParameters(Reader<ReaderClass>& Reader){
read(Reader, "TopologyMeasurement", *this);
}
};
// this is only defined for a gauge theory // this is only defined for a gauge theory
template <class Impl> template <class Impl>
class TopologicalCharge : public HmcObservable<typename Impl::Field> { class TopologicalCharge : public HmcObservable<typename Impl::Field> {
TopologyObsParameters Pars;
public: public:
// here forces the Impl to be of gauge fields // here forces the Impl to be of gauge fields
// if not the compiler will complain // if not the compiler will complain
@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
// necessary for HmcObservable compatibility // necessary for HmcObservable compatibility
typedef typename Impl::Field Field; typedef typename Impl::Field Field;
TopologicalCharge(int interval = 1, bool do_smearing = false):
Pars(interval, do_smearing){}
TopologicalCharge(TopologyObsParameters P):Pars(P){
std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
}
void TrajectoryComplete(int traj, void TrajectoryComplete(int traj,
Field &U, Field &U,
GridSerialRNG &sRNG, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) { GridParallelRNG &pRNG) {
Real q = WilsonLoops<Impl>::TopologicalCharge(U); if (traj%Pars.interval == 0){
// Smearing
Field Usmear = U;
int def_prec = std::cout.precision();
if (Pars.do_smearing){
// using wilson flow by default here
WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
Real T0 = WF.energyDensityPlaquette(Usmear);
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "T0 : [ " << traj << " ] "<< T0 << std::endl;
}
int def_prec = std::cout.precision(); Real q = WilsonLoops<Impl>::TopologicalCharge(Usmear);
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
std::cout << GridLogMessage std::cout.precision(def_prec);
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1) }
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
std::cout.precision(def_prec);
} }
}; };

View File

@ -26,12 +26,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
//#include <Grid/Grid.h> //#include <Grid/Grid.h>
using namespace Grid; #ifndef GRID_QCD_GAUGE_FIX_H
using namespace Grid::QCD; #define GRID_QCD_GAUGE_FIX_H
namespace Grid {
namespace QCD {
template <class Gimpl> template <class Gimpl>
class FourierAcceleratedGaugeFixer : public Gimpl { class FourierAcceleratedGaugeFixer : public Gimpl {
public: public:
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Gimpl::GaugeLinkField GaugeMat; typedef typename Gimpl::GaugeLinkField GaugeMat;
@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
} }
}; };
}
}
#endif

View File

@ -82,11 +82,11 @@ namespace Optimization {
double tmp[2]={a,b}; double tmp[2]={a,b};
return vld1q_f64(tmp); return vld1q_f64(tmp);
} }
//Real double // N:tbc //Real double
inline float64x2_t operator()(double a){ inline float64x2_t operator()(double a){
return vdupq_n_f64(a); return vdupq_n_f64(a);
} }
//Integer // N:tbc //Integer
inline uint32x4_t operator()(Integer a){ inline uint32x4_t operator()(Integer a){
return vdupq_n_u32(a); return vdupq_n_u32(a);
} }
@ -124,33 +124,32 @@ namespace Optimization {
// Nils: Vset untested; not used currently in Grid at all; // Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{ struct Vset{
// Complex float // N:ok // Complex float
inline float32x4_t operator()(Grid::ComplexF *a){ inline float32x4_t operator()(Grid::ComplexF *a){
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return vld1q_f32(tmp); return vld1q_f32(tmp);
} }
// Complex double // N:ok // Complex double
inline float64x2_t operator()(Grid::ComplexD *a){ inline float64x2_t operator()(Grid::ComplexD *a){
double tmp[2]={a[0].imag(),a[0].real()}; double tmp[2]={a[0].imag(),a[0].real()};
return vld1q_f64(tmp); return vld1q_f64(tmp);
} }
// Real float // N:ok // Real float
inline float32x4_t operator()(float *a){ inline float32x4_t operator()(float *a){
float tmp[4]={a[3],a[2],a[1],a[0]}; float tmp[4]={a[3],a[2],a[1],a[0]};
return vld1q_f32(tmp); return vld1q_f32(tmp);
} }
// Real double // N:ok // Real double
inline float64x2_t operator()(double *a){ inline float64x2_t operator()(double *a){
double tmp[2]={a[1],a[0]}; double tmp[2]={a[1],a[0]};
return vld1q_f64(tmp); return vld1q_f64(tmp);
} }
// Integer // N:ok // Integer
inline uint32x4_t operator()(Integer *a){ inline uint32x4_t operator()(Integer *a){
return vld1q_dup_u32(a); return vld1q_dup_u32(a);
} }
}; };
// N:leaving as is
template <typename Out_type, typename In_type> template <typename Out_type, typename In_type>
struct Reduce{ struct Reduce{
//Need templated class to overload output type //Need templated class to overload output type
@ -249,9 +248,9 @@ namespace Optimization {
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ... return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add // no fma, use mul and add
//float32x4_t r5; // float32x4_t r5;
//r5 = vmulq_f32(r0, a); // r5 = vmulq_f32(r0, a);
//return vaddq_f32(r4, r5); // return vaddq_f32(r4, r5);
} }
// Complex double // Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){ inline float64x2_t operator()(float64x2_t a, float64x2_t b){
@ -272,9 +271,9 @@ namespace Optimization {
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add // no fma, use mul and add
//float64x2_t r5; // float64x2_t r5;
//r5 = vmulq_f64(r0, a); // r5 = vmulq_f64(r0, a);
//return vaddq_f64(r4, r5); // return vaddq_f64(r4, r5);
} }
}; };
@ -421,11 +420,6 @@ namespace Optimization {
} }
} }
// working, but no restriction on n
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
// restriction on n
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
@ -441,7 +435,7 @@ namespace Optimization {
sb = vcvt_high_f32_f16(h); sb = vcvt_high_f32_f16(h);
// there is no direct conversion from lower float32x4_t to float64x2_t // there is no direct conversion from lower float32x4_t to float64x2_t
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang // workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h); uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2)); float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
@ -547,7 +541,7 @@ namespace Optimization {
//Complex double Reduce //Complex double Reduce
template<> // N:by Boyle template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
u128d conv; conv.v = in; u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]); return Grid::ComplexD(conv.f[0],conv.f[1]);
@ -562,9 +556,7 @@ namespace Optimization {
//Integer Reduce //Integer Reduce
template<> template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){ inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
// FIXME unimplemented return vaddvq_u32(in);
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
} }
} }
@ -603,4 +595,5 @@ namespace Optimization {
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;
} }

View File

@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
// Timing info; ugly; possibly temporary // Timing info; ugly; possibly temporary
///////////////////////////////////////// /////////////////////////////////////////
double commtime; double commtime;
double mpi3synctime;
double mpi3synctime_g;
double shmmergetime;
double gathertime; double gathertime;
double gathermtime; double gathermtime;
double halogtime; double halogtime;
@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
double splicetime; double splicetime;
double nosplicetime; double nosplicetime;
double calls; double calls;
std::vector<double> comm_bytes_thr;
std::vector<double> comm_time_thr;
std::vector<double> comm_enter_thr;
std::vector<double> comm_leave_thr;
//////////////////////////////////////// ////////////////////////////////////////
// Stencil query // Stencil query
@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
////////////////////////////////////////// //////////////////////////////////////////
// Comms packet queue for asynch thread // Comms packet queue for asynch thread
////////////////////////////////////////// //////////////////////////////////////////
void CommunicateThreaded()
{
#ifdef GRID_OMP
// must be called in parallel region
int mythread = omp_get_thread_num();
int nthreads = CartesianCommunicator::nCommThreads;
#else
int mythread = 0;
int nthreads = 1;
#endif
if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) {
comm_enter_thr[mythread] = usecond();
for (int i = mythread; i < Packets.size(); i += nthreads) {
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comm_bytes_thr[mythread] += bytes;
}
comm_leave_thr[mythread]= usecond();
comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
}
}
void CollateThreads(void)
{
int nthreads = CartesianCommunicator::nCommThreads;
double first=0.0;
double last =0.0;
for(int t=0;t<nthreads;t++) {
double t0 = comm_enter_thr[t];
double t1 = comm_leave_thr[t];
comms_bytes+=comm_bytes_thr[t];
comm_enter_thr[t] = 0.0;
comm_leave_thr[t] = 0.0;
comm_time_thr[t] = 0.0;
comm_bytes_thr[t]=0;
if ( first == 0.0 ) first = t0; // first is t0
if ( (t0 > 0.0) && ( t0 < first ) ) first = t0; // min time seen
if ( t1 > last ) last = t1; // max time seen
}
commtime+= last-first;
}
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
reqs.resize(Packets.size()); reqs.resize(Packets.size());
commtime-=usecond(); commtime-=usecond();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i], comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
Packets[i].send_buf, Packets[i].send_buf,
Packets[i].to_rank, Packets[i].to_rank,
Packets[i].recv_buf, Packets[i].recv_buf,
Packets[i].from_rank, Packets[i].from_rank,
Packets[i].bytes); Packets[i].bytes,i);
} }
} }
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromComplete(reqs[i]); _grid->StencilSendToRecvFromComplete(reqs[i],i);
} }
commtime+=usecond(); commtime+=usecond();
} }
void Communicate(void)
{
#ifdef GRID_OMP
#pragma omp parallel
{
// must be called in parallel region
int mythread = omp_get_thread_num();
int maxthreads= omp_get_max_threads();
int nthreads = CartesianCommunicator::nCommThreads;
assert(nthreads <= maxthreads);
if (nthreads == -1) nthreads = 1;
#else
int mythread = 0;
int nthreads = 1;
#endif
if (mythread < nthreads) {
for (int i = mythread; i < Packets.size(); i += nthreads) {
double start = usecond();
comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
comm_time_thr[mythread] += usecond() - start;
}
}
#ifdef GRID_OMP
}
#endif
}
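Both CommunicateThreaded() and the threaded Communicate() above hand packets to the communication threads in a round-robin stride rather than in contiguous blocks. A minimal sketch of that assignment (list_packets_for_thread is a hypothetical helper, not part of Grid):

```
#include <cstdio>

// Comm thread 'tid' services packets tid, tid+nthreads, tid+2*nthreads, ...
void list_packets_for_thread(int npackets, int nthreads, int tid) {
  for (int i = tid; i < npackets; i += nthreads)
    printf("thread %d -> packet %d\n", tid, i);
}

int main() {
  // e.g. 8 halo packets split over 2 communication threads:
  list_packets_for_thread(8, 2, 0);   // packets 0,2,4,6
  list_packets_for_thread(8, 2, 1);   // packets 1,3,5,7
  return 0;
}
```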
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{ {
std::vector<std::vector<CommsRequest_t> > reqs; std::vector<std::vector<CommsRequest_t> > reqs;
Prepare(); Prepare();
HaloGather(source,compress); HaloGather(source,compress);
CommunicateBegin(reqs); // Concurrent
CommunicateComplete(reqs); //CommunicateBegin(reqs);
//CommunicateComplete(reqs);
// Sequential, possibly threaded
Communicate();
CommsMergeSHM(compress); CommsMergeSHM(compress);
CommsMerge(compress); CommsMerge(compress);
} }
@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
template<class compressor> template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress) void HaloGather(const Lattice<vobj> &source,compressor &compress)
{ {
mpi3synctime_g-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single nodes _grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime_g+=usecond();
// conformable(source._grid,_grid); // conformable(source._grid,_grid);
assert(source._grid==_grid); assert(source._grid==_grid);
@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
CommsMerge(decompress,Mergers,Decompressions); CommsMerge(decompress,Mergers,Decompressions);
} }
template<class decompressor> void CommsMergeSHM(decompressor decompress) { template<class decompressor> void CommsMergeSHM(decompressor decompress) {
mpi3synctime-=usecond();
_grid->StencilBarrier();// Synch shared memory on a single node _grid->StencilBarrier();// Synch shared memory on a single node
mpi3synctime+=usecond();
shmmergetime-=usecond();
CommsMerge(decompress,MergersSHM,DecompressionsSHM); CommsMerge(decompress,MergersSHM,DecompressionsSHM);
shmmergetime+=usecond();
} }
template<class decompressor> template<class decompressor>
@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
int checkerboard, int checkerboard,
const std::vector<int> &directions, const std::vector<int> &directions,
const std::vector<int> &distances) const std::vector<int> &distances)
: _permute_type(npoints), _comm_buf_size(npoints) : _permute_type(npoints),
_comm_buf_size(npoints),
comm_bytes_thr(npoints),
comm_enter_thr(npoints),
comm_leave_thr(npoints),
comm_time_thr(npoints)
{ {
face_table_computed=0; face_table_computed=0;
_npoints = npoints; _npoints = npoints;
@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
void ZeroCounters(void) { void ZeroCounters(void) {
gathertime = 0.; gathertime = 0.;
commtime = 0.; commtime = 0.;
mpi3synctime=0.;
mpi3synctime_g=0.;
shmmergetime=0.;
for(int i=0;i<_npoints;i++){
comm_time_thr[i]=0;
comm_bytes_thr[i]=0;
comm_enter_thr[i]=0;
comm_leave_thr[i]=0;
}
halogtime = 0.; halogtime = 0.;
mergetime = 0.; mergetime = 0.;
decompresstime = 0.; decompresstime = 0.;
@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
RealD NP = _grid->_Nprocessors; RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount(); RealD NN = _grid->NodeCount();
double t = 0;
// if comm_time_thr is set they were all done in parallel so take the max
// but add up the bytes
int threaded = 0 ;
for (int i = 0; i < 8; ++i) {
if ( comm_time_thr[i]>0.0 ) {
threaded = 1;
comms_bytes += comm_bytes_thr[i];
if (t < comm_time_thr[i]) t = comm_time_thr[i];
}
}
if (threaded) commtime += t;
_grid->GlobalSum(commtime); commtime/=NP; _grid->GlobalSum(commtime); commtime/=NP;
if ( calls > 0. ) { if ( calls > 0. ) {
@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl; std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl; std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
} }
PRINTIT(mpi3synctime);
PRINTIT(mpi3synctime_g);
PRINTIT(shmmergetime);
PRINTIT(splicetime); PRINTIT(splicetime);
PRINTIT(nosplicetime); PRINTIT(nosplicetime);
} }

View File

@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
strong_inline void mult(iVector<rtype,N> * __restrict__ ret, strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
const iVector<vtype,N> * __restrict__ rhs, const iVector<vtype,N> * __restrict__ rhs,
const iScalar<mtype> * __restrict__ lhs){ const iScalar<mtype> * __restrict__ lhs){
mult(ret,lhs,rhs); for(int c1=0;c1<N;c1++){
mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
}
} }

View File

@ -219,9 +219,15 @@ void Grid_init(int *argc,char ***argv)
int MB; int MB;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
GridCmdOptionInt(arg,MB); GridCmdOptionInt(arg,MB);
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024; uint64_t MB64 = MB;
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
CartesianCommunicator::Hugepages = 1;
}
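The widening to uint64_t above matters once the requested buffer reaches 2 GB: evaluated in 32-bit int, MB*1024*1024 overflows for MB >= 2048. A small check of the arithmetic (a sketch assuming a 32-bit int, as on common LP64 platforms):

```
#include <cstdint>
#include <cstdio>

int main() {
  int MB = 2048;                          // e.g. --shm 2048
  // MB * 1024 * 1024 would be evaluated in 'int' and overflow for MB >= 2048
  // (2048*1024*1024 = 2^31 > INT_MAX), hence the 64-bit widening above.
  uint64_t bytes = (uint64_t)MB * 1024ULL * 1024ULL;
  printf("%llu bytes requested\n", (unsigned long long)bytes);   // 2147483648
  return 0;
}
```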
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
@ -304,6 +310,7 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl; std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl; std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl; std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl; std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
@ -317,7 +324,7 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl; std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl; std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl; std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl; std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl; std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
@ -356,10 +363,15 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1; LebesgueOrder::UseLebesgueOrder=1;
} }
CartesianCommunicator::nCommThreads = -1;
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block); GridCmdOptionIntVector(arg,LebesgueOrder::Block);
@ -374,10 +386,13 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt, Grid_default_latt,
Grid_default_mpi); Grid_default_mpi);
std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl; std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
if ( CartesianCommunicator::Hugepages) {
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Decomposition\n"; std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl; std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl; std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl; std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
@ -393,7 +408,7 @@ void Grid_init(int *argc,char ***argv)
void Grid_finalize(void) void Grid_finalize(void)
{ {
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
MPI_Finalize(); MPI_Finalize();
Grid_unquiesce_nodes(); Grid_unquiesce_nodes();
#endif #endif

View File

@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/Grid.h> #include <Grid/Grid.h>
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
std::vector<int> seeds({1,2,3,4}); std::vector<int> seeds({1,2,3,4});
@ -82,6 +85,7 @@ int main (int argc, char ** argv)
Uorg = Uorg - Umu; Uorg = Uorg - Umu;
std::cout << " Norm Difference "<< norm2(Uorg) << std::endl; std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
std::cout << " Norm "<< norm2(Umu) << std::endl;
std::cout<< "*****************************************************************" <<std::endl; std::cout<< "*****************************************************************" <<std::endl;

View File

@ -40,12 +40,6 @@ namespace Grid{
double, StoppingCondition, double, StoppingCondition,
int, MaxCGIterations, int, MaxCGIterations,
bool, ApplySmearing); bool, ApplySmearing);
//template <class ReaderClass >
//FermionParameters(Reader<ReaderClass>& Reader){
// read(Reader, "Mobius", *this);
//}
}; };
@ -113,9 +107,17 @@ int main(int argc, char **argv) {
bool ApplySmearing = MyParams.Mobius.ApplySmearing; bool ApplySmearing = MyParams.Mobius.ApplySmearing;
// Use this if you want to tweak the default decomposition
// commented out as very architecture specific
//std::vector<int> simd_lanes({2,2,1,1});
// Grid from the command line // Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes
//TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // tweak the SIMD lanes
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
// Possible to create the module by hand // Possible to create the module by hand
// hardcoding parameters or using a Reader // hardcoding parameters or using a Reader

View File

@ -66,7 +66,14 @@ int main(int argc, char **argv) {
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs; typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
TheHMC.Resources.AddObservable<PlaqObs>(); TheHMC.Resources.AddObservable<PlaqObs>();
TheHMC.Resources.AddObservable<QObs>(); TopologyObsParameters TopParams;
TopParams.interval = 5;
TopParams.do_smearing = true;
TopParams.Smearing.steps = 200;
TopParams.Smearing.step_size = 0.01;
TopParams.Smearing.meas_interval = 50;
TopParams.Smearing.maxTau = 2.0;
TheHMC.Resources.AddObservable<QObs>(TopParams);
////////////////////////////////////////////// //////////////////////////////////////////////
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////

View File

@ -75,7 +75,7 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
RealD mass=0.003; RealD mass=0.003;
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass); ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds); MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000); ConjugateGradient<FermionField> CG(1.0e-8,10000);
@ -99,21 +99,27 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl; std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero; result=zero;
Ds.ZeroCounters();
CG(HermOp,src,result); CG(HermOp,src,result);
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl; std::cout << GridLogMessage << " Calling multiRHS CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero; result=zero;
Ds.ZeroCounters();
mCG(HermOp,src,result); mCG(HermOp,src,result);
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl; std::cout << GridLogMessage << " Calling Block CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero; result=zero;
Ds.ZeroCounters();
BCGrQ(HermOp,src,result); BCGrQ(HermOp,src,result);
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl; std::cout << GridLogMessage << "************************************************************************ "<<std::endl;