mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'develop' into feature/multi-communicator

Commit be66e7dd95 by paboyle, 2017-08-19 23:12:38 +01:00
11 changed files with 914 additions and 260 deletions

TODO (17 changed lines)

@@ -2,19 +2,16 @@ TODO:
 ---------------
 Large item work list:

-1)- I/O; There appear to be issues with MPI IO and NERSC with large files.
-    Possible 2GB limit reappeared. GPFS driver in Intel MPI.
-2)- BG/Q port and check
-3)- Christoph's local basis expansion Lanczos; port to use Lattice_transfer features
-4)- Precision conversion and sort out localConvert <-- partial
+1)- BG/Q port and check
+2)- Christoph's local basis expansion Lanczos
+3)- Precision conversion and sort out localConvert <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-5)- Physical propagator interface
-6)- Conserved currents
-7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-8)- HDCR resume
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume

 Recent DONE

benchmarks/Benchmark_ITT.cc (new file, 518 lines)

@@ -0,0 +1,518 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_ITT.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
struct time_statistics{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v){
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
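// Note: err above is the standard error of the mean, sqrt( sum (x-mean)^2 / (N*(N-1)) ),
// so the tables below quote mean-based rates together with the err/min/max spread.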
void comms_header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
struct controls {
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
// int HugePages;
};
class Benchmark {
public:
static void Decomposition (void ) {
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
static void Comms(void)
{
int Nloop=100;
int nmu=0;
int maxlat=32;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
comms_header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
std::vector<CartesianCommunicator::CommsRequest_t> requests;
dbytes=0;
ncomm=0;
parallel_for(int dir=0;dir<8;dir++){
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
ncomm++;
int xmit_to_rank;
int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
#if 1
tbytes= Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[dir][0],
xmit_to_rank,
(void *)&rbuf[dir][0],
recv_from_rank,
bytes,dir);
Grid.StencilSendToRecvFromComplete(requests,dir);
#endif
requests.resize(0);
#pragma omp atomic
dbytes+=tbytes;
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double rbytes = dbytes*0.5;
double bidibytes = dbytes;
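// Accounting note: each active direction both sends and receives "bytes" per pass,
// so half of the counted traffic is quoted as the uni-directional rate and all of
// it as bi-directional; ppn (ranks per node) rescales per-rank figures to per-node.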
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
return;
}
static void Memory(void)
{
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t lmax=48;
#define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
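// NLOOP shrinks the iteration count as lat grows, keeping the total volume of
// data moved roughly constant across lattice sizes.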
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=rn;
LatticeVec x(&Grid); x=rn;
LatticeVec y(&Grid); y=rn;
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
y._odata[4]=z._odata[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3.0*vol*Nvec*sizeof(Real);
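// z = a*x - y streams three vectors per site (read x, read y, write z), hence the
// factor of 3 in the byte count and 2 flops (one multiply, one add/subtract) per element.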
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
}
};
static void DWF(int Ls,int L)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
std::vector<int> local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
uint64_t SHM=NP/NN;
std::vector<int> internal;
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
else assert(0);
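// The intranode split spreads the ranks sharing a node (SHM = ranks per node)
// across the lattice dimensions; only power-of-two values up to 8 are anticipated.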
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (FGrid); random(RNG5,src);
LatticeFermion ref (FGrid);
LatticeFermion tmp (FGrid);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
{
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
ref = zero;
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
LatticeFermion err (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
#else
const int num_cases = 4;
#endif
controls Cases [] = {
#if defined(AVX512)
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
FGrid->Broadcast(0,&ncall,sizeof(ncall));
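// The warm-up loop calibrates ncall to roughly 2.5 seconds of measurement;
// broadcasting from rank 0 keeps every rank running the same number of calls.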
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
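// 1344 flops per 4d site is the count Grid uses for one dslash application
// (8 directions x 168 flops for SU(3) x half-spinor plus spin project/reconstruct);
// halved here because DhopEO updates a single checkerboard.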
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
Dw.Report();
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-ref;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert((norm2(err)<1.0e-4));
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness = "<< mflops_worst/mflops_best <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
Benchmark::Decomposition();
int do_memory=1;
int do_comms =1;
int do_su3 =0;
int do_wilson=1;
int do_dwf =1;
if ( do_memory ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Memory();
}
if ( do_comms ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Comms();
}
if ( do_su3 ) {
// empty for now
}
if ( do_wilson ) {
int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::DWF(Ls,16);
Benchmark::DWF(Ls,24);
Benchmark::DWF(Ls,32);
}
if ( do_dwf ) {
int Ls=16;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::DWF(Ls,8);
Benchmark::DWF(Ls,12);
Benchmark::DWF(Ls,16);
Benchmark::DWF(Ls,24);
}
Grid_finalize();
}


@@ -11,7 +11,7 @@ int PointerCache::victim;

 void *PointerCache::Insert(void *ptr,size_t bytes) {

-  if (bytes < 4096 ) return NULL;
+  if (bytes < 4096 ) return ptr;

 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
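The one-line change above matters for small allocations: Insert() appears to return the pointer that the caller is then expected to free, so returning NULL for blocks under 4096 bytes meant they were neither cached nor released. A minimal standalone model of that assumed caller contract (hypothetical names, not the Grid source):

#include <cstdlib>

// Toy stand-in for PointerCache::Insert: hand back whatever the caller must free.
static void *cache_insert(void *ptr, size_t bytes) {
  if (bytes < 4096) return ptr; // too small to cache: return it for immediate free()
  return ptr;                   // (a real cache would stash ptr and return an evicted victim)
}

static void deallocate(void *ptr, size_t bytes) {
  void *victim = cache_insert(ptr, bytes);
  if (victim != NULL) free(victim); // with the old NULL return, small blocks leaked here
}

int main(void) {
  deallocate(malloc(128), 128);
  return 0;
}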


@@ -185,17 +185,18 @@ public:
 ////////////////////////////////////////////////////////////////
 void show_decomposition(){
-  std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
-  std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
-  std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
-  std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
-  std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl;
-  std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl;
-  std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
-  std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
-  std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;
-  std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
-  std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;
+  std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
+  std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
+  std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
+  std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
+  std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
+  std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
+  std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
+  std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
+  std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
+  std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;
+  std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
+  std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;
 }
 ////////////////////////////////////////////////////////////////


@@ -62,77 +62,81 @@ public:
     return shift;
   }

   GridCartesian(const std::vector<int> &dimensions,
                 const std::vector<int> &simd_layout,
                 const std::vector<int> &processor_grid) : GridBase(processor_grid)
   {
     ///////////////////////
     // Grid information
     ///////////////////////
     _ndimension = dimensions.size();

     _fdimensions.resize(_ndimension);
     _gdimensions.resize(_ndimension);
     _ldimensions.resize(_ndimension);
     _rdimensions.resize(_ndimension);
     _simd_layout.resize(_ndimension);
     _lstart.resize(_ndimension);
     _lend.resize(_ndimension);
     _ostride.resize(_ndimension);
     _istride.resize(_ndimension);

     _fsites = _gsites = _osites = _isites = 1;

     for (int d = 0; d < _ndimension; d++)
     {
       _fdimensions[d] = dimensions[d];   // Global dimensions
       _gdimensions[d] = _fdimensions[d]; // Global dimensions
       _simd_layout[d] = simd_layout[d];
       _fsites = _fsites * _fdimensions[d];
       _gsites = _gsites * _gdimensions[d];

-      //FIXME check for exact division

       // Use a reduced simd grid
       _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+      assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);

       _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
+      assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);

       _lstart[d] = _processor_coor[d] * _ldimensions[d];
       _lend[d]   = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
       _osites *= _rdimensions[d];
       _isites *= _simd_layout[d];

       // Addressing support
       if (d == 0)
       {
         _ostride[d] = 1;
         _istride[d] = 1;
       }
       else
       {
         _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
         _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
       }
     }

     ///////////////////////
     // subplane information
     ///////////////////////
     _slice_block.resize(_ndimension);
     _slice_stride.resize(_ndimension);
     _slice_nblock.resize(_ndimension);

     int block = 1;
     int nblock = 1;
     for (int d = 0; d < _ndimension; d++)
       nblock *= _rdimensions[d];

     for (int d = 0; d < _ndimension; d++)
     {
       nblock /= _rdimensions[d];
       _slice_block[d] = block;
       _slice_stride[d] = _ostride[d] * _rdimensions[d];
       _slice_nblock[d] = nblock;
       block = block * _rdimensions[d];
     }
   };
 };
 }
 #endif


@@ -131,21 +131,21 @@ public:
     Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
   }

   void Init(const std::vector<int> &dimensions,
             const std::vector<int> &simd_layout,
             const std::vector<int> &processor_grid,
             const std::vector<int> &checker_dim_mask,
             int checker_dim)
   {
     ///////////////////////
     // Grid information
     ///////////////////////
     _checker_dim = checker_dim;
     assert(checker_dim_mask[checker_dim] == 1);
     _ndimension = dimensions.size();
     assert(checker_dim_mask.size() == _ndimension);
     assert(processor_grid.size() == _ndimension);
     assert(simd_layout.size() == _ndimension);

     _fdimensions.resize(_ndimension);
     _gdimensions.resize(_ndimension);
     _ldimensions.resize(_ndimension);
@@ -153,114 +153,133 @@ public:
     _simd_layout.resize(_ndimension);
     _lstart.resize(_ndimension);
     _lend.resize(_ndimension);
     _ostride.resize(_ndimension);
     _istride.resize(_ndimension);

     _fsites = _gsites = _osites = _isites = 1;
     _checker_dim_mask = checker_dim_mask;

     for (int d = 0; d < _ndimension; d++)
     {
       _fdimensions[d] = dimensions[d];
       _gdimensions[d] = _fdimensions[d];
       _fsites = _fsites * _fdimensions[d];
       _gsites = _gsites * _gdimensions[d];

       if (d == _checker_dim)
       {
+        assert((_gdimensions[d] & 0x1) == 0);
         _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
       }
       _ldimensions[d] = _gdimensions[d] / _processors[d];
+      assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
       _lstart[d] = _processor_coor[d] * _ldimensions[d];
       _lend[d]   = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;

       // Use a reduced simd grid
       _simd_layout[d] = simd_layout[d];
       _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
+      assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
       assert(_rdimensions[d] > 0);

       // all elements of a simd vector must have same checkerboard.
       // If Ls vectorised, this must still be the case; e.g. dwf rb5d
       if (_simd_layout[d] > 1)
       {
         if (checker_dim_mask[d])
         {
           assert((_rdimensions[d] & 0x1) == 0);
         }
       }

       _osites *= _rdimensions[d];
       _isites *= _simd_layout[d];

       // Addressing support
       if (d == 0)
       {
         _ostride[d] = 1;
         _istride[d] = 1;
       }
       else
       {
         _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
         _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
       }
     }

     ////////////////////////////////////////////////////////////////////////////////////////////
     // subplane information
     ////////////////////////////////////////////////////////////////////////////////////////////
     _slice_block.resize(_ndimension);
     _slice_stride.resize(_ndimension);
     _slice_nblock.resize(_ndimension);

     int block = 1;
     int nblock = 1;
     for (int d = 0; d < _ndimension; d++)
       nblock *= _rdimensions[d];

     for (int d = 0; d < _ndimension; d++)
     {
       nblock /= _rdimensions[d];
       _slice_block[d] = block;
       _slice_stride[d] = _ostride[d] * _rdimensions[d];
       _slice_nblock[d] = nblock;
       block = block * _rdimensions[d];
     }

     ////////////////////////////////////////////////
     // Create a checkerboard lookup table
     ////////////////////////////////////////////////
     int rvol = 1;
     for (int d = 0; d < _ndimension; d++)
     {
       rvol = rvol * _rdimensions[d];
     }
     _checker_board.resize(rvol);
     for (int osite = 0; osite < _osites; osite++)
     {
       _checker_board[osite] = CheckerBoardFromOindex(osite);
     }
   };

 protected:
   virtual int oIndex(std::vector<int> &coor)
   {
     int idx = 0;
     for (int d = 0; d < _ndimension; d++)
     {
       if (d == _checker_dim)
       {
         idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
       }
       else
       {
         idx += _ostride[d] * (coor[d] % _rdimensions[d]);
       }
     }
     return idx;
   };

   virtual int iIndex(std::vector<int> &lcoor)
   {
     int idx = 0;
     for (int d = 0; d < _ndimension; d++)
     {
       if (d == _checker_dim)
       {
         idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
       }
       else
       {
         idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
       }
     }
     return idx;
   }
 };
 }
 #endif


@@ -98,35 +98,39 @@ class BinaryIO {
     NerscChecksum(grid,scalardata,nersc_csum);
   }

   template <class fobj>
   static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
   {
     const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);

     uint64_t lsites = grid->lSites();
     if (fbuf.size() == 1)
     {
       lsites = 1;
     }

 #pragma omp parallel
     {
       uint32_t nersc_csum_thr = 0;

 #pragma omp for
       for (uint64_t local_site = 0; local_site < lsites; local_site++)
       {
         uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
         for (uint64_t j = 0; j < size32; j++)
         {
           nersc_csum_thr = nersc_csum_thr + site_buf[j];
         }
       }

 #pragma omp critical
       {
         nersc_csum += nersc_csum_thr;
       }
     }
   }

   template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
   {
     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@@ -266,7 +270,7 @@ class BinaryIO {
     grid->Barrier();
     GridStopWatch timer;
     GridStopWatch bstimer;

     nersc_csum=0;
     scidac_csuma=0;
     scidac_csumb=0;
@@ -362,18 +366,22 @@ class BinaryIO {
 #else
       assert(0);
 #endif
     } else {
       std::cout << GridLogMessage << "C++ read I/O " << file << " : "
                 << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
       std::ifstream fin;
       fin.open(file, std::ios::binary | std::ios::in);
       if (control & BINARYIO_MASTER_APPEND)
       {
         fin.seekg(-sizeof(fobj), fin.end);
       }
       else
       {
         fin.seekg(offset + myrank * lsites * sizeof(fobj));
       }
       fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
       assert(fin.fail() == 0);
       fin.close();
     }
     timer.Stop();
@@ -405,30 +413,78 @@ class BinaryIO {
     timer.Start();
     if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
-      std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
-      ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
-      ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
-      ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
-      MPI_File_close(&fh);
-      MPI_Type_free(&fileArray);
-      MPI_Type_free(&localArray);
+      std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
+      ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
+      std::cout << GridLogMessage << "Checking for errors" << std::endl;
+      if (ierr != MPI_SUCCESS)
+      {
+        char error_string[BUFSIZ];
+        int length_of_error_string, error_class;
+
+        MPI_Error_class(ierr, &error_class);
+        MPI_Error_string(error_class, error_string, &length_of_error_string);
+        fprintf(stderr, "%3d: %s\n", myrank, error_string);
+        MPI_Error_string(ierr, error_string, &length_of_error_string);
+        fprintf(stderr, "%3d: %s\n", myrank, error_string);
+        MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
+      }
+
+      std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
+      ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
+      assert(ierr == 0);
+
+      std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
+      ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
+      assert(ierr == 0);
+
+      MPI_File_close(&fh);
+      MPI_Type_free(&fileArray);
+      MPI_Type_free(&localArray);
 #else
       assert(0);
 #endif
     } else {
-      std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
-      std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
-               << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
+      std::ofstream fout;
+      fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
+      try {
+        fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+      } catch (const std::fstream::failure& exc) {
+        std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
+        std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
+        std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
+#ifdef USE_MPI_IO
+        MPI_Abort(MPI_COMM_WORLD,1);
+#else
+        exit(1);
+#endif
+      }
+      std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
+                << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
       if ( control & BINARYIO_MASTER_APPEND ) {
         fout.seekp(0,fout.end);
       } else {
         fout.seekp(offset+myrank*lsites*sizeof(fobj));
       }
-      fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
+      try {
+        fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
+      }
+      catch (const std::fstream::failure& exc) {
+        std::cout << "Exception in writing file " << file << std::endl;
+        std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
+#ifdef USE_MPI_IO
+        MPI_Abort(MPI_COMM_WORLD,1);
+#else
+        exit(1);
+#endif
+      }
       fout.close();
     }
     timer.Stop();
   }

   std::cout<<GridLogMessage<<"IOobject: ";
   if ( control & BINARYIO_READ) std::cout << " read  ";
@@ -442,11 +498,14 @@ class BinaryIO {
     //////////////////////////////////////////////////////////////////////////////
     // Safety check
     //////////////////////////////////////////////////////////////////////////////
-    grid->Barrier();
-    grid->GlobalSum(nersc_csum);
-    grid->GlobalXOR(scidac_csuma);
-    grid->GlobalXOR(scidac_csumb);
-    grid->Barrier();
+    // if the data size is 1 we do not want to sum over the MPI ranks
+    if (iodata.size() != 1){
+      grid->Barrier();
+      grid->GlobalSum(nersc_csum);
+      grid->GlobalXOR(scidac_csuma);
+      grid->GlobalXOR(scidac_csumb);
+      grid->Barrier();
+    }
   }
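Presumably the guard exists because, when iodata holds a single object (the master-append path), every rank computes the checksum of the same record: a GlobalSum would then multiply the NERSC checksum by the number of ranks, and the XOR reductions would cancel to zero on even rank counts.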
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
@@ -546,9 +605,9 @@ class BinaryIO {
     int gsites = grid->gSites();
     int lsites = grid->lSites();

-    uint32_t nersc_csum_tmp;
-    uint32_t scidac_csuma_tmp;
-    uint32_t scidac_csumb_tmp;
+    uint32_t nersc_csum_tmp   = 0;
+    uint32_t scidac_csuma_tmp = 0;
+    uint32_t scidac_csumb_tmp = 0;

     GridStopWatch timer;


@@ -165,7 +165,7 @@ class HMCResourceManager {
   // Grids
   //////////////////////////////////////////////////////////////

-  void AddGrid(std::string s, GridModule& M) {
+  void AddGrid(const std::string s, GridModule& M) {
     // Check for name clashes
     auto search = Grids.find(s);
     if (search != Grids.end()) {
@@ -174,14 +174,24 @@ class HMCResourceManager {
       exit(1);
     }
     Grids[s] = std::move(M);
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
+    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
+    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
+    Grids[s].show_full_decomposition();
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
   }

   // Add a named grid set, 4d shortcut
-  void AddFourDimGrid(std::string s) {
+  void AddFourDimGrid(const std::string s) {
     GridFourDimModule<vComplex> Mod;
     AddGrid(s, Mod);
   }

+  // Add a named grid set, 4d shortcut + tweak simd lanes
+  void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
+    GridFourDimModule<vComplex> Mod(simd_decomposition);
+    AddGrid(s, Mod);
+  }
+
   GridCartesian* GetCartesian(std::string s = "") {


@@ -33,28 +33,29 @@ directory
 namespace Grid {

 // Resources
 // Modules for grids

 // Introduce another namespace HMCModules?

 class GridModuleParameters: Serializable{
 public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
                                   std::string, lattice,
                                   std::string, mpi);

-  std::vector<int> getLattice(){return strToVec<int>(lattice);}
-  std::vector<int> getMpi()    {return strToVec<int>(mpi);}
-
-  void check(){
-    if (getLattice().size() != getMpi().size()) {
+  std::vector<int> getLattice() const {return strToVec<int>(lattice);}
+  std::vector<int> getMpi()     const {return strToVec<int>(mpi);}
+
+  void check() const {
+    if (getLattice().size() != getMpi().size() ) {
       std::cout << GridLogError
                 << "Error in GridModuleParameters: lattice and mpi dimensions "
                    "do not match"
                 << std::endl;
       exit(1);
     }
   }

   template <class ReaderClass>
   GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
@@ -75,51 +76,94 @@ private:
 // Lower level class
 class GridModule {
 public:
   GridCartesian* get_full() {
     std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
     return grid_.get(); }
   GridRedBlackCartesian* get_rb() {
     std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
     return rbgrid_.get(); }

   void set_full(GridCartesian* grid) { grid_.reset(grid); }
   void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
+  void show_full_decomposition(){ grid_->show_decomposition(); }
+  void show_rb_decomposition(){ rbgrid_->show_decomposition(); }

 protected:
   std::unique_ptr<GridCartesian> grid_;
   std::unique_ptr<GridRedBlackCartesian> rbgrid_;
 };

 ////////////////////////////////////
 // Classes for the user
 ////////////////////////////////////
 // Note: the space time grid should be out of the QCD namespace
 template <class vector_type>
 class GridFourDimModule : public GridModule
 {
 public:
   GridFourDimModule()
   {
     using namespace QCD;
     set_full(SpaceTimeGrid::makeFourDimGrid(
         GridDefaultLatt(),
         GridDefaultSimd(4, vector_type::Nsimd()),
         GridDefaultMpi()));
     set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
   }

+  GridFourDimModule(const std::vector<int> tweak_simd)
+  {
+    using namespace QCD;
+    if (tweak_simd.size() != 4)
+    {
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: SIMD size different from 4"
+                << std::endl;
+      exit(1);
+    }
+    // Checks that the product agrees with the expectation
+    int simd_sum = 1;
+    for (auto &n : tweak_simd)
+      simd_sum *= n;
+    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Sum: " << simd_sum << std::endl;
+
+    if (simd_sum == vector_type::Nsimd())
+    {
+      set_full(SpaceTimeGrid::makeFourDimGrid(
+          GridDefaultLatt(),
+          tweak_simd,
+          GridDefaultMpi()));
+      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
+    }
+    else
+    {
+      std::cout << GridLogError
                << "Error in GridFourDimModule: SIMD lanes must sum to "
                << vector_type::Nsimd()
                << std::endl;
+    }
+  }

-  GridFourDimModule(GridModuleParameters Params)
+  GridFourDimModule(const GridModuleParameters Params)
   {
     using namespace QCD;
+    Params.check();
     std::vector<int> lattice_v = Params.getLattice();
     std::vector<int> mpi_v = Params.getMpi();
     if (lattice_v.size() == 4)
     {
       set_full(SpaceTimeGrid::makeFourDimGrid(
           lattice_v,
           GridDefaultSimd(4, vector_type::Nsimd()),
           mpi_v));
       set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
     }
     else
     {
       std::cout << GridLogError
                 << "Error in GridFourDimModule: lattice dimension different from 4"
                 << std::endl;
       exit(1);
     }
   }

@@ -381,7 +381,7 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;

   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<GridLogMessage<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
     std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
     std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;


@@ -40,12 +40,6 @@ namespace Grid{
                                   double, StoppingCondition,
                                   int, MaxCGIterations,
                                   bool, ApplySmearing);
-
-  //template <class ReaderClass >
-  //FermionParameters(Reader<ReaderClass>& Reader){
-  //  read(Reader, "Mobius", *this);
-  //}
-
 };

@@ -113,9 +107,17 @@ int main(int argc, char **argv) {
   bool ApplySmearing = MyParams.Mobius.ApplySmearing;

-  // Grid from the command line
-  TheHMC.Resources.AddFourDimGrid("gauge");
+  // Use this if you want to tweak the default decomposition
+  // commented out as very architecture specific
+  //std::vector<int> simd_lanes({2,2,1,1});
+
+  // Grid from the command line arguments --grid and --mpi
+  // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes
+  //TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // tweak the SIMD lanes
+  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition

   // Possible to create the module by hand
   // hardcoding parameters or using a Reader