1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'develop' into feature/feynman-rules

This commit is contained in:
Antonin Portelli 2016-11-03 13:52:11 +00:00
commit 75bbf6a0af
36 changed files with 1287 additions and 853 deletions

View File

@ -20,7 +20,7 @@ License: GPL v2.
Last update Nov 2016. Last update Nov 2016.
_Please send all pull requests to the `develop` branch._ _Please do not send pull requests to the `master` branch which is reserved for releases._
### Bug report ### Bug report
@ -29,7 +29,7 @@ _To help us tracking and solving more efficiently issues with Grid, please repor
When you file an issue, please go through the following checklist: When you file an issue, please go through the following checklist:
1. Check that the code is pointing to the `HEAD` of `develop` or any commit in `master` which is tagged with a version number. 1. Check that the code is pointing to the `HEAD` of `develop` or any commit in `master` which is tagged with a version number.
2. Give a description of the target platform (CPU, network, compiler). 2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output of the `--version` option of your compiler.
3. Give the exact `configure` command used. 3. Give the exact `configure` command used.
4. Attach `config.log`. 4. Attach `config.log`.
5. Attach `config.summary`. 5. Attach `config.summary`.
@ -45,7 +45,7 @@ are provided, similar to HPF and cmfortran, and user control is given over the m
array indices to both MPI tasks and SIMD processing elements. array indices to both MPI tasks and SIMD processing elements.
* Identically shaped arrays can then be processed with perfect data parallelisation. * Identically shaped arrays can then be processed with perfect data parallelisation.
* Such identically shapped arrays are called conformable arrays. * Such identically shaped arrays are called conformable arrays.
The transformation is based on the observation that Cartesian array processing involves The transformation is based on the observation that Cartesian array processing involves
identical processing to be performed on different regions of the Cartesian array. identical processing to be performed on different regions of the Cartesian array.
@ -127,14 +127,15 @@ make -C tests/<subdir> tests
The following options can be used with the `--enable-comms=` option to target different communication interfaces: The following options can be used with the `--enable-comms=` option to target different communication interfaces:
| `<comm>` | Description | | `<comm>` | Description |
| ------------- | -------------------------------------------- | | -------------- | ------------------------------------------------------------- |
| `none` | no communications | | `none` | no communications |
| `mpi[-auto]` | MPI communications | | `mpi[-auto]` | MPI communications |
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory | | `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
| `shmem ` | Cray SHMEM communications | | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
| `shmem ` | Cray SHMEM communications |
For `mpi` and `mpi3` the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names). For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names).
### Possible SIMD types ### Possible SIMD types
@ -160,7 +161,7 @@ Alternatively, some CPU codenames can be directly used:
| `BGQ` | Blue Gene/Q | | `BGQ` | Blue Gene/Q |
#### Notes: #### Notes:
- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions. - We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid, once the AVX512 support within GCC and clang is more advanced.
- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform. - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
- BG/Q performance is currently rather poor. This is being investigated for future versions. - BG/Q performance is currently rather poor. This is being investigated for future versions.
@ -171,7 +172,7 @@ The following configuration is recommended for the Intel Knights Landing platfor
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-precision=double\
--enable-simd=KNL \ --enable-simd=KNL \
--enable-comms=mpi3-auto \ --enable-comms=mpi-auto \
--with-gmp=<path> \ --with-gmp=<path> \
--with-mpfr=<path> \ --with-mpfr=<path> \
--enable-mkl \ --enable-mkl \
@ -183,10 +184,9 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are w
``` bash ``` bash
../configure --enable-precision=double\ ../configure --enable-precision=double\
--enable-simd=KNL \ --enable-simd=KNL \
--enable-comms=mpi3 \ --enable-comms=mpi \
--with-gmp=<path> \ --with-gmp=<path> \
--with-mpfr=<path> \ --with-mpfr=<path> \
--enable-mkl \ --enable-mkl \
CXX=CC CC=cc CXX=CC CC=cc
``` ```

View File

@ -42,15 +42,14 @@ int main (int argc, char ** argv)
int Nloop=10; int Nloop=10;
int nmu=0; int nmu=0;
for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++; for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl; std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
int maxlat=16;
for(int lat=4;lat<=maxlat;lat+=2){
for(int lat=4;lat<=32;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){ for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0], std::vector<int> latt_size ({lat*mpi_layout[0],
@ -125,7 +124,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl; std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=32;lat+=2){ for(int lat=4;lat<=maxlat;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){ for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat}); std::vector<int> latt_size ({lat,lat,lat,lat});
@ -194,128 +193,83 @@ int main (int argc, char ** argv)
} }
} }
#if 0 Nloop=100;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl; std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=maxlat;lat+=2){
for(int lat=4;lat<=32;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){ for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat}); std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm; int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double start=usecond();
for(int i=0;i<Nloop;i++){
std::vector<CartesianCommunicator::CommsRequest_t> empty; std::vector<CartesianCommunicator::CommsRequest_t> requests;
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
for(int mu=0;mu<4;mu++){
ncomm=0; ncomm=0;
if (mpi_layout[mu]>1 ) { for(int mu=0;mu<4;mu++){
ncomm++;
if (mpi_layout[mu]>1 ) {
int comm_proc;
int xmit_to_rank;
int recv_from_rank;
comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromInit(requests_fwd[mu],
(void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromInit(requests_bwd[mu],
(void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
{
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int mu=0;mu<4;mu++){ ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
if (mpi_layout[mu]>1 ) { Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.StencilSendToRecvFromBegin(requests,
Grid.SendToRecvFromBegin(requests_fwd[mu]); (void *)&xbuf[mu][0],
Grid.SendToRecvFromComplete(requests_fwd[mu]); xmit_to_rank,
Grid.SendToRecvFromBegin(requests_bwd[mu]); (void *)&rbuf[mu][0],
Grid.SendToRecvFromComplete(requests_bwd[mu]); recv_from_rank,
} bytes);
}
Grid.Barrier();
}
double stop=usecond(); comm_proc = mpi_layout[mu]-1;
double dbytes = bytes; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
double xbytes = Nloop*dbytes*2.0*ncomm; Grid.StencilSendToRecvFromBegin(requests,
double rbytes = xbytes; (void *)&xbuf[mu+4][0],
double bidibytes = xbytes+rbytes; xmit_to_rank,
(void *)&rbuf[mu+4][0],
double time = stop-start; recv_from_rank,
bytes);
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
}
{
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
Grid.SendToRecvFromBegin(requests_fwd[mu]);
Grid.SendToRecvFromBegin(requests_bwd[mu]);
Grid.SendToRecvFromComplete(requests_fwd[mu]);
Grid.SendToRecvFromComplete(requests_bwd[mu]);
}
} }
Grid.Barrier();
} }
Grid.StencilSendToRecvFromComplete(requests);
double stop=usecond(); Grid.Barrier();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start;
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
} }
double stop=usecond();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start; // microseconds
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
} }
} }
#endif
Grid_finalize(); Grid_finalize();
} }

View File

@ -44,7 +44,6 @@ struct scal {
Gamma::GammaT Gamma::GammaT
}; };
bool overlapComms = false;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR; typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF; typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD; typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
@ -54,10 +53,6 @@ int main (int argc, char ** argv)
{ {
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
overlapComms = true;
}
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
@ -126,14 +121,21 @@ int main (int argc, char ** argv)
RealD NP = UGrid->_Nprocessors; RealD NP = UGrid->_Nprocessors;
for(int doasm=1;doasm<2;doasm++){
QCD::WilsonKernelsStatic::AsmOpt=doasm;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "Calling Dw"<<std::endl; std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
int ncall =100; int ncall =100;
if (1) { if (1) {
@ -162,6 +164,17 @@ int main (int argc, char ** argv)
if (1) if (1)
{ {
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR; typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
LatticeFermion ssrc(sFGrid); LatticeFermion ssrc(sFGrid);
LatticeFermion sref(sFGrid); LatticeFermion sref(sFGrid);
@ -248,6 +261,16 @@ int main (int argc, char ** argv)
sr_e = zero; sr_e = zero;
sr_o = zero; sr_o = zero;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
sDw.ZeroCounters(); sDw.ZeroCounters();
sDw.stat.init("DhopEO"); sDw.stat.init("DhopEO");
double t0=usecond(); double t0=usecond();
@ -308,7 +331,7 @@ int main (int argc, char ** argv)
ref = -0.5*ref; ref = -0.5*ref;
} }
Dw.Dhop(src,result,1); Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl; std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl; std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
@ -322,13 +345,22 @@ int main (int argc, char ** argv)
LatticeFermion r_eo (FGrid); LatticeFermion r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl; std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src); pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src); pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl; std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl; std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{ {
Dw.ZeroCounters(); Dw.ZeroCounters();
double t0=usecond(); double t0=usecond();
@ -366,8 +398,5 @@ int main (int argc, char ** argv)
assert(norm2(src_e)<1.0e-5); assert(norm2(src_e)<1.0e-5);
assert(norm2(src_o)<1.0e-5); assert(norm2(src_o)<1.0e-5);
}
Grid_finalize(); Grid_finalize();
} }

View File

@ -1,153 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Minimal aggregate wrapping a single value of the template type.
template<typename T>
struct scal
{
  T internal;  // the wrapped value
};
// Gamma-matrix identifiers for the X, Y, Z and T directions, in that order.
// main() indexes this table with the direction counter mu when it builds the
// reference result.
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
// Switched to true by the --asynch command-line option in main(), which then
// copies it into the fermion operator's params.overlapCommsCompute.
bool overlapComms = false;
// Benchmark driver for DomainWallFermionR::Dhop.
//
// Builds a random gauge field and a random 5d source (Ls=16), computes a
// reference result from explicit shifts and gamma-matrix projections, then
// times ncall applications of Dw.Dhop and prints the norms of the result,
// the reference and their difference together with the achieved rate.
//
// Command line: --asynch sets overlapComms, which is forwarded to the
// operator through params.overlapCommsCompute.
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  // Fixed seeds so that every run uses the same random fields.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);

  LatticeFermion src   (FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid);    ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  LatticeGaugeField Umu(UGrid);
  random(RNG4,Umu);

  LatticeGaugeField Umu5d(FGrid);

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }

  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  // One link field per direction. Sized with Nd (not a literal 4) so the
  // container always matches the bound of the loops that index it.
  std::vector<LatticeColourMatrix> U(Nd,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }

  if (1)
  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){

      // Forward hop; direction index is mu+1 on the 5d grid.
      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

      // Backward hop.
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }

  RealD mass=0.1;
  RealD M5  =1.8;

  typename DomainWallFermionR::ImplParams params;
  params.overlapCommsCompute = overlapComms;

  RealD NP = UGrid->_Nprocessors;

  QCD::WilsonKernelsStatic::AsmOpt=1;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);

  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall =50;
  if (1) {

    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    double t1=usecond();

    // Flop model used by this benchmark: 1344 per 5d site per call.
    // t1-t0 is printed below as microseconds, hence the mflop/s label.
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result;
    std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
    // Dw.Report();
  }

  Grid_finalize();
}

View File

@ -51,16 +51,18 @@ int main (int argc, char ** argv)
{ {
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
const int Ls=8; const int Ls=8;
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
if ( getenv("ASMOPT") ) {
QCD::WilsonKernelsStatic::AsmOpt=1;
} else {
QCD::WilsonKernelsStatic::AsmOpt=0;
}
std::cout<<GridLogMessage << "=========================================================================="<<std::endl; std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
std::cout<<GridLogMessage << "=========================================================================="<<std::endl; std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

View File

@ -58,6 +58,19 @@ int main (int argc, char ** argv)
std::vector<int> seeds({1,2,3,4}); std::vector<int> seeds({1,2,3,4});
RealD mass = 0.1; RealD mass = 0.1;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl; std::cout<<GridLogMessage << "============================================================================="<< std::endl;
std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl; std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl; std::cout<<GridLogMessage << "============================================================================="<< std::endl;

View File

@ -1,175 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_zmm.cc
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
using namespace Grid::QCD;
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
int main(int argc,char **argv)
{
Grid_init(&argc,&argv);
std::ofstream os("zmm.dat");
os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
for(int L=4;L<=32;L+=4){
for(int m=1;m<=2;m++){
for(int Ls=8;Ls<=16;Ls+=8){
std::vector<int> grid({L,L,m*L,m*L});
std::cout << GridLogMessage <<"\t";
for(int i=0;i<4;i++) {
std::cout << grid[i]<<"x";
}
std::cout << Ls<<"\t\t";
bench(os,grid,Ls);
}
}
}
}
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
LatticeFermion src (FGrid);
LatticeFermion tmp (FGrid);
LatticeFermion srce(FrbGrid);
LatticeFermion resulto(FrbGrid); resulto=zero;
LatticeFermion resulta(FrbGrid); resulta=zero;
LatticeFermion junk(FrbGrid); junk=zero;
LatticeFermion diff(FrbGrid);
LatticeGaugeField Umu(UGrid);
double mfc, mfa, mfo, mfl1;
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
random(RNG5,src);
#if 1
random(RNG4,Umu);
#else
int mmu=2;
std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
if ( mu!=mmu ) U[mu] = zero;
if ( mu==mmu ) U[mu] = 1.0;
PokeIndex<LorentzIndex>(Umu,U[mu],mu);
}
#endif
pickCheckerboard(Even,srce,src);
RealD mass=0.1;
RealD M5 =1.8;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall=50;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulto,0);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume/2;
mfc = flops*ncall/(t1-t0);
std::cout<<mfc<<"\t\t";
QCD::WilsonKernelsStatic::AsmOpt=1;
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulta,0);
}
t1=usecond();
mfa = flops*ncall/(t1-t0);
std::cout<<mfa<<"\t\t";
/*
int dag=DaggerNo;
t0=usecond();
for(int i=0;i<1;i++){
Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
}
t1=usecond();
mfo = flops*100/(t1-t0);
std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s = "<< mfo<<std::endl;
t0=usecond();
for(int i=0;i<1;i++){
Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
}
t1=usecond();
mfl1= flops*100/(t1-t0);
std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s = "<< mfl1<<std::endl;
os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
<< mfc<<" "
<< mfa<<" "
<< mfo<<" "
<< mfl1<<std::endl;
*/
#if 0
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
Dw.DhopOE(srce,resulta,0);
PerformanceCounter Counter(i);
Counter.Start();
Dw.DhopOE(srce,resulta,0);
Counter.Stop();
Counter.Report();
}
#endif
//resulta = (-0.5) * resulta;
diff = resulto-resulta;
std::cout<<norm2(diff)<<std::endl;
return 0;
}

View File

@ -1,18 +1,12 @@
#!/usr/bin/env bash #!/usr/bin/env bash
EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2' EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
echo "-- deploying Eigen source..." echo "-- deploying Eigen source..."
wget ${EIGEN_URL} --no-check-certificate wget ${EIGEN_URL} --no-check-certificate
./scripts/update_eigen.sh `basename ${EIGEN_URL}` ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
rm `basename ${EIGEN_URL}` rm `basename ${EIGEN_URL}`
echo "-- copying fftw prototypes..."
wget ${FFTW_URL}
./scripts/update_fftw.sh `basename ${FFTW_URL}`
rm `basename ${FFTW_URL}`
echo '-- generating Make.inc files...' echo '-- generating Make.inc files...'
./scripts/filelist ./scripts/filelist
echo '-- generating configure script...' echo '-- generating configure script...'

View File

@ -253,15 +253,23 @@ AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi
case ${ac_COMMS} in case ${ac_COMMS} in
none) none)
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
comms_type='none'
;; ;;
mpi|mpi-auto) mpi3l*)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
comms_type='mpi3l'
;; ;;
mpi3|mpi3-auto) mpi3*)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
comms_type='mpi3'
;;
mpi*)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
comms_type='mpi'
;; ;;
shmem) shmem)
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] ) AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
comms_type='shmem'
;; ;;
*) *)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
@ -279,12 +287,11 @@ case ${ac_COMMS} in
;; ;;
esac esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ]) AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI, AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ]) AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
AM_CONDITIONAL(BUILD_COMMS_MPI3, AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
[ test "X${ac_COMMS}X" == "Xmpi3X" || test "X${ac_COMMS}X" == "Xmpi3-autoX" ]) AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
############### RNG selection ############### RNG selection
AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\ AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
@ -377,7 +384,7 @@ compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS ----------------------------------- ----- BUILD OPTIONS -----------------------------------
SIMD : ${ac_SIMD} SIMD : ${ac_SIMD}
Threading : ${ac_openmp} Threading : ${ac_openmp}
Communications type : ${ac_COMMS} Communications type : ${comms_type}
Default precision : ${ac_PRECISION} Default precision : ${ac_PRECISION}
RNG choice : ${ac_RNG} RNG choice : ${ac_RNG}
GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi` GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`

View File

@ -42,6 +42,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h> #include <Grid/cshift/Cshift_mpi.h>
#endif #endif
#ifdef GRID_COMMS_MPI3L
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif #endif

View File

@ -147,6 +147,13 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
return; return;
} }
// Parse a single integer command line payload.
// str : text to parse (taken by non-const reference but not modified)
// val : receives the integer extracted from the start of str
void GridCmdOptionInt(std::string &str,int & val)
{
  std::istringstream payload(str);
  payload >> val;
}
void GridParseLayout(char **argv,int argc, void GridParseLayout(char **argv,int argc,
std::vector<int> &latt, std::vector<int> &latt,
@ -177,14 +184,12 @@ void GridParseLayout(char **argv,int argc,
assert(ompthreads.size()==1); assert(ompthreads.size()==1);
GridThread::SetThreads(ompthreads[0]); GridThread::SetThreads(ompthreads[0]);
} }
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
std::vector<int> cores(0); int cores;
arg= GridCmdOptionPayload(argv,argv+argc,"--cores"); arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
GridCmdOptionIntVector(arg,cores); GridCmdOptionInt(arg,cores);
GridThread::SetCores(cores[0]); GridThread::SetCores(cores);
} }
} }
std::string GridCmdVectorIntToString(const std::vector<int> & vec){ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
@ -193,7 +198,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
return oss.str(); return oss.str();
} }
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// // Reinit guard
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
static int Grid_is_initialised = 0; static int Grid_is_initialised = 0;
@ -202,27 +207,31 @@ void Grid_init(int *argc,char ***argv)
{ {
GridLogger::StopWatch.Start(); GridLogger::StopWatch.Start();
std::string arg;
////////////////////////////////////
// Shared memory block size
////////////////////////////////////
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
int MB;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
GridCmdOptionInt(arg,MB);
CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
}
CartesianCommunicator::Init(argc,argv); CartesianCommunicator::Init(argc,argv);
// Parse command line args. ////////////////////////////////////
// Logging
////////////////////////////////////
std::string arg;
std::vector<std::string> logstreams; std::vector<std::string> logstreams;
std::string defaultLog("Error,Warning,Message,Performance"); std::string defaultLog("Error,Warning,Message,Performance");
GridCmdOptionCSL(defaultLog,logstreams); GridCmdOptionCSL(defaultLog,logstreams);
GridLogConfigure(logstreams); GridLogConfigure(logstreams);
if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){ if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
std::cout<<GridLogMessage<<"--help : this message"<<std::endl; Grid_quiesce_nodes();
std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
std::cout<<GridLogMessage<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
exit(EXIT_SUCCESS);
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
@ -231,38 +240,39 @@ void Grid_init(int *argc,char ***argv)
GridLogConfigure(logstreams); GridLogConfigure(logstreams);
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ ////////////////////////////////////
Grid_debug_handler_init(); // Help message
} ////////////////////////////////////
if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
Grid_quiesce_nodes(); if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
} std::cout<<GridLogMessage<<" --help : this message"<<std::endl;
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){ std::cout<<GridLogMessage<<std::endl;
QCD::WilsonKernelsStatic::HandOpt=1; std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
} std::cout<<GridLogMessage<<" --mpi n.n.n.n : default MPI decomposition"<<std::endl;
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
LebesgueOrder::UseLebesgueOrder=1; std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
} std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ std::cout<<GridLogMessage<<std::endl;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
GridCmdOptionIntVector(arg,LebesgueOrder::Block); std::cout<<GridLogMessage<<" --log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
} std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){ std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
GridLogTimestamp(1); std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
exit(EXIT_SUCCESS);
} }
GridParseLayout(*argv,*argc, ////////////////////////////////////
Grid_default_latt, // Banner
Grid_default_mpi); ////////////////////////////////////
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Decomposition\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
}
std::string COL_RED = GridLogColours.colour["RED"]; std::string COL_RED = GridLogColours.colour["RED"];
std::string COL_PURPLE = GridLogColours.colour["PURPLE"]; std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
@ -271,7 +281,6 @@ void Grid_init(int *argc,char ***argv)
std::string COL_BLUE = GridLogColours.colour["BLUE"]; std::string COL_BLUE = GridLogColours.colour["BLUE"];
std::string COL_YELLOW = GridLogColours.colour["YELLOW"]; std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"]; std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
std::cout <<std::endl; std::cout <<std::endl;
std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl; std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
@ -305,6 +314,55 @@ void Grid_init(int *argc,char ***argv)
std::cout << COL_BACKGROUND <<std::endl; std::cout << COL_BACKGROUND <<std::endl;
std::cout << std::endl; std::cout << std::endl;
////////////////////////////////////
// Debug and performance options
////////////////////////////////////
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init();
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
GridLogTimestamp(0);
} else {
GridLogTimestamp(1);
}
GridParseLayout(*argv,*argc,
Grid_default_latt,
Grid_default_mpi);
std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Decomposition\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
}
Grid_is_initialised = 1; Grid_is_initialised = 1;
} }

View File

@ -9,6 +9,11 @@ if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_base.cc extra_sources+=communicator/Communicator_base.cc
endif endif
if BUILD_COMMS_MPI3L
extra_sources+=communicator/Communicator_mpi3_leader.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_SHMEM if BUILD_COMMS_SHMEM
extra_sources+=communicator/Communicator_shmem.cc extra_sources+=communicator/Communicator_shmem.cc
extra_sources+=communicator/Communicator_base.cc extra_sources+=communicator/Communicator_base.cc

View File

@ -31,14 +31,8 @@ namespace Grid {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void * CartesianCommunicator::ShmCommBuf; void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
///////////////////////////////// /////////////////////////////////
// Alloc, free shmem region // Alloc, free shmem region
@ -48,7 +42,12 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
void *ptr = (void *)heap_top; void *ptr = (void *)heap_top;
heap_top += bytes; heap_top += bytes;
heap_bytes+= bytes; heap_bytes+= bytes;
assert(heap_bytes < MAX_MPI_SHM_BYTES); if (heap_bytes >= MAX_MPI_SHM_BYTES) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
assert(heap_bytes<MAX_MPI_SHM_BYTES);
}
return ptr; return ptr;
} }
void CartesianCommunicator::ShmBufferFreeAll(void) { void CartesianCommunicator::ShmBufferFreeAll(void) {
@ -69,12 +68,6 @@ int CartesianCommunicator::ProcessorCount(void) { return
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){ return WorldRank; };
int CartesianCommunicator::Ranks (void) { return WorldSize; };
int CartesianCommunicator::Nodes (void) { return GroupSize; };
int CartesianCommunicator::Cores (void) { return ShmSize; };
int CartesianCommunicator::NodeRank (void) { return GroupRank; };
int CartesianCommunicator::CoreRank (void) { return ShmRank; };
void CartesianCommunicator::GlobalSum(ComplexF &c) void CartesianCommunicator::GlobalSum(ComplexF &c)
{ {
@ -93,7 +86,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);
} }
#ifndef GRID_COMMS_MPI3 #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,

View File

@ -1,3 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -37,6 +38,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI3 #ifdef GRID_COMMS_MPI3
#include <mpi.h> #include <mpi.h>
#endif #endif
#ifdef GRID_COMMS_MPI3L
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h> #include <mpp/shmem.h>
#endif #endif
@ -51,7 +55,7 @@ class CartesianCommunicator {
// Give external control (command line override?) of this // Give external control (command line override?) of this
static const int MAXLOG2RANKSPERNODE = 16; static const int MAXLOG2RANKSPERNODE = 16;
static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024; static uint64_t MAX_MPI_SHM_BYTES;
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
@ -60,9 +64,9 @@ class CartesianCommunicator {
std::vector<int> _processor_coor; // linear processor coordinate std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension; unsigned long _ndimension;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
MPI_Comm communicator;
static MPI_Comm communicator_world; static MPI_Comm communicator_world;
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
typedef int CommsRequest_t; typedef int CommsRequest_t;
@ -75,7 +79,15 @@ class CartesianCommunicator {
// cartesian communicator on a subset of ranks, slave ranks controlled // cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory // by group leader with data xfer via shared memory
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3 #ifdef GRID_COMMS_MPI3
static int ShmRank;
static int ShmSize;
static int GroupRank;
static int GroupSize;
static int WorldRank;
static int WorldSize;
std::vector<int> WorldDims; std::vector<int> WorldDims;
std::vector<int> GroupDims; std::vector<int> GroupDims;
std::vector<int> ShmDims; std::vector<int> ShmDims;
@ -83,7 +95,7 @@ class CartesianCommunicator {
std::vector<int> GroupCoor; std::vector<int> GroupCoor;
std::vector<int> ShmCoor; std::vector<int> ShmCoor;
std::vector<int> WorldCoor; std::vector<int> WorldCoor;
static std::vector<int> GroupRanks; static std::vector<int> GroupRanks;
static std::vector<int> MyGroup; static std::vector<int> MyGroup;
static int ShmSetup; static int ShmSetup;
@ -93,13 +105,20 @@ class CartesianCommunicator {
std::vector<int> LexicographicToWorldRank; std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs; static std::vector<void *> ShmCommBufs;
#else #else
static void ShmInitGeneric(void); static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector; static commVector<uint8_t> ShmBufStorageVector;
#endif #endif
/////////////////////////////////
// Grid information and queries
// Implemented in Communicator_base.cc
/////////////////////////////////
static void * ShmCommBuf; static void * ShmCommBuf;
size_t heap_top; size_t heap_top;
size_t heap_bytes; size_t heap_bytes;
void *ShmBufferSelf(void); void *ShmBufferSelf(void);
void *ShmBuffer(int rank); void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p); void *ShmBufferTranslate(int rank,void * local_p);
@ -123,28 +142,12 @@ class CartesianCommunicator {
int RankFromProcessorCoor(std::vector<int> &coor); int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor); void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
/////////////////////////////////
// Grid information and queries
/////////////////////////////////
static int ShmRank;
static int ShmSize;
static int GroupSize;
static int GroupRank;
static int WorldRank;
static int WorldSize;
static int Slave;
int IsBoss(void) ; int IsBoss(void) ;
int BossRank(void) ; int BossRank(void) ;
int ThisRank(void) ; int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ; const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ; const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ; int ProcessorCount(void) ;
static int Ranks (void);
static int Nodes (void);
static int Cores (void);
static int NodeRank (void);
static int CoreRank (void);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid

View File

@ -44,13 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
MPI_Init(argc,argv); MPI_Init(argc,argv);
} }
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric(); ShmInitGeneric();
} }
@ -198,6 +191,11 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
// Should only be used prior to Grid Init finished. // Should only be used prior to Grid Init finished.
// Check for this? // Check for this?
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// Rank of the calling process within communicator_world, queried from MPI on
// every call rather than read from a cached value.
int CartesianCommunicator::RankWorld(void){
  int world_rank;
  MPI_Comm_rank(communicator_world,&world_rank);
  return world_rank;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{ {
int ierr= MPI_Bcast(data, int ierr= MPI_Bcast(data,

View File

@ -30,12 +30,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0; int CartesianCommunicator::ShmSetup = 0;
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
MPI_Comm CartesianCommunicator::communicator_world; MPI_Comm CartesianCommunicator::communicator_world;
MPI_Comm CartesianCommunicator::ShmComm; MPI_Comm CartesianCommunicator::ShmComm;
MPI_Win CartesianCommunicator::ShmWindow; MPI_Win CartesianCommunicator::ShmWindow;
@ -97,15 +103,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
std::vector<int> world_ranks(WorldSize); std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize); GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r; for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]); MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader // Identify who is in my group and nominate the leader
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
int g=0; int g=0;
MyGroup.resize(ShmSize);
for(int rank=0;rank<WorldSize;rank++){ for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){ if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize); assert(g<ShmSize);

View File

@ -0,0 +1,870 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi3_leader.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
/// darwin dispatch semaphores don't seem to be multiprocess.
///
/// ii) openmpi under --mca shmem posix works with two squadrons per node;
/// openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
/// memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <semaphore.h>
// Handle type for the named POSIX semaphores used below.
typedef sem_t *Grid_semaphore;

// Semaphore helper macros.
//
// The sem_* calls are made outside assert() on purpose: assert() expands to
// nothing when NDEBUG is defined, so a call placed inside it would silently
// disappear from optimised builds. Only the status check is debug-only.
//
// Each macro is a single braced block so it behaves as one statement; the
// semicolon written after a use is a harmless empty statement.
//
// SEM_INIT and SEM_INIT_EXCL read the semaphore name from a variable called
// sem_name that must be in scope where the macro is used.
//   SEM_INIT      : open an already existing semaphore
//   SEM_INIT_EXCL : remove any stale semaphore of that name, then create it
//                   exclusively with initial value 0
#define SEM_INIT(S)      { S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED ); }
#define SEM_INIT_EXCL(S) { sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED ); }
#define SEM_POST(S)      { int grid_sem_status_ = sem_post(S); assert ( grid_sem_status_ == 0 ); (void) grid_sem_status_; }
#define SEM_WAIT(S)      { int grid_sem_status_ = sem_wait(S); assert ( grid_sem_status_ == 0 ); (void) grid_sem_status_; }
#include <sys/mman.h>
namespace Grid {
// Command codes passed to Slave::QueueCommand; presumably the value stored in
// Descriptor::command — TODO confirm against QueueCommand/Event.
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };
// One queued communication request. SlaveState holds a fixed array of these.
// NOTE(review): field meanings below are inferred from names and from the
// QueueCommand signature (buf, bytes, hashtag, comm, u_rank) — confirm against
// the implementation before relying on them.
struct Descriptor {
// Buffer location as an integer; Slave keeps a uint64_t base, so this is
// presumably an offset relative to that base rather than a raw pointer.
uint64_t buf;
// Size of the transfer in bytes.
size_t bytes;
// Peer rank for the transfer.
int rank;
// Message tag.
int tag;
// One of the COMMAND_* codes above.
int command;
// MPI request handle associated with this descriptor.
MPI_Request request;
};
// Number of descriptor slots in each SlaveState.
const int pool = 48;
// Descriptor queue state for one Slave (Slave::state points at one of these).
// Slave::WaitAll asserts tail == head once the queue has been drained, so
// head and tail are the two ends of the FIFO over Descrs.
// NOTE(review): the role of start is not visible in this part of the file.
// NOTE(review): volatile is the only access qualifier on these fields here; any
// ordering guarantees presumably come from the semaphores — confirm.
class SlaveState {
public:
volatile int head;
volatile int start;
volatile int tail;
volatile Descriptor Descrs[pool];
};
// One data-mover endpoint: a descriptor queue (state) plus the pair of named
// semaphores used to hand work over and to signal completion.
//   sem_head : posted by WakeUpDMA,     waited on by WaitForCommand
//   sem_tail : posted by WakeUpCompute, waited on by WaitForComplete
class Slave {
public:
  Grid_semaphore sem_head;
  Grid_semaphore sem_tail;
  SlaveState *state;
  MPI_Comm squadron;
  uint64_t base;
  int universe_rank;
  int vertical_rank;
  char sem_name [NAME_MAX];   // scratch buffer read by the SEM_INIT* macros

  ////////////////////////////////////////////////////////////
  // Descriptor circular pointers
  ////////////////////////////////////////////////////////////
  Slave() {}

  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);

  // Open both semaphores, which must already exist; names are keyed on universe_rank.
  void SemInit(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    SEM_INIT(sem_head);

    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    SEM_INIT(sem_tail);
  }

  // Create both semaphores exclusively, unlinking any earlier one of the same name.
  void SemInitExcl(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    SEM_INIT_EXCL(sem_head);

    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    SEM_INIT_EXCL(sem_tail);
  }

  // Thin wrappers giving the semaphore operations protocol-level names.
  void WakeUpDMA(void)       { SEM_POST(sem_head); }
  void WakeUpCompute(void)   { SEM_POST(sem_tail); }
  void WaitForCommand(void)  { SEM_WAIT(sem_head); }
  void WaitForComplete(void) { SEM_WAIT(sem_tail); }

  // Never returns: block until a command is signalled, process it, repeat.
  void EventLoop (void) {
    for(;;){
      WaitForCommand();
      Event();
    }
  }

  int Event (void) ;

  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;

  // Queue a WAITALL command, wake the data mover, block until it reports
  // completion, then check that the descriptor FIFO has been fully consumed.
  void WaitAll() {
    QueueCommand(COMMAND_WAITALL,nullptr,0,0,squadron,0);
    WakeUpDMA();
    WaitForComplete();
    assert ( state->tail == state->head );
  }
};
////////////////////////////////////////////////////////////////////////
// One instance of a data mover.
// Master and Slave must agree on location in shared memory
////////////////////////////////////////////////////////////////////////
class MPIoffloadEngine {
public:
static std::vector<Slave> Slaves;
static int ShmSetup;
static int UniverseRank;
static int UniverseSize;
static MPI_Comm communicator_universe;
static MPI_Comm communicator_cached;
static MPI_Comm HorizontalComm;
static int HorizontalRank;
static int HorizontalSize;
static MPI_Comm VerticalComm;
static MPI_Win VerticalWindow;
static int VerticalSize;
static int VerticalRank;
static std::vector<void *> VerticalShmBufs;
static std::vector<std::vector<int> > UniverseRanks;
static std::vector<int> UserCommunicatorToWorldRanks;
static MPI_Group WorldGroup, CachedGroup;
static void CommunicatorInit (MPI_Comm &communicator_world,
MPI_Comm &ShmComm,
void * &ShmCommBuf);
static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
/////////////////////////////////////////////////////////
// routines for master proc must handle any communicator
/////////////////////////////////////////////////////////
static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
// std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
// std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
Slaves[slave].WakeUpDMA();
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
static void WaitAll() {
for(int s=1;s<VerticalSize;s++) {
// std::cout << "Waiting for slave "<< s<<std::endl;
Slaves[s].WaitAll();
}
// std::cout << " Wait all Complete "<<std::endl;
};
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);
}
return;
};
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
procs = VerticalSize-1;
for(int s=0;s<procs;s++) {
GetWork(bytes,s,mywork,myoff,procs);
QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
}
};
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
// Storage for the static members of MPIoffloadEngine.
// All are filled in by MPIoffloadEngine::CommunicatorInit except the communicator cache
// (communicator_cached, CachedGroup, UserCommunicatorToWorldRanks), which is maintained by
// MPIoffloadEngine::MapCommRankToWorldRank.
std::vector<Slave> MPIoffloadEngine::Slaves;
int MPIoffloadEngine::UniverseRank;
int MPIoffloadEngine::UniverseSize;
MPI_Comm MPIoffloadEngine::communicator_universe;
MPI_Comm MPIoffloadEngine::communicator_cached;
MPI_Group MPIoffloadEngine::WorldGroup;
MPI_Group MPIoffloadEngine::CachedGroup;
MPI_Comm MPIoffloadEngine::HorizontalComm;
int MPIoffloadEngine::HorizontalRank;
int MPIoffloadEngine::HorizontalSize;
MPI_Comm MPIoffloadEngine::VerticalComm;
int MPIoffloadEngine::VerticalSize;
int MPIoffloadEngine::VerticalRank;
MPI_Win MPIoffloadEngine::VerticalWindow;
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
// Guard checked (and then set) by CommunicatorInit so that it only runs once.
int MPIoffloadEngine::ShmSetup = 0;
///////////////////////////////////////////////////////////////////////////////////////////////////
// One-off set up of the offload engine (asserts if called twice).
//
//  communicator_world : out, set to the horizontal communicator
//  ShmComm            : out, set to the vertical (shared memory) communicator
//  ShmCommBuf         : out, set to the shared memory buffer of vertical rank 0
//
// Vertical rank 0 returns to the caller. Every other vertical rank becomes a data mover: it
// enters Slave::EventLoop() and never returns from this function.
///////////////////////////////////////////////////////////////////////////////////////////////////
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
                                         MPI_Comm &ShmComm,
                                         void * &ShmCommBuf)
{
  assert(ShmSetup==0);

  //////////////////////////////////////////////////////////////////////
  // Universe is all nodes prior to squadron grouping
  //////////////////////////////////////////////////////////////////////
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
  MPI_Comm_rank(communicator_universe,&UniverseRank);
  MPI_Comm_size(communicator_universe,&UniverseSize);

  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory (Verticals)
  /////////////////////////////////////////////////////////////////////
#undef MPI_SHARED_MEM_DEBUG
#ifdef MPI_SHARED_MEM_DEBUG
  MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
#else
  MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
#endif
  MPI_Comm_rank(VerticalComm ,&VerticalRank);
  MPI_Comm_size(VerticalComm ,&VerticalSize);

  //////////////////////////////////////////////////////////////////////
  // Split into horizontal groups by rank in squadron
  //////////////////////////////////////////////////////////////////////
  MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
  MPI_Comm_rank(HorizontalComm,&HorizontalRank);
  MPI_Comm_size(HorizontalComm,&HorizontalSize);
  assert(HorizontalSize*VerticalSize==UniverseSize);

  ////////////////////////////////////////////////////////////////////////////////
  // What is my place in the world:
  // only vertical rank 0 contributes, so the sum hands its horizontal rank to the whole group
  ////////////////////////////////////////////////////////////////////////////////
  int WorldRank=0;
  if(VerticalRank==0) WorldRank = HorizontalRank;
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
  assert(ierr==0);

  ////////////////////////////////////////////////////////////////////////////////
  // Where is the world in the universe?
  // Each rank fills in its own entry of a zeroed table; summing over the universe completes it.
  ////////////////////////////////////////////////////////////////////////////////
  UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
  UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
  for(int w=0;w<HorizontalSize;w++){
    ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
    assert(ierr==0);
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared window for our group, pass back Shm info to CartesianCommunicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  VerticalShmBufs.resize(VerticalSize);

#undef MPI_SHARED_MEM
#ifdef MPI_SHARED_MEM
  ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
  ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
  assert(ierr==0);
  for(int r=0;r<VerticalSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
  }
#else
  char shm_name [NAME_MAX];
  MPI_Barrier(VerticalComm);

  // Vertical rank 0 creates one POSIX shm segment per vertical rank: the large comms buffer
  // for r==0 and a SlaveState FIFO for each data mover.
  if ( VerticalRank == 0 ) {
    for(int r=0;r<VerticalSize;r++){

      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
      if ( r>0 ) size = sizeof(SlaveState);

      snprintf(shm_name,sizeof(shm_name),"/Grid_mpi3_shm_%d_%d",WorldRank,r);

      shm_unlink(shm_name);  // discard anything left over from an earlier run

      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
      if ( fd < 0 ) {
        perror("failed shm_open");
        assert(0);
      }

      // A segment that was not sized would fault on first touch; fail here instead.
      if ( ftruncate(fd, size) != 0 ) {
        perror("failed ftruncate");
        assert(0);
      }

      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( VerticalShmBufs[r] == MAP_FAILED ) {
        perror("failed mmap");
        assert(0);
      }
      ::close(fd);  // the mapping stays valid after the descriptor is closed

      // Stamp the segment so the other ranks can check they attached to the right one
      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
      check[0] = WorldRank;
      check[1] = r;
    }
  }

  MPI_Barrier(VerticalComm);

  // All other vertical ranks attach to the segments created above and verify the stamp.
  if ( VerticalRank != 0 ) {
    for(int r=0;r<VerticalSize;r++){

      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
      if ( r>0 ) size = sizeof(SlaveState);

      snprintf(shm_name,sizeof(shm_name),"/Grid_mpi3_shm_%d_%d",WorldRank,r);

      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
      if ( fd<0 ) {
        perror("failed shm_open");
        assert(0);
      }

      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if ( VerticalShmBufs[r] == MAP_FAILED ) {  // must be tested before the stamp is read
        perror("failed mmap");
        assert(0);
      }
      ::close(fd);  // the mapping stays valid after the descriptor is closed

      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
      assert(check[0]== WorldRank);
      assert(check[1]== r);
      std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
    }
  }
#endif
  MPI_Barrier(VerticalComm);

  //////////////////////////////////////////////////////////////////////
  // The caller's world is the horizontal communicator of this vertical plane;
  // its shared memory communicator and buffer are the vertical ones.
  //////////////////////////////////////////////////////////////////////
  communicator_world = HorizontalComm;
  ShmComm = VerticalComm;
  ShmCommBuf = VerticalShmBufs[0];
  MPI_Comm_group (communicator_world, &WorldGroup);

  ///////////////////////////////////////////////////////////
  // Start the slave data movers.
  // The two barriers on each side pair up: data movers create their semaphores before the
  // first barrier, the compute process opens them between the first and the second.
  ///////////////////////////////////////////////////////////
  if ( VerticalRank != 0 ) {
    Slave indentured;
    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
    indentured.SemInitExcl();// init semaphore in shared memory
    MPI_Barrier(VerticalComm);
    MPI_Barrier(VerticalComm);
    indentured.EventLoop();
    assert(0);  // EventLoop does not return
  } else {
    Slaves.resize(VerticalSize);
    for(int i=1;i<VerticalSize;i++){
      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
    }
    MPI_Barrier(VerticalComm);
    for(int i=1;i<VerticalSize;i++){
      Slaves[i].SemInit();// init semaphore in shared memory
    }
    MPI_Barrier(VerticalComm);
  }

  ///////////////////////////////////////////////////////////
  // Verbose for now
  ///////////////////////////////////////////////////////////
  ShmSetup=1;

  if (UniverseRank == 0){

    std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
    std::cout<<UniverseSize << " Ranks " ;
    std::cout<<HorizontalSize << " Nodes " ;
    std::cout<<VerticalSize << " with ranks-per-node "<<std::endl;

    std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
    std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;

    for(int g=0;g<HorizontalSize;g++){
      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
    }

    for(int g=0;g<HorizontalSize;g++){
      std::cout<<GridLogMessage<<" { ";
      for(int s=0;s<VerticalSize;s++){
        std::cout<< UniverseRanks[g][s];
        if ( s<VerticalSize-1 ) {
          std::cout<<",";
        }
      }
      std::cout<<" } "<<std::endl;
    }
  }
}
///////////////////////////////////////////////////////////////////////////////////////////////
// Map the communicator into communicator_world, and find the neighbour.
// Cache the mappings; cache size is 1.
///////////////////////////////////////////////////////////////////////////////////////////////
// Translate `rank` on communicator `comm` into the matching rank on the world (horizontal)
// communicator, returned in comm_world_peer. `hashtag` receives the MPI tag to use, which is
// currently `tag` unchanged; tag must fit in 16 bits.
// The rank table of the most recently seen non-horizontal communicator is cached (one entry).
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {

  if ( comm == HorizontalComm ) {

    // Already a world rank
    comm_world_peer = rank;

  } else {

    if ( comm != communicator_cached ) {
      // Cache miss: rebuild the comm -> world rank table for this communicator.
      int size;
      MPI_Comm_size(comm,&size);

      UserCommunicatorToWorldRanks.resize(size);

      std::vector<int> cached_ranks(size);
      for(int r=0;r<size;r++) {
        cached_ranks[r]=r;
      }

      communicator_cached=comm;

      // NOTE(review): the group handle from the previous miss is overwritten without
      // MPI_Group_free; left as is because CachedGroup is a public static that may be read elsewhere.
      MPI_Comm_group(communicator_cached, &CachedGroup);
      MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);
    }

    // One checked lookup shared by the hit and the miss paths
    assert( (rank >= 0) && (rank < (int)UserCommunicatorToWorldRanks.size()) );
    comm_world_peer = UserCommunicatorToWorldRanks[rank];
    assert(comm_world_peer != MPI_UNDEFINED);
  }

  assert( (tag & (~0xFFFFL)) ==0);

  // Folding a hash of the communicator into the upper tag bits is disabled:
  //   hashtag = (comm_hash<<15) | tag;
  hashtag = tag;
}
// Bind this handle to its descriptor FIFO and reset the FIFO to empty.
//  _state         : SlaveState to drive (dereferenced here, so it must be a valid mapping)
//  _squadron      : communicator stored in `squadron`
//  _universe_rank : stored; later used to build the semaphore names
//  _vertical_rank : stored; later used to index MPIoffloadEngine::UniverseRanks
// `base` is taken from MPIoffloadEngine::VerticalShmBufs[0], which must already be mapped.
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
{
  squadron      = _squadron;
  universe_rank = _universe_rank;
  vertical_rank = _vertical_rank;
  state         = _state;

  // Separate stores: a chained assignment would read each volatile field back
  state->start = 0;
  state->tail  = 0;
  state->head  = 0;

  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
}
#define PERI_PLUS(A) ( (A+1)%pool )
// Data mover side: service at most one queued descriptor.
// Returns 1 if a command was processed, 0 if the FIFO had nothing to issue.
//  COMMAND_ISEND / COMMAND_IRECV : start the non-blocking transfer and advance `start`.
//  COMMAND_WAITALL               : wait for every request in [tail,start), retire them all
//                                  (tail = start = slot after this command) and wake the compute process.
int Slave::Event (void) {

  int ierr = 0;

  ////////////////////////////////////////////////////
  // Try to advance the start pointers
  ////////////////////////////////////////////////////
  int s=state->start;
  if ( s == state->head ) {
    return 0;
  }

  switch ( state->Descrs[s].command ) {

  case COMMAND_ISEND:
    // buf is stored relative to base; the request handle lives in the shared descriptor
    ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
                     state->Descrs[s].bytes,
                     MPI_CHAR,
                     state->Descrs[s].rank,
                     state->Descrs[s].tag,
                     MPIoffloadEngine::communicator_universe,
                     (MPI_Request *)&state->Descrs[s].request);
    assert(ierr==0);
    state->start = PERI_PLUS(s);
    return 1;

  case COMMAND_IRECV:
    ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
                   state->Descrs[s].bytes,
                   MPI_CHAR,
                   state->Descrs[s].rank,
                   state->Descrs[s].tag,
                   MPIoffloadEngine::communicator_universe,
                   (MPI_Request *)&state->Descrs[s].request);
    assert(ierr==0);
    state->start = PERI_PLUS(s);
    return 1;

  case COMMAND_WAITALL:
    for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
      ierr = MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
      assert(ierr==0);
    }
    s=PERI_PLUS(s);
    state->start = s;
    state->tail = s;
    WakeUpCompute();
    return 1;

  default:
    assert(0);
    break;
  }
  (void)ierr;  // only inspected by assert()
  return 0;
}
//////////////////////////////////////////////////////////////////////////////
// External interaction with the queue
//////////////////////////////////////////////////////////////////////////////
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
{
/////////////////////////////////////////
// Spin; if FIFO is full until not full
/////////////////////////////////////////
int head =state->head;
int next = PERI_PLUS(head);
// Set up descriptor
int worldrank;
int hashtag;
MPI_Comm communicator;
MPI_Request request;
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
uint64_t relative= (uint64_t)buf - base;
state->Descrs[head].buf = relative;
state->Descrs[head].bytes = bytes;
state->Descrs[head].rank = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].tag = hashtag;
state->Descrs[head].command= command;
/*
if ( command == COMMAND_ISEND ) {
std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank
<< " to worldrank " << worldrank <<std::endl;
std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
}
if ( command == COMMAND_IRECV ) {
std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank
<< " from worldrank " << worldrank <<std::endl;
std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
}
*/
// Block until FIFO has space
while( state->tail==next );
// Msync on weak order architectures
// Advance pointer
state->head = next;
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
communicator_world = MPI_COMM_WORLD;
MPI_Comm ShmComm;
MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
// Build a Cartesian communicator, periodic in every dimension, with processors[d] processes
// along dimension d. The product of the entries must equal the size of communicator_world.
// MPI return codes are checked in the same way as in the other methods of this class.
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);

  _Nprocessors=1;
  _processors = processors;

  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  int Size;
  int ierr = MPI_Comm_size(communicator_world,&Size);
  assert(ierr==0);
  assert(Size==_Nprocessors);

  _processor_coor.resize(_ndimension);

  // reorder=1: MPI may renumber ranks, so rank and coordinates are read back from the new communicator
  ierr = MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  assert(ierr==0);
  ierr = MPI_Comm_rank (communicator,&_processor);
  assert(ierr==0);
  ierr = MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
// Blocking exchange: start the send to `dest` and the receive from `from`, then wait for both.
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
                                           int bytes)
{
  std::vector<CommsRequest_t> pending;
  SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(pending);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
}
// Hand a send/receive pair to the data movers. Both buffers must lie wholly inside the shared
// memory region [ShmCommBuf, ShmCommBuf+MAX_MPI_SHM_BYTES), and neither peer may be this rank.
// `list` is not used here; completion is through StencilSendToRecvFromComplete().
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                       void *xmit,
                                                       int dest,
                                                       void *recv,
                                                       int from,
                                                       int bytes)
{
  const uint64_t shm_lo   = (uint64_t) ShmCommBuf;
  const uint64_t xmit_lo  = (uint64_t) xmit;
  const uint64_t recv_lo  = (uint64_t) recv;

  // assert xmit and recv lie in shared memory region
  assert( (xmit_lo >= shm_lo) && (xmit_lo+bytes <= shm_lo+MAX_MPI_SHM_BYTES) );
  assert( (recv_lo >= shm_lo) && (recv_lo+bytes <= shm_lo+MAX_MPI_SHM_BYTES) );

  assert(from!=_processor);
  assert(dest!=_processor);

  // Sends are tagged with our rank, receives with the sender's rank
  MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
  MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
// Wait for every data mover to finish what has been queued; `list` is not used.
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) {
  MPIoffloadEngine::WaitAll();
}
// Intentionally a no-op in this communicator implementation.
void CartesianCommunicator::StencilBarrier(void) { }
// Wait for every request in `list` to complete. An empty list returns immediately.
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  int nreq=list.size();
  if ( nreq == 0 ) {
    // Nothing to wait for, and &list[0] / &status[0] would be undefined on empty vectors
    return;
  }
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
// This process's shared memory communication buffer.
void *CartesianCommunicator::ShmBufferSelf(void)
{
  return ShmCommBuf;
}
// Always returns a null pointer in this implementation, whatever `rank` is.
void *CartesianCommunicator::ShmBuffer(int rank)
{
  return nullptr;
}
// Always returns a null pointer in this implementation; neither argument is used.
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
  return nullptr;
}
};

View File

@ -34,13 +34,6 @@ namespace Grid {
void CartesianCommunicator::Init(int *argc, char *** arv) void CartesianCommunicator::Init(int *argc, char *** arv)
{ {
WorldRank = 0;
WorldSize = 1;
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric(); ShmInitGeneric();
} }
@ -99,6 +92,7 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(0); assert(0);
} }
int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }

View File

@ -50,11 +50,16 @@ typedef struct HandShake_t {
uint64_t seq_remote; uint64_t seq_remote;
} HandShake; } HandShake;
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
ret.fill(SHMEM_SYNC_VALUE);
return ret;
}
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
static Vector< HandShake > XConnections; static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections; static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) { void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init(); shmem_init();
XConnections.resize(shmem_n_pes()); XConnections.resize(shmem_n_pes());
@ -65,13 +70,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
RConnections[pe].seq_local = 0; RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0; RConnections[pe].seq_remote= 0;
} }
WorldSize = shmem_n_pes();
WorldRank = shmem_my_pe();
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
shmem_barrier_all(); shmem_barrier_all();
ShmInitGeneric(); ShmInitGeneric();
} }
@ -103,7 +101,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
static long long source ; static long long source ;
static long long dest ; static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1; // int nreduce=1;
// int pestart=0; // int pestart=0;
@ -119,7 +117,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
static long long source ; static long long source ;
static long long dest ; static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
// int nreduce=1; // int nreduce=1;
// int pestart=0; // int pestart=0;
@ -135,7 +133,7 @@ void CartesianCommunicator::GlobalSum(float &f){
static float source ; static float source ;
static float dest ; static float dest ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = f; source = f;
dest =0.0; dest =0.0;
@ -147,7 +145,7 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
static float source ; static float source ;
static float dest = 0 ; static float dest = 0 ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(f,_processor) ){ if ( shmem_addr_accessible(f,_processor) ){
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync); shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
@ -166,7 +164,7 @@ void CartesianCommunicator::GlobalSum(double &d)
static double source; static double source;
static double dest ; static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
source = d; source = d;
dest = 0; dest = 0;
@ -178,7 +176,8 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
static double source ; static double source ;
static double dest ; static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(d,_processor) ){ if ( shmem_addr_accessible(d,_processor) ){
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync); shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
@ -295,7 +294,7 @@ void CartesianCommunicator::Barrier(void)
} }
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{ {
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word; static uint32_t word;
uint32_t *array = (uint32_t *) data; uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0); assert( (bytes % 4)==0);
@ -318,7 +317,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
} }
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{ {
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
static uint32_t word; static uint32_t word;
uint32_t *array = (uint32_t *) data; uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0); assert( (bytes % 4)==0);

View File

@ -32,8 +32,7 @@ directory
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
int WilsonKernelsStatic::HandOpt; int WilsonKernelsStatic::Opt;
int WilsonKernelsStatic::AsmOpt;
template <class Impl> template <class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){}; WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};

View File

@ -40,9 +40,9 @@ namespace QCD {
//////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic { class WilsonKernelsStatic {
public: public:
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
static int AsmOpt; // these are a temporary hack static int Opt; // these are a temporary hack
static int HandOpt; // these are a temporary hack
}; };
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
@ -56,24 +56,40 @@ public:
template <bool EnableBool = true> template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out)
{
switch(Opt) {
#ifdef AVX512 #ifdef AVX512
if (AsmOpt) { case OptInlineAsm:
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
} else {
#else
{
#endif
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
sF++; sF++;
} }
sU++; sU++;
} }
break;
#endif
case OptHandUnroll:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
break;
case OptGeneric:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
break;
default:
assert(0);
} }
} }
@ -81,7 +97,7 @@ public:
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
// no kernel choice
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
@ -95,23 +111,39 @@ public:
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
switch(Opt) {
#ifdef AVX512 #ifdef AVX512
if (AsmOpt) { case OptInlineAsm:
WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
} else {
#else
{
#endif
for (int site = 0; site < Ns; site++) { for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++; sF++;
} }
sU++; sU++;
} }
break;
#endif
case OptHandUnroll:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
break;
case OptGeneric:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
break;
default:
assert(0);
} }
} }

View File

@ -32,7 +32,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
int LebesgueOrder::UseLebesgueOrder; int LebesgueOrder::UseLebesgueOrder;
std::vector<int> LebesgueOrder::Block({2,2,2,2}); std::vector<int> LebesgueOrder::Block({8,2,2,2});
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){ LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
n--; // 1000 0011 --> 1000 0010 n--; // 1000 0011 --> 1000 0010

View File

@ -1 +0,0 @@
# Cross-configure for the arm-linux-gnueabihf target with clang++-3.5:
# static link, tuned for cortex-a7, NEONv7 SIMD back end.
# The GMP/MPFR include and library paths are machine-specific (/home/neo/...).
./configure \
  --host=arm-linux-gnueabihf \
  CXX=clang++-3.5 \
  CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/arm-linux-gnueabihf/include/c++/4.8.2/arm-linux-gnueabihf/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a7' \
  --enable-simd=NEONv7

View File

@ -1,3 +0,0 @@
# Earlier variant, kept disabled: gnueabihf target tuned for cortex-a57.
#./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7

# Active configuration: cross-configure for aarch64-linux-gnu with clang++-3.5,
# static link, NEONv7 SIMD back end. GMP/MPFR paths are machine-specific.
./configure \
  --host=aarch64-linux-gnu \
  CXX=clang++-3.5 \
  CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' \
  --enable-simd=NEONv7

View File

@ -1,9 +0,0 @@
# Sweep the Wilson benchmark over thread counts and lattice volumes.
# For every thread count N a file wilson.tN is produced holding one
# "<volume> <third field of the benchmark's mflop line>" record per volume.
for threads in 1 2 4; do
  results=wilson.t$threads
  # Start the result file afresh (echo leaves a single empty first line).
  echo > $results
  for lattice in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16; do
    mflops=$(./benchmarks/Grid_wilson --grid $lattice --omp $threads | grep mflop | awk '{print $3}')
    echo $lattice $mflops >> $results
  done
done

View File

@ -1,46 +0,0 @@
#!/bin/bash -e
# Rebuild every build tree under builds/.
#
# usage: <script> [extra]
#   With no argument only $DIRS are rebuilt; with "extra" the trees listed
#   in $EXTRADIRS are rebuilt afterwards as well.
#
# The first failing cd or make aborts the whole run.

# The -e on the shebang line is lost when the script is started as
# "bash <script>", so request errexit explicitly too.
set -e

DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"

# ANSI colour escapes, interpreted by "echo -e".
BLACK="\033[30m"
RED="\033[31m"
GREEN="\033[32m"
YELLOW="\033[33m"
BLUE="\033[34m"
PINK="\033[35m"
CYAN="\033[36m"
WHITE="\033[37m"
NORMAL="\033[0;39m"

# build_tree <label colour> <name>
# Print a banner for builds/<name> (name shown in <label colour>) and run
# "make clean all -j 8" inside it.
build_tree() {
  local label_colour=$1
  local name=$2

  echo
  echo -e "$RED" ==============================
  echo -e "$label_colour" "$name"
  echo -e "$RED" ==============================
  echo -e "$BLUE"
  # Subshell keeps the caller's working directory untouched; a failure in
  # either command makes the subshell fail, which errexit turns into an abort.
  ( cd "builds/$name" && make clean all -j 8 )
  echo -e "$NORMAL"
}

for D in $DIRS
do
  build_tree "$GREEN" "$D"
done

if [ "X$1" == "Xextra" ]
then
  # Extra trees are labelled in red rather than green.
  for D in $EXTRADIRS
  do
    build_tree "$RED" "$D"
  done
fi

View File

@ -1,11 +0,0 @@
#!/bin/bash
# For every name in $DIRS: create builds/<name> and run
# scripts/configure-commands <name> from inside that directory.
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"
for D in $DIRS
do
  mkdir -p "builds/$D"
  # Work in a subshell and check the cd: previously a failed cd ran the
  # configure step in the wrong directory and the trailing "cd ../.." then
  # walked out of the source tree for all remaining iterations.
  (
    cd "builds/$D" || exit 1
    ../../scripts/configure-commands "$D"
  ) || echo "`basename $0`: could not configure builds/$D" 1>&2
done

View File

@ -1,89 +0,0 @@
#!/bin/bash
# Run ../../configure with the option set belonging to one named build target.
#
# usage: <script> <target>
#   <target> is one of the case labels below (e.g. clang-avx, icpc-avx2).
#   The script must be started from a directory two levels below the one
#   holding "configure", since every arm invokes ../../configure.
#
# An unknown or missing target is reported on stderr and yields exit status 1
# (it used to fall through the case statement silently).
WD=$1

# ANSI colour escapes, interpreted by "echo -e".
BLACK="\033[30m"
RED="\033[31m"
GREEN="\033[32m"
YELLOW="\033[33m"
BLUE="\033[34m"
PINK="\033[35m"
CYAN="\033[36m"
WHITE="\033[37m"
NORMAL="\033[0;39m"

echo
echo -e $RED ==============================
echo -e $GREEN $WD
echo -e $RED ==============================
echo -e $YELLOW

case "$WD" in
  g++-avx)
    CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  g++-avx-openmp)
    CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=none
    ;;
  g++5-sse4)
    CXX=g++-5 ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  g++5-avx)
    CXX=g++-5 ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  icpc-avx)
    CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  icpc-avx-openmp-mpi)
    CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
    ;;
  icpc-avx-openmp)
    CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
    ;;
  icpc-avx2)
    CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  icpc-avx512)
    CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  icpc-mic)
    CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  icpc-mic-avx512)
    # NOTE(review): this arm spells the flag -xCOMMON_AVX512 (underscore)
    # while icpc-avx512 above uses -xCOMMON-AVX512, and it selects the IMCI
    # SIMD back end - confirm both are intended.
    CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-sse)
    CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-avx)
    CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-avx2)
    CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-avx-openmp)
    CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-xc30)
    CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-xc30-openmp)
    CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-avx2-openmp)
    CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
    ;;
  clang-avx-openmp-mpi)
    CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
    ;;
  clang-avx2-openmp-mpi)
    CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
    ;;
  clang-avx-mpi)
    CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
    ;;
  clang-avx2-mpi)
    CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
    ;;
  # A second "clang-avx2)" arm used to sit here. case runs only the first
  # matching arm, so it could never execute and has been dropped. It differed
  # from the live arm only by adding LDFLAGS="-L/usr/local/lib/".
  *)
    echo "`basename $0`: unknown build target '$WD'" 1>&2
    echo -e $NORMAL
    exit 1
    ;;
esac
echo -e $NORMAL

View File

@ -1,10 +0,0 @@
#!/bin/bash
# For every name in $DIRS: create builds/<name> and run
# scripts/configure-commands <name> from inside that directory.
DIRS="g++-avx-openmp g++-avx clang-xc30 clang-xc30-openmp"
for D in $DIRS
do
  mkdir -p "builds/$D"
  # Work in a subshell and check the cd: previously a failed cd ran the
  # configure step in the wrong directory and the trailing "cd ../.." then
  # walked out of the source tree for all remaining iterations.
  (
    cd "builds/$D" || exit 1
    ../../scripts/configure-commands "$D"
  ) || echo "`basename $0`: could not configure builds/$D" 1>&2
done

View File

@ -1,10 +0,0 @@
#!/bin/bash
# For every directory in $DIRS: create it and run ../configure-commands from
# inside it.
DIRS="build-icpc-mic"
for D in $DIRS
do
  mkdir -p "$D"
  # Subshell plus a checked cd, so a failed cd can neither run the configure
  # step in the wrong directory nor leave the loop one level too high.
  (
    cd "$D" || exit 1
    # The other driver scripts hand configure-commands the target name;
    # this one passed nothing. Derive the name by stripping the "build-"
    # prefix from the directory.
    # NOTE(review): assumes the intended target is "icpc-mic" - confirm.
    ../configure-commands "${D#build-}"
  ) || echo "`basename $0`: could not configure $D" 1>&2
done

View File

@ -12,6 +12,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: $1 Source file: $1
Copyright (C) 2015 Copyright (C) 2015
Copyright (C) 2016
EOF EOF
@ -38,8 +39,21 @@ See the full license in the file "LICENSE" in the top level distribution directo
/* END LEGAL */ /* END LEGAL */
EOF EOF
cat message > tmp.fil cat message > tmp.fil
cat $1 >> tmp.fil
NOTICE=`grep -n "END LEGAL" $1 | awk '{ print $1 }' `
if [ "X$NOTICE" != "X" ]
then
echo "found notice ending on line $NOTICE"
awk 'BEGIN { P=0 } { if ( P ) print } /END LEGAL/{P=1} ' $1 >> tmp.fil
else
cat $1 >> tmp.fil
fi
cp tmp.fil $1 cp tmp.fil $1
shift shift

View File

@ -1,2 +0,0 @@
# Switch the programming environment module from PrgEnv-cray to PrgEnv-intel,
# then replace the intel module version 14.0.4.211 with 15.0.2.164.
# NOTE(review): presumably meant to be sourced so the module changes persist
# in the calling shell - confirm.
module swap PrgEnv-cray PrgEnv-intel
module swap intel/14.0.4.211 intel/15.0.2.164

View File

@ -1,4 +0,0 @@
# Regenerate the autotools files: aclocal (with the local m4/ macros),
# autoheader, automake (installing any missing helper files), then autoconf.
# The steps are chained with && so a failure stops the sequence instead of
# letting later tools run on stale or missing inputs; the exit status is
# that of the first step that failed.
aclocal -I m4 &&
autoheader -f &&
automake -f --add-missing &&
autoconf -f

View File

@ -1,18 +0,0 @@
#!/usr/bin/env bash
# Replace lib/fftw/fftw3.h with the copy found in a source archive.
#
# usage: <script> <archive>
#   <archive> is a tar archive whose first entry names its top-level
#   directory; the header is taken from <top-level>/api/fftw3.h.
#
# The archive is unpacked into the current directory and the unpacked tree is
# removed again afterwards. The existing lib/fftw is only replaced once the
# header has actually been found, so a bad archive leaves it untouched.

# Abort on the first failing command and on use of an unset variable.
set -eu

if (( $# != 1 )); then
    echo "usage: `basename $0` <archive>" 1>&2
    exit 1
fi
ARC=$1

# Top-level directory = first listed entry with everything from the first
# '/' onwards removed.
ARCDIR=$(tar -tf "${ARC}" | head -n1 | sed -e 's@/.*@@')
if [ -z "${ARCDIR}" ] || [ "${ARCDIR}" = "." ]; then
    # Empty: unreadable archive or absolute member names. ".": no enclosing
    # directory. Either way there is no safe directory to unpack and delete.
    echo "`basename $0`: cannot determine top-level directory of ${ARC}" 1>&2
    exit 1
fi

tar -xf "${ARC}"

HEADER="${ARCDIR}/api/fftw3.h"
if [ ! -f "${HEADER}" ]; then
    echo "`basename $0`: ${HEADER} not found in ${ARC}" 1>&2
    rm -rf "${ARCDIR}"
    exit 1
fi

rm -rf lib/fftw
mkdir lib/fftw
cp "${HEADER}" lib/fftw/
rm -rf "${ARCDIR}"

View File

@ -1,7 +0,0 @@
# Draw column 2 of wilson.t1, wilson.t2 and wilson.t4 as lines, one curve per
# file, each added to the same plot via replot and titled by its OMP setting.
plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
# Switch to the pdf terminal and redraw the accumulated plot into
# wilson_clang.pdf, then leave gnuplot.
set terminal 'pdf'
set output 'wilson_clang.pdf'
replot
quit

View File

@ -102,16 +102,14 @@ int main (int argc, char ** argv)
PokeIndex<LorentzIndex>(mom,mommu,mu); PokeIndex<LorentzIndex>(mom,mommu,mu);
// fourth order exponential approx // fourth order exponential approx
parallel_for(auto i=mom.begin();i<mom.end();i++){ parallel_for(auto i=mom.begin();i<mom.end();i++) {
Uprime[i](mu) = Uprime[i](mu) = U[i](mu);
U[i](mu) Uprime[i](mu) += mom[i](mu)*U[i](mu)*dt ;
+ mom[i](mu)*U[i](mu)*dt Uprime[i](mu) += mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt/2.0);
+ mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt/2.0) Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt/6.0);
+ mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt/6.0) Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt/24.0);
+ mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt/24.0) Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt/120.0);
+ mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt/120.0) Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt*dt/720.0);
+ mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt*dt/720.0)
;
} }
} }