diff --git a/.gitignore b/.gitignore index 80ea8e86..e82ecf9c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ *.o *.obj - # Editor files # ################ *~ @@ -48,6 +47,7 @@ Config.h.in config.log config.status .deps +*.inc # http://www.gnu.org/software/autoconf # ######################################## @@ -63,19 +63,7 @@ config.sub config.guess INSTALL .dirstamp - -# Packages # -############ -# it's better to unpack these files and commit the raw source -# git has its own built in compression methods -*.7z -*.dmg -*.gz -*.iso -*.jar -*.rar -*.tar -*.zip +ltmain.sh # Logs and databases # ###################### @@ -101,3 +89,16 @@ build*/* ##################### *.xcodeproj/* build.sh + +# Eigen source # +################ +lib/Eigen/* + +# FFTW source # +################ +lib/fftw/* + +# libtool macros # +################## +m4/lt* +m4/libtool.m4 \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index a2154ead..ae3efda8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,10 +9,6 @@ matrix: - os: osx osx_image: xcode7.2 compiler: clang - - os: osx - osx_image: xcode7.2 - compiler: gcc - env: VERSION=-5 - compiler: gcc addons: apt: @@ -23,6 +19,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: VERSION=-4.9 - compiler: gcc @@ -35,6 +33,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: VERSION=-5 - compiler: clang @@ -47,6 +47,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz - compiler: clang @@ -59,6 +61,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz @@ -69,6 +73,7 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]] 
&& [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi install: @@ -82,13 +87,20 @@ install: - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi script: - - ./scripts/reconfigure_script + - ./bootstrap.sh - mkdir build - cd build - - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none + - make -j4 + - ./benchmarks/Benchmark_dwf --threads 1 + - echo make clean + - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 - - make clean - - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none + - echo make clean + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi + diff --git a/Makefile.am b/Makefile.am index 3b1d5690..18b3ddc3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,5 +1,10 @@ # additional include paths necessary to compile the C++ library -AM_CXXFLAGS = -I$(top_srcdir)/ -SUBDIRS = lib tests benchmarks +SUBDIRS = lib benchmarks tests 
-filelist: $(SUBDIRS) \ No newline at end of file +.PHONY: tests + +tests: all + $(MAKE) -C tests tests + +AM_CXXFLAGS += -I$(top_builddir)/include +ACLOCAL_AMFLAGS = -I m4 diff --git a/README b/README deleted file mode 100644 index 17e92fa0..00000000 --- a/README +++ /dev/null @@ -1,44 +0,0 @@ -This library provides data parallel C++ container classes with internal memory layout -that is transformed to map efficiently to SIMD architectures. CSHIFT facilities -are provided, similar to HPF and cmfortran, and user control is given over the mapping of -array indices to both MPI tasks and SIMD processing elements. - -* Identically shaped arrays then be processed with perfect data parallelisation. -* Such identically shapped arrays are called conformable arrays. - -The transformation is based on the observation that Cartesian array processing involves -identical processing to be performed on different regions of the Cartesian array. - -The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes. - -Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but -optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification -for most programmers. - -The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported. - -These are presented as - - vRealF, vRealD, vComplexF, vComplexD - -internal vector data types. These may be useful in themselves for other programmers. -The corresponding scalar types are named - - RealF, RealD, ComplexF, ComplexD - -MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library. - - You can give `configure' initial values for configuration parameters -by setting variables in the command line or in the environment. 
Here -is are examples: - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4 - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1 - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2 - - ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none - - diff --git a/README b/README new file mode 120000 index 00000000..42061c01 --- /dev/null +++ b/README @@ -0,0 +1 @@ +README.md \ No newline at end of file diff --git a/README.md b/README.md index 0a17bd45..f4a376f1 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,51 @@ -# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid) -Data parallel C++ mathematical object library +# Grid +
Last stable release | ++ + | +
Development branch | ++ + | +
+- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
+- `--enable-precision={single|double}`: set the default precision (default: `double`).
+- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
+- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48`).
+- `--disable-timers`: disable system dependent high-resolution timers.
+- `--enable-chroma`: enable Chroma regression tests.
+
+### Possible communication interfaces
+
+The following options can be used with the `--enable-comms=<comm>` option to target different communication interfaces:
+
+| `<comm>` | Description |
+| -------------- | ------------------------------------------------------------- |
+| `none` | no communications |
+| `mpi[-auto]` | MPI communications |
+| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
+| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
+| `shmem ` | Cray SHMEM communications |
+
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` script to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified, `configure` will scan through a list of default names).
+
+### Possible SIMD types
+
+The following options can be used with the `--enable-simd=<code>` option to target different SIMD instruction sets:
+
+| `<code>` | Description |
+| ----------- | -------------------------------------- |
+| `GEN` | generic portable vector code |
+| `SSE4` | SSE 4.2 (128 bit) |
+| `AVX` | AVX (256 bit) |
+| `AVXFMA` | AVX (256 bit) + FMA |
+| `AVXFMA4` | AVX (256 bit) + FMA4 |
+| `AVX2` | AVX 2 (256 bit) |
+| `AVX512` | AVX 512 bit |
+| `QPX` | QPX (256 bit) |
+
+Alternatively, some CPU codenames can be directly used:
+
+| `<code>` | Description |
+| ----------- | -------------------------------------- |
+| `KNC` | [Intel Xeon Phi codename Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
+| `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
+| `BGQ` | Blue Gene/Q |
+
+#### Notes:
+- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid once AVX512 support within GCC and clang is more mature.
+- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
+- BG/Q performance is currently rather poor. This is being investigated for future versions.
+
+### Build setup for Intel Knights Landing platform
+
+The following configuration is recommended for the Intel Knights Landing platform:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=KNL \
+ --enable-comms=mpi-auto \
+ --with-gmp=<path> \
+ --with-mpfr=<path> \
+ --enable-mkl \
+ CXX=icpc MPICXX=mpiicpc
+```
+
+where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=KNL \
+ --enable-comms=mpi \
+ --with-gmp=<path> \
+ --with-mpfr=<path> \
+ --enable-mkl \
+ CXX=CC CC=cc
+```
\ No newline at end of file
diff --git a/VERSION b/VERSION
index c12f9497..e7abbba7 100644
--- a/VERSION
+++ b/VERSION
@@ -1,4 +1,6 @@
-Version : 0.5.0
+Version : 0.6.0
- AVX512, AVX2, AVX, SSE good
- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
+- MPI and MPI3
+- HiRep, Smearing, Generic gauge group
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 21b0dd0f..969a2a42 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -25,7 +25,7 @@ Author: Peter Boyle
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include
+#include
using namespace std;
using namespace Grid;
@@ -42,15 +42,14 @@ int main (int argc, char ** argv)
int Nloop=10;
int nmu=0;
- for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
+ for(int mu=0;mu1) nmu++;
+
std::cout< latt_size ({lat*mpi_layout[0],
@@ -125,7 +124,7 @@ int main (int argc, char ** argv)
std::cout< latt_size ({lat,lat,lat,lat});
@@ -195,6 +194,168 @@ int main (int argc, char ** argv)
}
+ Nloop=100;
+ std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+ std::vector xbuf(8);
+ std::vector rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
+
+ int ncomm;
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+ double start=usecond();
+ for(int i=0;i requests;
+
+ ncomm=0;
+ for(int mu=0;mu<4;mu++){
+
+ if (mpi_layout[mu]>1 ) {
+
+ ncomm++;
+ int comm_proc=1;
+ int xmit_to_rank;
+ int recv_from_rank;
+
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu][0],
+ recv_from_rank,
+ bytes);
+
+ comm_proc = mpi_layout[mu]-1;
+
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu+4][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu+4][0],
+ recv_from_rank,
+ bytes);
+
+ }
+ }
+ Grid.StencilSendToRecvFromComplete(requests);
+ Grid.Barrier();
+
+ }
+ double stop=usecond();
+
+ double dbytes = bytes;
+ double xbytes = Nloop*dbytes*2.0*ncomm;
+ double rbytes = xbytes;
+ double bidibytes = xbytes+rbytes;
+
+ double time = stop-start; // microseconds
+
+ std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+ std::vector xbuf(8);
+ std::vector rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
+
+ int ncomm;
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+
+ double start=usecond();
+ for(int i=0;i requests;
+
+ ncomm=0;
+ for(int mu=0;mu<4;mu++){
+
+ if (mpi_layout[mu]>1 ) {
+
+ ncomm++;
+ int comm_proc=1;
+ int xmit_to_rank;
+ int recv_from_rank;
+
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu][0],
+ recv_from_rank,
+ bytes);
+ // Grid.StencilSendToRecvFromComplete(requests);
+ // requests.resize(0);
+
+ comm_proc = mpi_layout[mu]-1;
+
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu+4][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu+4][0],
+ recv_from_rank,
+ bytes);
+ Grid.StencilSendToRecvFromComplete(requests);
+ requests.resize(0);
+
+ }
+ }
+ Grid.Barrier();
+
+ }
+ double stop=usecond();
+
+ double dbytes = bytes;
+ double xbytes = Nloop*dbytes*2.0*ncomm;
+ double rbytes = xbytes;
+ double bidibytes = xbytes+rbytes;
+
+ double time = stop-start; // microseconds
+
+ std::cout<
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include
-#include
+#include
using namespace std;
using namespace Grid;
@@ -45,25 +44,20 @@ struct scal {
Gamma::GammaT
};
-bool overlapComms = false;
-typedef WilsonFermion5D WilsonFermion5DR;
-typedef WilsonFermion5D WilsonFermion5DF;
-typedef WilsonFermion5D WilsonFermion5DD;
+typedef WilsonFermion5D WilsonFermion5DR;
+typedef WilsonFermion5D WilsonFermion5DF;
+typedef WilsonFermion5D WilsonFermion5DD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
- if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
- overlapComms = true;
- }
-
int threads = GridThread::GetThreads();
std::cout< latt4 = GridDefaultLatt();
- const int Ls=16;
+ const int Ls=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -71,8 +65,8 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4});
@@ -87,8 +81,6 @@ int main (int argc, char ** argv)
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
- ColourMatrix cm = Complex(1.0,0.0);
-
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
@@ -127,21 +119,27 @@ int main (int argc, char ** argv)
RealD mass=0.1;
RealD M5 =1.8;
- typename DomainWallFermionR::ImplParams params;
- params.overlapCommsCompute = overlapComms;
-
RealD NP = UGrid->_Nprocessors;
- for(int doasm=1;doasm<2;doasm++){
+ DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
- QCD::WilsonKernelsStatic::AsmOpt=doasm;
+ std::cout << GridLogMessage<< "*****************************************************************" <Barrier();
+ Dw.ZeroCounters();
double t0=usecond();
for(int i=0;iBarrier();
double volume=Ls; for(int mu=0;mu WilsonFermion5DR;
+
+ std::cout << GridLogMessage<< "*********************************************************" <::Dhop "< WilsonFermion5DR;
LatticeFermion ssrc(sFGrid);
LatticeFermion sref(sFGrid);
LatticeFermion sresult(sFGrid);
- WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
+ WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
for(int x=0;xBarrier();
double t0=usecond();
+ sDw.ZeroCounters();
for(int i=0;iBarrier();
double volume=Ls; for(int mu=0;mu 1.0e-6 ) {
+ std::cout << "site "<::DhopEO "<Barrier();
+ sDw.ZeroCounters();
+ sDw.stat.init("DhopEO");
double t0=usecond();
- for(int i=0;iBarrier();
+ sDw.stat.print();
double volume=Ls; for(int mu=0;mu1.0e-4) {
+ setCheckerboard(ssrc,ssrc_o);
+ setCheckerboard(ssrc,ssrc_e);
+ std::cout<< ssrc << std::endl;
+ }
}
@@ -284,24 +324,25 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
for(int i=0;iBarrier();
double t0=usecond();
for(int i=0;iBarrier();
double volume=Ls; for(int mu=0;mu
-Author: paboyle
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
-#include
-#include
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-template
-struct scal {
- d internal;
-};
-
- Gamma::GammaMatrix Gmu [] = {
- Gamma::GammaX,
- Gamma::GammaY,
- Gamma::GammaZ,
- Gamma::GammaT
- };
-
-bool overlapComms = false;
-
-
-int main (int argc, char ** argv)
-{
- Grid_init(&argc,&argv);
-
- if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
- overlapComms = true;
- }
-
- int threads = GridThread::GetThreads();
- std::cout< latt4 = GridDefaultLatt();
- const int Ls=16;
- GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
- GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
- GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
- GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
- std::vector seeds4({1,2,3,4});
- std::vector seeds5({5,6,7,8});
-
- GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
- GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
-
- LatticeFermion src (FGrid); random(RNG5,src);
- LatticeFermion result(FGrid); result=zero;
- LatticeFermion ref(FGrid); ref=zero;
- LatticeFermion tmp(FGrid);
- LatticeFermion err(FGrid);
-
- ColourMatrix cm = Complex(1.0,0.0);
-
- LatticeGaugeField Umu(UGrid);
- random(RNG4,Umu);
-
- LatticeGaugeField Umu5d(FGrid);
-
- // replicate across fifth dimension
- for(int ss=0;ssoSites();ss++){
- for(int s=0;s U(4,FGrid);
- for(int mu=0;mu(Umu5d,mu);
- }
-
- if (1)
- {
- ref = zero;
- for(int mu=0;mu_Nprocessors;
-
-
- QCD::WilsonKernelsStatic::AsmOpt=1;
-
- DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
-
- std::cout<
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include
-#include
+#include
using namespace std;
using namespace Grid;
@@ -52,22 +51,26 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
- const int Ls=16;
+ std::cout << GridLogMessage<< "*****************************************************************" < & latt4, int Ls, int threads,int report )
ColourMatrix cm = Complex(1.0,0.0);
-
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
@@ -145,11 +147,10 @@ void benchDw(std::vector & latt4, int Ls, int threads,int report )
}
#ifdef CHECK
- if (1)
- {
+ if (1) {
+
ref = zero;
for(int mu=0;mu & latt4, int Ls, int threads,int report )
Counter.Report();
}
- if ( ! report )
- {
- double volume=Ls; for(int mu=0;mu 1.0e-4 ) {
- std::cout< 1.0e-4 ) {
+ std::cout< & latt4, int Ls, int threads,int report )
std::cout<< flops/(t1-t0);
}
}
-
}
-#undef CHECK_SDW
+#define CHECK_SDW
void benchsDw(std::vector & latt4, int Ls, int threads, int report )
{
@@ -243,7 +242,9 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report )
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+ GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
@@ -277,93 +278,89 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report )
}
}
-
RealD mass=0.1;
RealD M5 =1.8;
- typedef WilsonFermion5D WilsonFermion5DR;
- LatticeFermion ssrc(sFGrid);
- LatticeFermion sref(sFGrid);
- LatticeFermion sresult(sFGrid);
- WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
+ typedef WilsonFermion5D WilsonFermion5DR;
+ LatticeFermion ssrc(sFGrid);
+ LatticeFermion sref(sFGrid);
+ LatticeFermion sresult(sFGrid);
+ WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
- for(int x=0;x site({s,x,y,z,t});
- SpinColourVector tmp;
- peekSite(tmp,src,site);
- pokeSite(tmp,ssrc,site);
- }}}}}
+ for(int x=0;x site({s,x,y,z,t});
+ SpinColourVector tmp;
+ peekSite(tmp,src,site);
+ pokeSite(tmp,ssrc,site);
+ }}}}}
- double t0=usecond();
- sDw.Dhop(ssrc,sresult,0);
- double t1=usecond();
+ double t0=usecond();
+ sDw.Dhop(ssrc,sresult,0);
+ double t1=usecond();
#ifdef TIMERS_OFF
- int ncall =10;
+ int ncall =10;
#else
- int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
+ int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
- PerformanceCounter Counter(8);
- Counter.Start();
- t0=usecond();
- for(int i=0;i
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include
+#include
using namespace std;
using namespace Grid;
diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 1fc5cbc4..435af7f4 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -26,7 +26,7 @@ Author: paboyle
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include
+#include
using namespace std;
using namespace Grid;
diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index a5f71299..b6d1d303 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -26,7 +26,7 @@ Author: Peter Boyle
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include
+#include
using namespace std;
using namespace Grid;
diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc
index 10aa7009..4930713c 100644
--- a/benchmarks/Benchmark_wilson.cc
+++ b/benchmarks/Benchmark_wilson.cc
@@ -26,7 +26,7 @@ Author: paboyle
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include
+#include
using namespace std;
using namespace Grid;
diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
new file mode 100644
index 00000000..96e5b5e4
--- /dev/null
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -0,0 +1,130 @@
+/*************************************************************************************
+ Grid physics library, www.github.com/paboyle/Grid
+ Source file: ./benchmarks/Benchmark_wilson_sweep.cc
+ Copyright (C) 2015
+Author: Peter Boyle
+Author: paboyle
+Author: Richard Rollins
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#include
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template
+struct scal {
+ d internal;
+};
+
+Gamma::GammaMatrix Gmu [] = {
+ Gamma::GammaX,
+ Gamma::GammaY,
+ Gamma::GammaZ,
+ Gamma::GammaT
+};
+
+bool overlapComms = false;
+
+void bench_wilson (
+ LatticeFermion & src,
+ LatticeFermion & result,
+ WilsonFermionR & Dw,
+ double const volume,
+ int const dag );
+
+int main (int argc, char ** argv)
+{
+ Grid_init(&argc,&argv);
+ if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }
+ typename WilsonFermionR::ImplParams params;
+ params.overlapCommsCompute = overlapComms;
+
+ std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+ std::vector mpi_layout = GridDefaultMpi();
+ std::vector seeds({1,2,3,4});
+ RealD mass = 0.1;
+
+ std::cout << GridLogMessage<< "*****************************************************************" < latt_size = std::vector(4,L);
+ for(int d=4; d>dmin; d--)
+ {
+ if ( d<=3 ) { latt_size[d] *= 2; }
+
+ std::cout << GridLogMessage;
+ std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator( std::cout, std::string("x").c_str() ) );
+ std::cout << latt_size.back() << "\t\t";
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
+
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
+ LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+ LatticeFermion src(&Grid); random(pRNG,src);
+ LatticeFermion result(&Grid); result=zero;
+
+ double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies());
+
+ WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
+
+ bench_wilson(src,result,Dw,volume,DaggerNo);
+ bench_wilson(src,result,Dw,volume,DaggerYes);
+ std::cout << std::endl;
+ }
+ }
+
+ std::cout<
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
-#include
-#include
-
-
-using namespace Grid;
-using namespace Grid::QCD;
-
-
-int bench(std::ofstream &os, std::vector &latt4,int Ls);
-
-int main(int argc,char **argv)
-{
- Grid_init(&argc,&argv);
- std::ofstream os("zmm.dat");
-
- os << "#V Ls Lxy Lzt C++ Asm OMP L1 " < grid({L,L,m*L,m*L});
- for(int i=0;i<4;i++) {
- std::cout << grid[i]<<"x";
- }
- std::cout << Ls< &latt4,int Ls)
-{
-
- GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
- GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
- GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
- GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
- std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
- std::vector mpi_layout = GridDefaultMpi();
- int threads = GridThread::GetThreads();
-
- std::vector seeds4({1,2,3,4});
- std::vector seeds5({5,6,7,8});
-
- GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
-
- LatticeFermion src (FGrid);
- LatticeFermion tmp (FGrid);
- LatticeFermion srce(FrbGrid);
-
- LatticeFermion resulto(FrbGrid); resulto=zero;
- LatticeFermion resulta(FrbGrid); resulta=zero;
- LatticeFermion junk(FrbGrid); junk=zero;
- LatticeFermion diff(FrbGrid);
- LatticeGaugeField Umu(UGrid);
-
- double mfc, mfa, mfo, mfl1;
-
- GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
- GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
- random(RNG5,src);
-#if 1
- random(RNG4,Umu);
-#else
- int mmu=2;
- std::vector U(4,UGrid);
- for(int mu=0;mu(Umu,mu);
- if ( mu!=mmu ) U[mu] = zero;
- if ( mu==mmu ) U[mu] = 1.0;
- PokeIndex(Umu,U[mu],mu);
- }
-#endif
- pickCheckerboard(Even,srce,src);
-
- RealD mass=0.1;
- RealD M5 =1.8;
- DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
- std::cout<
-
AC_PREREQ([2.63])
-AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
-AC_CANONICAL_SYSTEM
+AC_INIT([Grid], [0.6.0], [https://github.com/paboyle/Grid], [Grid])
+AC_CANONICAL_BUILD
+AC_CANONICAL_HOST
+AC_CANONICAL_TARGET
AM_INIT_AUTOMAKE(subdir-objects)
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_SRCDIR([lib/Grid.h])
AC_CONFIG_HEADERS([lib/Config.h])
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-AC_MSG_NOTICE([
-
-:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-Configuring $PACKAGE v$VERSION for $host
-:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-])
-
-# Checks for programs.
-AC_LANG(C++)
+############### Checks for programs
+CXXFLAGS="-O3 $CXXFLAGS"
AC_PROG_CXX
-AC_OPENMP
AC_PROG_RANLIB
-#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
-AX_EXT
-# Checks for libraries.
-#AX_GCC_VAR_ATTRIBUTE(aligned)
+############### Get compiler informations
+AC_LANG([C++])
+AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
+AX_COMPILER_VENDOR
+AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
+ [vendor of C++ compiler that will compile the code])
+AX_GXX_VERSION
+AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
+ [version of g++ that will compile the code])
-# Checks for header files.
+############### Checks for typedefs, structures, and compiler characteristics
+AC_TYPE_SIZE_T
+AC_TYPE_UINT32_T
+AC_TYPE_UINT64_T
+
+############### OpenMP
+AC_OPENMP
+ac_openmp=no
+if test "${OPENMP_CXXFLAGS}X" != "X"; then
+ ac_openmp=yes
+ AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
+ AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
+fi
+
+############### Checks for header files
AC_CHECK_HEADERS(stdint.h)
AC_CHECK_HEADERS(mm_malloc.h)
AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
-AC_CHECK_HEADERS(gmp.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include ]])
AC_CHECK_DECLS([be64toh],[], [], [[#include ]])
-# Checks for typedefs, structures, and compiler characteristics.
-AC_TYPE_SIZE_T
-AC_TYPE_UINT32_T
-AC_TYPE_UINT64_T
+############### GMP and MPFR
+AC_ARG_WITH([gmp],
+ [AS_HELP_STRING([--with-gmp=prefix],
+ [try this for a non-standard install prefix of the GMP library])],
+ [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
+ [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
+AC_ARG_WITH([mpfr],
+ [AS_HELP_STRING([--with-mpfr=prefix],
+ [try this for a non-standard install prefix of the MPFR library])],
+ [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
+ [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
-# Checks for library functions.
-echo
-echo Checking libraries
-echo :::::::::::::::::::::::::::::::::::::::::::
+############### FFTW3
+AC_ARG_WITH([fftw],
+ [AS_HELP_STRING([--with-fftw=prefix],
+ [try this for a non-standard install prefix of the FFTW3 library])],
+ [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
+ [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
+
+############### lapack
+AC_ARG_ENABLE([lapack],
+ [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
+ [ac_LAPACK=${enable_lapack}], [ac_LAPACK=no])
+
+case ${ac_LAPACK} in
+ no)
+ ;;
+ yes)
+ AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
+ *)
+ AM_CXXFLAGS="-I$ac_LAPACK/include $AM_CXXFLAGS"
+ AM_LDFLAGS="-L$ac_LAPACK/lib $AM_LDFLAGS"
+ AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
+esac
+
+############### MKL
+AC_ARG_ENABLE([mkl],
+ [AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
+ [ac_MKL=${enable_mkl}], [ac_MKL=no])
+
+case ${ac_MKL} in
+ no)
+ ;;
+ yes)
+ AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);;
+ *)
+ AM_CXXFLAGS="-I$ac_MKL/include $AM_CXXFLAGS"
+ AM_LDFLAGS="-L$ac_MKL/lib $AM_LDFLAGS"
+ AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);;
+esac
+
+############### first-touch NUMA option (AC_ARG_ENABLE([numa]) stores the user's value in lowercase $enable_numa)
+AC_ARG_ENABLE([numa],
+ [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
+ [ac_NUMA=${enable_numa}],[ac_NUMA=no])
+
+case ${ac_NUMA} in
+ no)
+ ;;
+ yes)
+ AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
+ *)
+ AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
+esac
+
+############### Checks for library functions
+CXXFLAGS_CPY=$CXXFLAGS
+LDFLAGS_CPY=$LDFLAGS
+CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
AC_CHECK_FUNCS([gettimeofday])
-#AC_CHECK_LIB([gmp],[__gmpf_init],,
-# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.gmplib.org)])
+if test "${ac_MKL}x" != "nox"; then
+ AC_SEARCH_LIBS([mkl_set_interface_layer], [mkl_rt], [],
+ [AC_MSG_ERROR("MKL enabled but library not found")])
+fi
-#AC_CHECK_LIB([mpfr],[mpfr_init],,
-# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.mpfr.org/)])
+AC_SEARCH_LIBS([__gmpf_init], [gmp],
+ [AC_SEARCH_LIBS([mpfr_init], [mpfr],
+ [AC_DEFINE([HAVE_LIBMPFR], [1],
+ [Define to 1 if you have the `MPFR' library])]
+ [have_mpfr=true], [AC_MSG_ERROR([MPFR library not found])])]
+ [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library])]
+ [have_gmp=true])
-#
-# SIMD instructions selection
-#
+if test "${ac_LAPACK}x" != "nox"; then
+ AC_SEARCH_LIBS([LAPACKE_sbdsdc], [lapack], [],
+ [AC_MSG_ERROR("LAPACK enabled but library not found")])
+fi
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
- [Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
- [ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
+AC_SEARCH_LIBS([fftw_execute], [fftw3],
+ [AC_SEARCH_LIBS([fftwf_execute], [fftw3f], [],
+ [AC_MSG_ERROR("single precision FFTW library not found")])]
+ [AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])]
+ [have_fftw=true])
-supported=no
+CXXFLAGS=$CXXFLAGS_CPY
+LDFLAGS=$LDFLAGS_CPY
-ac_ZMM=no;
+############### SIMD instruction selection
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=],
+ [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
+
+case ${ax_cv_cxx_compiler_vendor} in
+ clang|gnu)
+ case ${ac_SIMD} in
+ SSE4)
+ AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
+ SIMD_FLAGS='-msse4.2';;
+ AVX)
+ AC_DEFINE([AVX1],[1],[AVX intrinsics])
+ SIMD_FLAGS='-mavx';;
+ AVXFMA4)
+ AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
+ SIMD_FLAGS='-mavx -mfma4';;
+ AVXFMA)
+ AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
+ SIMD_FLAGS='-mavx -mfma';;
+ AVX2)
+ AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
+ SIMD_FLAGS='-mavx2 -mfma';;
+ AVX512)
+ AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+ SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+ KNC)
+ AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
+ SIMD_FLAGS='';;
+ KNL)
+ AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+ SIMD_FLAGS='-march=knl';;
+ GEN)
+ AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+ SIMD_FLAGS='';;
+ QPX|BGQ)
+ AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
+ SIMD_FLAGS='';;
+ *)
+ AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
+ esac;;
+ intel)
+ case ${ac_SIMD} in
+ SSE4)
+ AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
+ SIMD_FLAGS='-msse4.2 -xsse4.2';;
+ AVX)
+ AC_DEFINE([AVX1],[1],[AVX intrinsics])
+ SIMD_FLAGS='-mavx -xavx';;
+ AVXFMA)
+ AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
+ SIMD_FLAGS='-mavx -mfma';;
+ AVX2)
+ AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
+ SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
+ AVX512)
+ AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+ SIMD_FLAGS='-xcore-avx512';;
+ KNC)
+ AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+ SIMD_FLAGS='';;
+ KNL)
+ AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
+ SIMD_FLAGS='-xmic-avx512';;
+ GEN)
+ AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+ SIMD_FLAGS='';;
+ *)
+ AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
+ esac;;
+ *)
+ AC_MSG_WARN([Compiler unknown, using generic vector code])
+ AC_DEFINE([GENERIC_VEC],[1],[generic vector code]);;
+esac
+AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
+AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"
case ${ac_SIMD} in
- SSE4)
- echo Configuring for SSE4
- AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] )
- if test x"$ax_cv_support_ssse3_ext" = x"yes"; then dnl minimal support for SSE4
- supported=yes
- else
- AC_MSG_WARN([Your processor does not support SSE4 instructions])
- fi
- ;;
- AVX)
- echo Configuring for AVX
- AC_DEFINE([AVX1],[1],[AVX Intrinsics] )
- if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX
- supported=yes
- else
- AC_MSG_WARN([Your processor does not support AVX instructions])
- fi
- ;;
- AVXFMA4)
- echo Configuring for AVX
- AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
- if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX
- supported=yes
- else
- AC_MSG_WARN([Your processor does not support AVX instructions])
- fi
- ;;
- AVX2)
- echo Configuring for AVX2
- AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
- if test x"$ax_cv_support_avx2_ext" = x"yes"; then dnl minimal support for AVX2
- supported=yes
- else
- AC_MSG_WARN([Your processor does not support AVX2 instructions])
- fi
- ;;
- AVX512)
- echo Configuring for AVX512
- AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
- supported="cross compilation"
- ac_ZMM=yes;
- ;;
- IMCI)
- echo Configuring for IMCI
- AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
- supported="cross compilation"
- ac_ZMM=no;
- ;;
- NEONv8)
- echo Configuring for experimental ARMv8a support
- AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
- supported="cross compilation"
- ;;
- DEBUG)
- echo Configuring without SIMD support - only for compiler DEBUGGING!
- AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] )
- ;;
- *)
- AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]);
- ;;
+ AVX512|KNL)
+ AC_DEFINE([TEST_ZMM],[1],[compile ZMM test]);;
+ *)
+ ;;
esac
-case ${ac_ZMM} in
-yes)
- echo Enabling ZMM source code
-;;
-no)
- echo Disabling ZMM source code
-;;
-esac
+############### Precision selection
+AC_ARG_ENABLE([precision],
+ [AC_HELP_STRING([--enable-precision=single|double],
+ [Select default word size of Real])],
+ [ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
-AM_CONDITIONAL(BUILD_ZMM,[ test "X${ac_ZMM}X" == "XyesX" ])
-
-AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
case ${ac_PRECISION} in
single)
- echo default precision is single
AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
;;
double)
- echo default precision is double
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
;;
esac
-#
-# Comms selection
-#
-
-AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
+############### communication type selection
+AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
+ [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
case ${ac_COMMS} in
none)
- echo Configuring for NO communications
- AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
+ AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
+ comms_type='none'
;;
- mpi)
- echo Configuring for MPI communications
- AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
+ mpi3l*)
+ AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
+ comms_type='mpi3l'
+ ;;
+ mpi3*)
+ AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
+ comms_type='mpi3'
+ ;;
+ mpi*)
+ AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
+ comms_type='mpi'
;;
shmem)
- echo Configuring for SHMEM communications
- AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
+ AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
+ comms_type='shmem'
;;
*)
- AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
+ AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
;;
esac
+case ${ac_COMMS} in
+ *-auto)
+ LX_FIND_MPI
+ if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
+ AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
+ AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
+ AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
+ LIBS="`echo $MPI_CXXLDFLAGS | sed -E 's/-L@<:@^ @:>@+//g'` $LIBS";;
+ *)
+ ;;
+esac
-AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
-AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
-AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
+AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
-#
-# RNG selection
-#
+############### RNG selection
AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
- [Select Random Number Generator to be used])],\
- [ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
+ [Select Random Number Generator to be used])],\
+ [ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
+
case ${ac_RNG} in
ranlux48)
- AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
+ AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
;;
mt19937)
- AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
+ AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
;;
*)
- AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
+ AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;;
esac
-#
-# SDE timing mode
-#
-AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
- [Enable system dependent high res timers])],\
- [ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
+############### Timer option
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
+ [Enable system dependent high res timers])],\
+ [ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
+
case ${ac_TIMERS} in
yes)
- AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+ AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
;;
no)
- AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+ AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
- AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
+ AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
-#
-# Chroma regression tests
-#
-AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
+############### Chroma regression test
+AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],
+ [Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
case ${ac_CHROMA} in
- yes)
- echo Enabling tests regressing to Chroma
- ;;
- no)
- echo Disabling tests regressing to Chroma
+ yes|no)
;;
*)
- AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
+ AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
;;
esac
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
-#
-# Lapack
-#
-AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+############### Doxygen
+AC_PROG_DOXYGEN
-case ${ac_LAPACK} in
- yes)
- echo Enabling lapack
- ;;
- no)
- echo Disabling lapack
- ;;
- *)
- echo Enabling lapack at ${ac_LAPACK}
- ;;
-esac
+if test -n "$DOXYGEN"
+then
+AC_CONFIG_FILES([docs/doxy.cfg])
+fi
-AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
-AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
-
-###################################################################
-# Checks for doxygen support
-# if present enables the "make doxyfile" command
-#echo
-#echo Checking doxygen support
-#echo :::::::::::::::::::::::::::::::::::::::::::
-#AC_PROG_DOXYGEN
-
-#if test -n "$DOXYGEN"
-#then
-#AC_CONFIG_FILES([docs/doxy.cfg])
-#fi
-
-echo
-echo Creating configuration files
-echo :::::::::::::::::::::::::::::::::::::::::::
+############### Output
+cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
+AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
+AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
+AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
+AC_SUBST([AM_CFLAGS])
+AC_SUBST([AM_CXXFLAGS])
+AC_SUBST([AM_LDFLAGS])
AC_CONFIG_FILES(Makefile)
AC_CONFIG_FILES(lib/Makefile)
AC_CONFIG_FILES(tests/Makefile)
+AC_CONFIG_FILES(tests/IO/Makefile)
+AC_CONFIG_FILES(tests/core/Makefile)
+AC_CONFIG_FILES(tests/debug/Makefile)
+AC_CONFIG_FILES(tests/forces/Makefile)
+AC_CONFIG_FILES(tests/hmc/Makefile)
+AC_CONFIG_FILES(tests/solver/Makefile)
AC_CONFIG_FILES(tests/qdpxx/Makefile)
AC_CONFIG_FILES(benchmarks/Makefile)
AC_OUTPUT
-
-echo "
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Summary of configuration for $PACKAGE v$VERSION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The following features are enabled:
-
-- architecture (build) : $build_cpu
-- os (build) : $build_os
-- architecture (target) : $target_cpu
-- os (target) : $target_os
-- build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
-- graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
-- Supported SIMD flags : $SIMD_FLAGS
-----------------------------------------------------------
-- enabled simd support : ${ac_SIMD} (config macro says supported: $supported )
-- communications type : ${ac_COMMS}
-- default precision : ${ac_PRECISION}
-- RNG choice : ${ac_RNG}
-- LAPACK : ${ac_LAPACK}
-
-
-"
+----- PLATFORM ----------------------------------------
+architecture (build) : $build_cpu
+os (build) : $build_os
+architecture (target) : $target_cpu
+os (target) : $target_os
+compiler vendor : ${ax_cv_cxx_compiler_vendor}
+compiler version : ${ax_cv_gxx_version}
+----- BUILD OPTIONS -----------------------------------
+SIMD : ${ac_SIMD}
+Threading : ${ac_openmp}
+Communications type : ${comms_type}
+Default precision : ${ac_PRECISION}
+RNG choice : ${ac_RNG}
+GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
+LAPACK : ${ac_LAPACK}
+FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
+build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
+graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
+----- BUILD FLAGS -------------------------------------
+CXXFLAGS:
+`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+LDFLAGS:
+`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+LIBS:
+`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
+-------------------------------------------------------" > config.summary
+echo ""
+cat config.summary
+echo ""
diff --git a/include/Grid b/include/Grid
new file mode 120000
index 00000000..dc598c56
--- /dev/null
+++ b/include/Grid
@@ -0,0 +1 @@
+../lib
\ No newline at end of file
diff --git a/lib/Algorithms.h b/lib/Algorithms.h
index 0a3d34ce..67eb11c3 100644
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -29,27 +29,28 @@ Author: Peter Boyle
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
-#include
-#include
-#include
+#include
+#include
+#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
-#include
+#include
+#include
// Lanczos support
-#include
-#include
+#include
+#include
-#include
+#include
// Eigen/lanczos
// EigCg
diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h
index 2cd8263d..a8b9c53b 100644
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -40,14 +40,6 @@ Author: Peter Boyle
#include
#endif
-#ifdef GRID_COMMS_SHMEM
-extern "C" {
-#include
-extern void * shmem_align(size_t, size_t);
-extern void shmem_free(void *);
-}
-#endif
-
namespace Grid {
////////////////////////////////////////////////////////////////////
@@ -65,28 +57,85 @@ public:
typedef _Tp value_type;
template struct rebind { typedef alignedAllocator<_Tp1> other; };
-
alignedAllocator() throw() { }
-
alignedAllocator(const alignedAllocator&) throw() { }
-
template alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
-
~alignedAllocator() throw() { }
-
pointer address(reference __x) const { return &__x; }
- // const_pointer address(const_reference __x) const { return &__x; }
-
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
+#ifdef HAVE_MM_MALLOC_H
+ _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#else
+ _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#endif
+
+ _Tp tmp;
+#ifdef GRID_NUMA
+#pragma omp parallel for schedule(static)
+ for(int i=0;i<__n;i++){
+ ptr[i]=tmp;
+ }
+#endif
+ return ptr;
+ }
+
+ void deallocate(pointer __p, size_type) {
+#ifdef HAVE_MM_MALLOC_H
+ _mm_free((void *)__p);
+#else
+ free((void *)__p);
+#endif
+ }
+ void construct(pointer __p, const _Tp& __val) { };
+ void construct(pointer __p) { };
+ void destroy(pointer __p) { };
+};
+template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// MPI3 : comms must use shm region
+// SHMEM: comms must use symmetric heap
+//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
-
- _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
-
-
+extern "C" {
+#include
+extern void * shmem_align(size_t, size_t);
+extern void shmem_free(void *);
+}
#define PARANOID_SYMMETRIC_HEAP
+#endif
+
+template
+class commAllocator {
+public:
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+ typedef _Tp* pointer;
+ typedef const _Tp* const_pointer;
+ typedef _Tp& reference;
+ typedef const _Tp& const_reference;
+ typedef _Tp value_type;
+
+ template struct rebind { typedef commAllocator<_Tp1> other; };
+ commAllocator() throw() { }
+ commAllocator(const commAllocator&) throw() { }
+ template commAllocator(const commAllocator<_Tp1>&) throw() { }
+ ~commAllocator() throw() { }
+ pointer address(reference __x) const { return &__x; }
+ size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+#ifdef GRID_COMMS_SHMEM
+ pointer allocate(size_type __n, const void* _p= 0)
+ {
+#ifdef CRAY
+ _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+#else
+ _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
@@ -96,55 +145,47 @@ public:
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
- BACKTRACEFILE();
+ // BACKTRACEFILE();
exit(0);
}
-
assert( bcast == (void *) ptr);
-
#endif
+ return ptr;
+ }
+ void deallocate(pointer __p, size_type) {
+ shmem_free((void *)__p);
+ }
#else
-
+ pointer allocate(size_type __n, const void* _p= 0)
+ {
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif
-
-#endif
- _Tp tmp;
-#undef FIRST_TOUCH_OPTIMISE
-#ifdef FIRST_TOUCH_OPTIMISE
-#pragma omp parallel for
- for(int i=0;i<__n;i++){
- ptr[i]=tmp;
- }
-#endif
return ptr;
}
-
void deallocate(pointer __p, size_type) {
-#ifdef GRID_COMMS_SHMEM
- shmem_free((void *)__p);
-#else
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
-#endif
#endif
}
+#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
-
void destroy(pointer __p) { };
};
+template inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
+template inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
-template inline bool
-operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
-
-template inline bool
-operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
+////////////////////////////////////////////////////////////////////////////////
+// Template typedefs
+////////////////////////////////////////////////////////////////////////////////
+template using Vector = std::vector >;
+template using commVector = std::vector >;
+template using Matrix = std::vector > >;
}; // namespace Grid
#endif
diff --git a/lib/Cartesian.h b/lib/Cartesian.h
index aeffe331..f3710a48 100644
--- a/lib/Cartesian.h
+++ b/lib/Cartesian.h
@@ -28,8 +28,8 @@ Author: Peter Boyle
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H
-#include
-#include
-#include
+#include
+#include
+#include
#endif
diff --git a/lib/Communicator.h b/lib/Communicator.h
index bc3ae166..09ce50dc 100644
--- a/lib/Communicator.h
+++ b/lib/Communicator.h
@@ -28,6 +28,6 @@ Author: Peter Boyle
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
-#include
+#include
#endif
diff --git a/lib/Cshift.h b/lib/Cshift.h
index 675544e2..cd162e35 100644
--- a/lib/Cshift.h
+++ b/lib/Cshift.h
@@ -28,17 +28,25 @@ Author: Peter Boyle
#ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_
-#include
+#include
#ifdef GRID_COMMS_NONE
-#include
+#include
#endif
#ifdef GRID_COMMS_MPI
-#include
+#include
+#endif
+
+#ifdef GRID_COMMS_MPI3
+#include
+#endif
+
+#ifdef GRID_COMMS_MPI3L
+#include
#endif
#ifdef GRID_COMMS_SHMEM
-#include // uses same implementation of communicator
+#include // uses same implementation of communicator
#endif
#endif
diff --git a/lib/FFT.h b/lib/FFT.h
new file mode 100644
index 00000000..b5b31d82
--- /dev/null
+++ b/lib/FFT.h
@@ -0,0 +1,302 @@
+
+ /*************************************************************************************
+
+ Grid physics library, www.github.com/paboyle/Grid
+
+ Source file: ./lib/FFT.h
+
+ Copyright (C) 2015
+
+Author: Peter Boyle
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#ifdef HAVE_FFTW
+#ifdef USE_MKL
+#include
+#else
+#include
+#endif
+#endif
+
+
+namespace Grid {
+
+ template struct FFTW { };
+
+#ifdef HAVE_FFTW
+ template<> struct FFTW {
+ public:
+
+ typedef fftw_complex FFTW_scalar;
+ typedef fftw_plan FFTW_plan;
+
+ static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+ FFTW_scalar *in, const int *inembed,
+ int istride, int idist,
+ FFTW_scalar *out, const int *onembed,
+ int ostride, int odist,
+ int sign, unsigned flags) {
+ return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+ }
+
+ static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+ ::fftw_flops(p,add,mul,fmas);
+ }
+
+ inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+ ::fftw_execute_dft(p,in,out);
+ }
+ inline static void fftw_destroy_plan(const FFTW_plan p) {
+ ::fftw_destroy_plan(p);
+ }
+ };
+
+ template<> struct FFTW {
+ public:
+
+ typedef fftwf_complex FFTW_scalar;
+ typedef fftwf_plan FFTW_plan;
+
+ static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+ FFTW_scalar *in, const int *inembed,
+ int istride, int idist,
+ FFTW_scalar *out, const int *onembed,
+ int ostride, int odist,
+ int sign, unsigned flags) {
+ return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+ }
+
+ static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+ ::fftwf_flops(p,add,mul,fmas);
+ }
+
+ inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+ ::fftwf_execute_dft(p,in,out);
+ }
+ inline static void fftw_destroy_plan(const FFTW_plan p) {
+ ::fftwf_destroy_plan(p);
+ }
+ };
+
+#endif
+
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#endif
+
+ class FFT {
+ private:
+
+ GridCartesian *vgrid;
+ GridCartesian *sgrid;
+
+ int Nd;
+ double flops;
+ double flops_call;
+ uint64_t usec;
+
+ std::vector dimensions;
+ std::vector processors;
+ std::vector processor_coor;
+
+ public:
+
+ static const int forward=FFTW_FORWARD;
+ static const int backward=FFTW_BACKWARD;
+
+ double Flops(void) {return flops;}
+ double MFlops(void) {return flops/usec;}
+ double USec(void) {return (double)usec;}
+
+ FFT ( GridCartesian * grid ) :
+ vgrid(grid),
+ Nd(grid->_ndimension),
+ dimensions(grid->_fdimensions),
+ processors(grid->_processors),
+ processor_coor(grid->_processor_coor)
+ {
+ flops=0;
+ usec =0;
+ std::vector layout(Nd,1);
+ sgrid = new GridCartesian(dimensions,layout,processors);
+ };
+
+ ~FFT ( void) {
+ delete sgrid;
+ }
+
+ template
+ void FFT_dim_mask(Lattice &result,const Lattice &source,std::vector mask,int sign){
+
+ conformable(result._grid,vgrid);
+ conformable(source._grid,vgrid);
+ Lattice tmp(vgrid);
+ tmp = source;
+ for(int d=0;d
+ void FFT_all_dim(Lattice &result,const Lattice &source,int sign){
+ std::vector mask(Nd,1);
+ FFT_dim_mask(result,source,mask,sign);
+ }
+
+
+ template
+ void FFT_dim(Lattice &result,const Lattice &source,int dim, int sign){
+#ifndef HAVE_FFTW
+ assert(0);
+#else
+ conformable(result._grid,vgrid);
+ conformable(source._grid,vgrid);
+
+ int L = vgrid->_ldimensions[dim];
+ int G = vgrid->_fdimensions[dim];
+
+ std::vector layout(Nd,1);
+ std::vector pencil_gd(vgrid->_fdimensions);
+
+ pencil_gd[dim] = G*processors[dim];
+
+ // Pencil global vol LxLxGxLxL per node
+ GridCartesian pencil_g(pencil_gd,layout,processors);
+
+ // Construct pencils
+ typedef typename vobj::scalar_object sobj;
+ typedef typename sobj::scalar_type scalar;
+
+ Lattice pgbuf(&pencil_g);
+
+
+ typedef typename FFTW::FFTW_scalar FFTW_scalar;
+ typedef typename FFTW::FFTW_plan FFTW_plan;
+
+ int Ncomp = sizeof(sobj)/sizeof(scalar);
+ int Nlow = 1;
+ for(int d=0;d_ldimensions[d];
+ }
+
+ int rank = 1; /* 1d transforms */
+ int n[] = {G}; /* 1d transforms of length G */
+ int howmany = Ncomp;
+ int odist,idist,istride,ostride;
+ idist = odist = 1; /* Distance between consecutive FT's */
+ istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+ int *inembed = n, *onembed = n;
+
+ scalar div;
+ if ( sign == backward ) div = 1.0/G;
+ else if ( sign == forward ) div = 1.0;
+ else assert(0);
+
+ FFTW_plan p;
+ {
+ FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
+ FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
+ p = FFTW::fftw_plan_many_dft(rank,n,howmany,
+ in,inembed,
+ istride,idist,
+ out,onembed,
+ ostride, odist,
+ sign,FFTW_ESTIMATE);
+ }
+
+ // Barrel shift and collect global pencil
+ std::vector lcoor(Nd), gcoor(Nd);
+ result = source;
+ for(int p=0;p cbuf(Nd);
+ sobj s;
+
+ PARALLEL_FOR_LOOP_INTERN
+ for(int idx=0;idxlSites();idx++) {
+ sgrid->LocalIndexToLocalCoor(idx,cbuf);
+ peekLocalSite(s,result,cbuf);
+ cbuf[dim]+=p*L;
+ pokeLocalSite(s,pgbuf,cbuf);
+ }
+ }
+ result = Cshift(result,dim,L);
+ }
+
+ // Loop over orthog coords
+ int NN=pencil_g.lSites();
+ GridStopWatch timer;
+ timer.Start();
+ PARALLEL_REGION
+ {
+ std::vector cbuf(Nd);
+
+ PARALLEL_FOR_LOOP_INTERN
+ for(int idx=0;idx::fftw_execute_dft(p,in,out);
+ }
+ }
+ }
+ timer.Stop();
+
+ // performance counting
+ double add,mul,fma;
+ FFTW::fftw_flops(p,&add,&mul,&fma);
+ flops_call = add+mul+2.0*fma;
+ usec += timer.useconds();
+ flops+= flops_call*NN;
+
+ // writing out result
+ int pc = processor_coor[dim];
+ PARALLEL_REGION
+ {
+ std::vector clbuf(Nd), cgbuf(Nd);
+ sobj s;
+
+ PARALLEL_FOR_LOOP_INTERN
+ for(int idx=0;idxlSites();idx++) {
+ sgrid->LocalIndexToLocalCoor(idx,clbuf);
+ cgbuf = clbuf;
+ cgbuf[dim] = clbuf[dim]+L*pc;
+ peekLocalSite(s,pgbuf,cgbuf);
+ s = s * div;
+ pokeLocalSite(s,result,clbuf);
+ }
+ }
+
+ // destroying plan
+ FFTW::fftw_destroy_plan(p);
+#endif
+ }
+ };
+}
+
+#endif
diff --git a/lib/Grid.h b/lib/Grid.h
index eb2be1d1..0c5983f3 100644
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -59,29 +59,30 @@ Author: paboyle
///////////////////
// Grid headers
///////////////////
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include