diff --git a/.travis.yml b/.travis.yml
index 64dae823..7d8203ce 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,68 +9,6 @@ matrix:
- os: osx
osx_image: xcode8.3
compiler: clang
- - compiler: gcc
- dist: trusty
- sudo: required
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.9
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: VERSION=-4.9
- - compiler: gcc
- dist: trusty
- sudo: required
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-5
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: VERSION=-5
- - compiler: clang
- dist: trusty
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.8
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- - compiler: clang
- dist: trusty
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.8
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install:
- export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- - echo make clean
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-
diff --git a/README.md b/README.md
index 9432abe1..13dd6996 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,44 @@
-# Grid
-
-
- Last stable release |
-
-
- |
-
-
- Development branch |
-
-
- |
-
-
+# Grid [Teamcity status](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [Travis status](https://travis-ci.org/paboyle/Grid)
**Data parallel C++ mathematical object library.**
License: GPL v2.
-Last update Nov 2016.
+Last update June 2017.
_Please do not send pull requests to the `master` branch which is reserved for releases._
+
+
+### Description
+This library provides data parallel C++ container classes with internal memory layout
+that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
+are provided, similar to HPF and cmfortran, and user control is given over the mapping of
+array indices to both MPI tasks and SIMD processing elements.
+
+* Identically shaped arrays can then be processed with perfect data parallelisation.
+* Such identically shaped arrays are called conformable arrays.
+
+The transformation is based on the observation that Cartesian array processing involves
+identical operations being performed on different regions of the Cartesian array.
+
+The library geometrically decomposes the problem both into MPI tasks and across SIMD lanes.
+Local vector loops are parallelised with OpenMP pragmas.
+
+Data parallel array operations can then be specified with a SINGLE data parallel paradigm, while
+making optimal use of MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
+for most programmers.
+
+The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
+Presently SSE4, ARM NEON (128 bits), AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
+
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+
+MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
+
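+As a minimal sketch of the intended usage pattern (illustrative only; it reuses types and calls
+that appear in the benchmarks of this repository, with default lattice/MPI geometry assumed):
+
+``` c++
+#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  // lattice size, SIMD layout and MPI decomposition taken from the defaults / command line
+  GridCartesian Grid(GridDefaultLatt(),
+                     GridDefaultSimd(Nd,vComplex::Nsimd()),
+                     GridDefaultMpi());
+
+  GridParallelRNG pRNG(&Grid);
+  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+  // three conformable arrays: identically shaped, identically decomposed
+  LatticeColourMatrix x(&Grid); random(pRNG,x);
+  LatticeColourMatrix y(&Grid); random(pRNG,y);
+  LatticeColourMatrix z(&Grid);
+
+  // one data parallel expression; MPI decomposed, OpenMP threaded and SIMD vectorised internally
+  z = x*y;
+
+  Grid_finalize();
+  return 0;
+}
+```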
+
### Compilers
Intel ICPC v16.0.3 and later
@@ -56,35 +73,25 @@ When you file an issue, please go through the following checklist:
6. Attach the output of `make V=1`.
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
+### Required libraries
+Grid requires:
+[GMP](https://gmplib.org/),
-### Description
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
+[MPFR](http://www.mpfr.org/)
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shaped arrays are called conformable arrays.
+Bootstrapping Grid downloads the Eigen library, which is used internally for dense matrix (non-QCD) operations.
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
+Grid optionally uses:
-The library will both geometrically decompose into MPI tasks and across SIMD lanes.
-Local vector loops are parallelised with OpenMP pragmas.
+[HDF5](https://support.hdfgroup.org/HDF5/)
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
+[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support.
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
+[FFTW](http://www.fftw.org), either the generic version or via the Intel MKL library.
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+LAPACK, either the generic version or the Intel MKL library.
-MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see https://arxiv.org/abs/1512.03487 for more detail.
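+
+For example, on a Debian/Ubuntu system the compile-time prerequisites can typically be installed from
+distribution packages; the package names below mirror those previously used by the project's Travis CI
+setup and may differ on other distributions (HDF5, LIME and FFTW packages are not included in this list):
+
+``` bash
+# illustrative package list; adjust names for your distribution
+sudo apt-get install g++ libgmp-dev libmpfr-dev libmpc-dev \
+                     libopenmpi-dev openmpi-bin binutils-dev
+```
+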
### Quick start
First, start by cloning the repository:
@@ -155,7 +162,6 @@ The following options can be use with the `--enable-comms=` option to target dif
| `none` | no communications |
| `mpi[-auto]` | MPI communications |
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
-| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
| `shmem ` | Cray SHMEM communications |
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
@@ -173,7 +179,8 @@ The following options can be use with the `--enable-simd=` option to target diff
| `AVXFMA4` | AVX (256 bit) + FMA4 |
| `AVX2` | AVX 2 (256 bit) |
| `AVX512` | AVX 512 bit |
-| `QPX` | QPX (256 bit) |
+| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
+| `QPX` | IBM QPX (256 bit) |
Alternatively, some CPU codenames can be directly used:
@@ -195,21 +202,205 @@ The following configuration is recommended for the Intel Knights Landing platfor
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
- --enable-comms=mpi-auto \
- --with-gmp= \
- --with-mpfr= \
+ --enable-comms=mpi-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
-where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi \
- --with-gmp= \
- --with-mpfr= \
--enable-mkl \
CXX=CC CC=cc
-```
\ No newline at end of file
+```
+
+If GMP and MPFR are NOT installed in standard places (`/usr/`) these flags may be needed:
+``` bash
+    --with-gmp=<path> \
+    --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Knights Landing with Intel Omni-Path adapters, with two adapters per node,
+presently performs better when more than one rank per node is used, with shared memory
+used for intra-node communication. This is the mpi3 communications implementation.
+We recommend four ranks per node for best performance, but the optimum depends on the local volume;
+an example launch command is sketched after the configure invocation below.
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=KNL \
+ --enable-comms=mpi3-auto \
+ --enable-mkl \
+      CXX=icpc MPICXX=mpiicpc
+```
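+
+As a sketch only (the binary, local volume and thread count are illustrative assumptions to be tuned),
+a four-rank-per-node launch on a single KNL node could then look like:
+
+``` bash
+# illustrative: 4 ranks on one node, shared memory (mpi3) used for intra-node communication
+mpirun -n 4 ./benchmarks/Benchmark_dwf --mpi 1.1.2.2 --grid 16.16.32.32 --threads 16
+```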
+
+### Build setup for Intel Haswell Xeon platform
+
+The following configuration is recommended for the Intel Haswell platform:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX2 \
+ --enable-comms=mpi3-auto \
+ --enable-mkl \
+ CXX=icpc MPICXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If GMP and MPFR are NOT installed in standard places (`/usr/`) these flags may be needed:
+``` bash
+    --with-gmp=<path> \
+    --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX2 \
+ --enable-comms=mpi3 \
+ --enable-mkl \
+ CXX=CC CC=cc
+```
+Since dual socket nodes are commonplace, we recommend MPI-3 as the default, with the use of
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+``` bash
+ export I_MPI_PIN=1
+```
+This is the default behaviour of the Intel MPI library.
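+
+For example (a sketch; the benchmark binary, volume and thread count are assumptions), a dual socket
+node would then be driven with one rank per socket:
+
+``` bash
+export I_MPI_PIN=1        # default for Intel MPI, shown explicitly
+mpirun -n 2 ./benchmarks/Benchmark_dwf --mpi 2.1.1.1 --grid 32.16.16.16 --threads 16
+```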
+
+### Build setup for Intel Skylake Xeon platform
+
+The following configuration is recommended for the Intel Skylake platform:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX512 \
+ --enable-comms=mpi3 \
+ --enable-mkl \
+ CXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If GMP and MPFR are NOT installed in standard places (`/usr/`) these flags may be needed:
+``` bash
+    --with-gmp=<path> \
+    --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX512 \
+ --enable-comms=mpi3 \
+ --enable-mkl \
+ CXX=CC CC=cc
+```
+Since dual socket nodes are commonplace, we recommend MPI-3 as the default, with the use of
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+``` bash
+ export I_MPI_PIN=1
+```
+This is the default behaviour of the Intel MPI library.
+
+#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping:
+
+``` bash
+mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
+```
+
+TBA
+
+
+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips, each with 8 cores.
+So, even a single socket node contains a quad-chip module. Dual socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain.
+There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended with the use of four ranks per socket,
+and 8 threads per rank.
+
+The following configuration is recommended for the AMD EPYC platform.
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX2 \
+ --enable-comms=mpi3 \
+ CXX=mpicxx
+```
+
+If GMP and MPFR are NOT installed in standard places (`/usr/`) these flags may be needed:
+``` bash
+    --with-gmp=<path> \
+    --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking mpirun on a wrapper script, `omp_bind.sh`, which sets the affinity for each rank.
+
+It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank, using MPI3 and
+shared memory to communicate within the node:
+
+``` bash
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+```
+
+where `omp_bind.sh` does the following:
+``` bash
+#!/bin/bash
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+"$@"
+```
+
+Performance:
+
+#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping:
+
+``` bash
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
+```
+
+TBA
+
+### Build setup for BlueGene/Q
+
+To be written...
+
+### Build setup for ARM Neon
+
+To be written...
+
+### Build setup for laptops, other compilers, non-cluster builds
+
+Many versions of g++ and clang++ work with Grid; building with them merely involves replacing CXX (and MPICXX)
+and omitting the `--enable-mkl` flag. A complete single node configure example is sketched at the end of this section.
+
+Single node builds are enabled with
+``` bash
+ --enable-comms=none
+```
+
+FFTW support that is not in the default search path may then be enabled with
+``` bash
+ --with-fftw=<path>
+```
+
+BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
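+
+Putting the pieces together, a single node laptop build might be configured as follows (a sketch only;
+the compiler, SIMD target and FFTW prefix are assumptions to be adapted to the machine at hand):
+
+``` bash
+../configure --enable-precision=double \
+             --enable-simd=AVX2 \
+             --enable-comms=none \
+             --with-fftw=<path> \
+             CXX=clang++
+```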
+
diff --git a/TODO b/TODO
index 672879cd..c37cbf8b 100644
--- a/TODO
+++ b/TODO
@@ -1,23 +1,32 @@
TODO:
---------------
-Peter's work list:
-2)- Precision conversion and sort out localConvert <--
-3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started
-4)- Binary I/O speed up & x-strips
--- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
--- Physical propagator interface
--- Conserved currents
--- GaugeFix into central location
--- Multigrid Wilson and DWF, compare to other Multigrid implementations
--- HDCR resume
+Large item work list:
+
+1)- BG/Q port and check
+2)- Christoph's local basis expansion Lanczos
+3)- Precision conversion and sort out localConvert <-- partial
+
+ - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume
Recent DONE
+
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE
+-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
+-- GaugeFix into central location <-- DONE
+-- Scidac and Ildg metadata handling <-- DONE
+-- Binary I/O MPI2 IO <-- DONE
+-- Binary I/O speed up & x-strips <-- DONE
-- Cut down the exterior overhead <-- DONE
-- Interior legs from SHM comms <-- DONE
-- Half-precision comms <-- DONE
--- Merge high precision reduction into develop
--- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- Merge high precision reduction into develop <-- DONE
+-- BlockCG, BCGrQ <-- DONE
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
-- slice* linalg routines for multiRHS, BlockCG
-----
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
new file mode 100644
index 00000000..c0ce451f
--- /dev/null
+++ b/benchmarks/Benchmark_ITT.cc
@@ -0,0 +1,775 @@
+ /*************************************************************************************
+
+ Grid physics library, www.github.com/paboyle/Grid
+
+ Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
+
+ Copyright (C) 2015
+
+Author: Peter Boyle
+Author: paboyle
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
+
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
+
+struct time_statistics{
+ double mean;
+ double err;
+ double min;
+ double max;
+
+  void statistics(std::vector<double> v){
+ double sum = std::accumulate(v.begin(), v.end(), 0.0);
+ mean = sum / v.size();
+
+    std::vector<double> diff(v.size());
+ std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+ double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+ err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
+
+ auto result = std::minmax_element(v.begin(), v.end());
+ min = *result.first;
+ max = *result.second;
+}
+};
+
+void comms_header(){
+ std::cout < simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+ std::vector mpi_layout = GridDefaultMpi();
+
+ for(int mu=0;mu1) nmu++;
+
+ std::vector t_time(Nloop);
+ time_statistics timestat;
+
+ std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
+
+  std::vector<HalfSpinColourVectorD *> xbuf(8);
+  std::vector<HalfSpinColourVectorD *> rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
+
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ int ncomm;
+ double dbytes;
+ std::vector times(Nloop);
+ for(int i=0;i1 ) {
+
+ int xmit_to_rank;
+ int recv_from_rank;
+ if ( dir == mu ) {
+ int comm_proc=1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ } else {
+ int comm_proc = mpi_layout[mu]-1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ }
+ tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+ (void *)&rbuf[dir][0], recv_from_rank,
+ bytes,dir);
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
+ ncomm++;
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
+ dbytes+=tbytes;
+ }
+ }
+ Grid.Barrier();
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+ }
+
+ timestat.statistics(t_time);
+ // for(int i=0;i > LatticeVec;
+ typedef iVector Vec;
+
+ std::vector simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
+ std::vector mpi_layout = GridDefaultMpi();
+
+ std::cout<({45,12,81,9}));
+ for(int lat=8;lat<=lmax;lat+=4){
+
+ std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+ Vec rn ; random(sRNG,rn);
+
+ LatticeVec z(&Grid); z=rn;
+ LatticeVec x(&Grid); x=rn;
+ LatticeVec y(&Grid); y=rn;
+ double a=2.0;
+
+ uint64_t Nloop=NLOOP;
+
+ double start=usecond();
+ for(int i=0;i mflops_all;
+
+ ///////////////////////////////////////////////////////
+ // Set/Get the layout & grid size
+ ///////////////////////////////////////////////////////
+ int threads = GridThread::GetThreads();
+ std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4);
+ std::vector local({L,L,L,L});
+
+ GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}),
+ GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+ uint64_t NP = TmpGrid->RankCount();
+ uint64_t NN = TmpGrid->NodeCount();
+ NN_global=NN;
+ uint64_t SHM=NP/NN;
+
+ std::vector internal;
+ if ( SHM == 1 ) internal = std::vector({1,1,1,1});
+ else if ( SHM == 2 ) internal = std::vector({2,1,1,1});
+ else if ( SHM == 4 ) internal = std::vector({2,2,1,1});
+ else if ( SHM == 8 ) internal = std::vector({2,2,2,1});
+ else assert(0);
+
+ std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+ std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+ ///////// Welcome message ////////////
+ std::cout< seeds4({1,2,3,4});
+ std::vector seeds5({5,6,7,8});
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ ///////// Source preparation ////////////
+ LatticeFermion src (sFGrid); random(RNG5,src);
+ LatticeFermion tmp (sFGrid);
+
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+
+ WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+ LatticeFermion src_e (sFrbGrid);
+ LatticeFermion src_o (sFrbGrid);
+ LatticeFermion r_e (sFrbGrid);
+ LatticeFermion r_o (sFrbGrid);
+ LatticeFermion r_eo (sFGrid);
+ LatticeFermion err (sFGrid);
+ {
+
+ pickCheckerboard(Even,src_e,src);
+ pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512)
+ const int num_cases = 6;
+ std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
+#else
+ const int num_cases = 4;
+ std::string fmt("U/S ; U/O ; G/S ; G/O ");
+#endif
+ controls Cases [] = {
+#ifdef AVX512
+ { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+#endif
+ { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
+ };
+
+ for(int c=0;cBarrier();
+ for(int i=0;iBarrier();
+ double t1=usecond();
+ // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+ // if (ncall < 500) ncall = 500;
+ uint64_t ncall = 500;
+
+ sFGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+ // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall);
+ for(uint64_t i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops;
+ if ( mflops mflops_all;
+
+ ///////////////////////////////////////////////////////
+ // Set/Get the layout & grid size
+ ///////////////////////////////////////////////////////
+ int threads = GridThread::GetThreads();
+ std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4);
+ std::vector local({L,L,L,L});
+
+ GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}),
+ GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+ uint64_t NP = TmpGrid->RankCount();
+ uint64_t NN = TmpGrid->NodeCount();
+ NN_global=NN;
+ uint64_t SHM=NP/NN;
+
+ std::vector internal;
+ if ( SHM == 1 ) internal = std::vector({1,1,1,1});
+ else if ( SHM == 2 ) internal = std::vector({2,1,1,1});
+ else if ( SHM == 4 ) internal = std::vector({2,2,1,1});
+ else if ( SHM == 8 ) internal = std::vector({2,2,2,1});
+ else assert(0);
+
+ std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+ std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+ ///////// Welcome message ////////////
+ std::cout< seeds4({1,2,3,4});
+ std::vector seeds5({5,6,7,8});
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ ///////// Source preparation ////////////
+ LatticeFermion src (FGrid); random(RNG5,src);
+ LatticeFermion ref (FGrid);
+ LatticeFermion tmp (FGrid);
+
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+
+ DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+ ////////////////////////////////////
+ // Naive wilson implementation
+ ////////////////////////////////////
+ {
+ LatticeGaugeField Umu5d(FGrid);
+ std::vector U(4,FGrid);
+ for(int ss=0;ssoSites();ss++){
+ for(int s=0;s(Umu5d,mu);
+ }
+ for(int mu=0;muBarrier();
+ for(int i=0;iBarrier();
+ double t1=usecond();
+ // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+ // if (ncall < 500) ncall = 500;
+ uint64_t ncall = 1000;
+
+ FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+ // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall);
+ for(uint64_t i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops;
+ if ( mflops({8,2,2,2});
+#else
+ LebesgueOrder::Block = std::vector({2,2,2,2});
+#endif
+ Benchmark::Decomposition();
+
+ int do_memory=1;
+ int do_comms =1;
+ int do_su3 =0;
+ int do_wilson=1;
+ int do_dwf =1;
+
+ if ( do_memory ) {
+ std::cout< L_list({8,12,16,24});
+ std::vector wilson;
+ std::vector dwf4;
+ std::vector dwf5;
+
+ if ( do_wilson ) {
+ int Ls=1;
+ std::cout<1) nmu++;
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
std::cout< latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -88,12 +88,20 @@ int main (int argc, char ** argv)
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
- std::vector > xbuf(8,std::vector(lat*lat*lat*Ls));
- std::vector > rbuf(8,std::vector(lat*lat*lat*Ls));
+ std::vector > xbuf(8);
+ std::vector > rbuf(8);
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ for(int mu=0;mu<8;mu++){
+ xbuf[mu].resize(lat*lat*lat*Ls);
+ rbuf[mu].resize(lat*lat*lat*Ls);
+ // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
- std::vector > xbuf(8,std::vector(lat*lat*lat*Ls));
- std::vector > rbuf(8,std::vector(lat*lat*lat*Ls));
+ std::vector > xbuf(8);
+ std::vector > rbuf(8);
+ for(int mu=0;mu<8;mu++){
+ xbuf[mu].resize(lat*lat*lat*Ls);
+ rbuf[mu].resize(lat*lat*lat*Ls);
+ // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -251,6 +266,9 @@ int main (int argc, char ** argv)
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
  std::vector<HalfSpinColourVectorD *> xbuf(8);
  std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -258,59 +276,66 @@ int main (int argc, char ** argv)
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ double dbytes;
for(int i=0;i requests;
- ncomm=0;
for(int mu=0;mu<4;mu++){
+
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
-
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
- Grid.StencilSendToRecvFromBegin(requests,
- (void *)&xbuf[mu][0],
- xmit_to_rank,
- (void *)&rbuf[mu][0],
- recv_from_rank,
- bytes);
+ dbytes+=
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu][0],
+ recv_from_rank,
+ bytes,mu);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
- Grid.StencilSendToRecvFromBegin(requests,
- (void *)&xbuf[mu+4][0],
- xmit_to_rank,
- (void *)&rbuf[mu+4][0],
- recv_from_rank,
- bytes);
+ dbytes+=
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu+4][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu+4][0],
+ recv_from_rank,
+ bytes,mu+4);
}
}
- Grid.StencilSendToRecvFromComplete(requests);
+ Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
- double stop=usecond();
- t_time[i] = stop-start; // microseconds
-
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+
}
timestat.statistics(t_time);
- double dbytes = bytes;
- double xbytes = dbytes*2.0*ncomm;
- double rbytes = xbytes;
- double bidibytes = xbytes+rbytes;
+ dbytes=dbytes*ppn;
+ double xbytes = dbytes*0.5;
+ double rbytes = dbytes*0.5;
+ double bidibytes = dbytes;
std::cout< latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -338,6 +363,9 @@ int main (int argc, char ** argv)
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
  std::vector<HalfSpinColourVectorD *> xbuf(8);
  std::vector<HalfSpinColourVectorD *> rbuf(8);
@@ -345,16 +373,18 @@ int main (int argc, char ** argv)
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-
+ double dbytes;
for(int i=0;i requests;
-
+ dbytes=0;
ncomm=0;
for(int mu=0;mu<4;mu++){
@@ -366,41 +396,43 @@ int main (int argc, char ** argv)
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
- Grid.StencilSendToRecvFromBegin(requests,
- (void *)&xbuf[mu][0],
- xmit_to_rank,
- (void *)&rbuf[mu][0],
- recv_from_rank,
- bytes);
- Grid.StencilSendToRecvFromComplete(requests);
+ dbytes+=
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu][0],
+ recv_from_rank,
+ bytes,mu);
+ Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
- Grid.StencilSendToRecvFromBegin(requests,
- (void *)&xbuf[mu+4][0],
- xmit_to_rank,
- (void *)&rbuf[mu+4][0],
- recv_from_rank,
- bytes);
- Grid.StencilSendToRecvFromComplete(requests);
+ dbytes+=
+ Grid.StencilSendToRecvFromBegin(requests,
+ (void *)&xbuf[mu+4][0],
+ xmit_to_rank,
+ (void *)&rbuf[mu+4][0],
+ recv_from_rank,
+ bytes,mu+4);
+ Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
}
}
- Grid.Barrier();
- double stop=usecond();
- t_time[i] = stop-start; // microseconds
-
+ Grid.Barrier();
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+
}
timestat.statistics(t_time);
- double dbytes = bytes;
- double xbytes = dbytes*2.0*ncomm;
- double rbytes = xbytes;
- double bidibytes = xbytes+rbytes;
+ dbytes=dbytes*ppn;
+ double xbytes = dbytes*0.5;
+ double rbytes = dbytes*0.5;
+ double bidibytes = dbytes;
std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
+
+  std::vector<HalfSpinColourVectorD *> xbuf(8);
+  std::vector<HalfSpinColourVectorD *> rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
+
+ int ncomm;
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ double dbytes;
+ for(int i=0;i requests;
+ dbytes=0;
+ ncomm=0;
+
+ parallel_for(int dir=0;dir<8;dir++){
+
+ double tbytes;
+ int mu =dir % 4;
+
+ if (mpi_layout[mu]>1 ) {
+
+ ncomm++;
+ int xmit_to_rank;
+ int recv_from_rank;
+ if ( dir == mu ) {
+ int comm_proc=1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ } else {
+ int comm_proc = mpi_layout[mu]-1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ }
+
+ tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+ (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+
+#pragma omp atomic
+ dbytes+=tbytes;
+ }
+ }
+ Grid.Barrier();
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+ }
+
+ timestat.statistics(t_time);
+
+ dbytes=dbytes*ppn;
+ double xbytes = dbytes*0.5;
+ double rbytes = dbytes*0.5;
+ double bidibytes = dbytes;
+
+
+ std::cout<Barrier();
Dw.ZeroCounters();
@@ -302,6 +302,7 @@ int main (int argc, char ** argv)
std::cout<< "sD ERR \n " << err < latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
uint64_t Nloop=NLOOP;
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
double a=2.0;
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
- double bytes=3*vol*Nvec*sizeof(Real);
+ double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
double a=2.0;
uint64_t Nloop=NLOOP;
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
- double bytes=3*vol*Nvec*sizeof(Real);
+ double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
RealD a=2.0;
@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;
- double bytes=2*vol*Nvec*sizeof(Real);
+ double bytes=2.0*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*1;// mul
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
RealD a=2.0;
Real nn;
double start=usecond();
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;
- double bytes=vol*Nvec*sizeof(Real);
+ double bytes=1.0*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*2;// mul,add
std::cout< simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector mpi_layout = GridDefaultMpi();
- int threads = GridThread::GetThreads();
+ int64_t threads = GridThread::GetThreads();
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid);// random(pRNG,z);
- LatticeColourMatrix x(&Grid);// random(pRNG,x);
- LatticeColourMatrix y(&Grid);// random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid); //random(pRNG,z);
- LatticeColourMatrix x(&Grid); //random(pRNG,x);
- LatticeColourMatrix y(&Grid); //random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid); //random(pRNG,z);
- LatticeColourMatrix x(&Grid); //random(pRNG,x);
- LatticeColourMatrix y(&Grid); //random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid); //random(pRNG,z);
- LatticeColourMatrix x(&Grid); //random(pRNG,x);
- LatticeColourMatrix y(&Grid); //random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i]])
AC_CHECK_DECLS([be64toh],[], [], [[#include ]])
+############## Standard libraries
+AC_CHECK_LIB([m],[cos])
+AC_CHECK_LIB([stdc++],[abort])
+
############### GMP and MPFR
AC_ARG_WITH([gmp],
[AS_HELP_STRING([--with-gmp=prefix],
@@ -184,6 +192,15 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
In order to use ILDG file format please install or provide the correct path to your installation
Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
+AC_SEARCH_LIBS([crc32], [z],
+ [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
+ [have_zlib=true] [LIBS="${LIBS} -lz"],
+ [AC_MSG_ERROR(zlib library was not found in your system.)])
+
+AC_SEARCH_LIBS([move_pages], [numa],
+ [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+ [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+ [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
@@ -237,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
SIMD_FLAGS='';;
KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+ AC_DEFINE([KNL],[1],[Knights landing processor])
SIMD_FLAGS='-march=knl';;
GEN)
AC_DEFINE([GEN],[1],[generic vector code])
@@ -244,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
[generic SIMD vector width (in bytes)])
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
SIMD_FLAGS='';;
+ NEONv8)
+ AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
+ SIMD_FLAGS='-march=armv8-a';;
QPX|BGQ)
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
SIMD_FLAGS='';;
@@ -272,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
SIMD_FLAGS='';;
KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
+ AC_DEFINE([KNL],[1],[Knights landing processor])
SIMD_FLAGS='-xmic-avx512';;
GEN)
AC_DEFINE([GEN],[1],[generic vector code])
@@ -320,14 +342,14 @@ case ${ac_COMMS} in
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
comms_type='none'
;;
- mpi3l*)
- AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
- comms_type='mpi3l'
- ;;
mpi3*)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
comms_type='mpi3'
;;
+ mpit)
+ AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
+ comms_type='mpit'
+ ;;
mpi*)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
comms_type='mpi'
@@ -355,7 +377,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT, [ test "${comms_type}X" == "mpitX" ] )
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
############### RNG selection
diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
index b36745e3..4e1e62f7 100644
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -8,6 +8,7 @@
#include
#include
#include
+#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
#include
#include
#include
@@ -23,4 +24,3 @@
#include
#include
#include
-#include <Grid/Hadrons/Modules/Quark.hpp>
diff --git a/extras/Hadrons/Modules/Quark.hpp b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
similarity index 65%
rename from extras/Hadrons/Modules/Quark.hpp
rename to extras/Hadrons/Modules/MFermion/GaugeProp.hpp
index cf7d4c28..b4f9edcc 100644
--- a/extras/Hadrons/Modules/Quark.hpp
+++ b/extras/Hadrons/Modules/MFermion/GaugeProp.hpp
@@ -1,34 +1,5 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: extras/Hadrons/Modules/Quark.hpp
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-
-#ifndef Hadrons_Quark_hpp_
-#define Hadrons_Quark_hpp_
+#ifndef Hadrons_MFermion_GaugeProp_hpp_
+#define Hadrons_MFermion_GaugeProp_hpp_
#include
#include
@@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
- * TQuark *
+ * GaugeProp *
******************************************************************************/
-class QuarkPar: Serializable
+BEGIN_MODULE_NAMESPACE(MFermion)
+
+class GaugePropPar: Serializable
{
public:
- GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
+ GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
std::string, source,
std::string, solver);
};
template
-class TQuark: public Module
+class TGaugeProp: public Module
{
public:
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
- TQuark(const std::string name);
+ TGaugeProp(const std::string name);
// destructor
- virtual ~TQuark(void) = default;
- // dependencies/products
+ virtual ~TGaugeProp(void) = default;
+ // dependency relation
virtual std::vector getInput(void);
virtual std::vector getOutput(void);
// setup
@@ -69,20 +42,20 @@ private:
SolverFn *solver_{nullptr};
};
-MODULE_REGISTER(Quark, TQuark);
+MODULE_REGISTER_NS(GaugeProp, TGaugeProp, MFermion);
/******************************************************************************
- * TQuark implementation *
+ * TGaugeProp implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template
-TQuark::TQuark(const std::string name)
-: Module(name)
+TGaugeProp::TGaugeProp(const std::string name)
+: Module(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template
-std::vector TQuark::getInput(void)
+std::vector TGaugeProp::getInput(void)
{
std::vector in = {par().source, par().solver};
@@ -90,7 +63,7 @@ std::vector TQuark::getInput(void)
}
template
-std::vector TQuark::getOutput(void)
+std::vector TGaugeProp::getOutput(void)
{
std::vector out = {getName(), getName() + "_5d"};
@@ -99,7 +72,7 @@ std::vector TQuark::getOutput(void)
// setup ///////////////////////////////////////////////////////////////////////
template
-void TQuark::setup(void)
+void TGaugeProp::setup(void)
{
Ls_ = env().getObjectLs(par().solver);
env().template registerLattice(getName());
@@ -111,13 +84,13 @@ void TQuark::setup(void)
// execution ///////////////////////////////////////////////////////////////////
template
-void TQuark::execute(void)
+void TGaugeProp::execute(void)
{
LOG(Message) << "Computing quark propagator '" << getName() << "'"
- << std::endl;
+ << std::endl;
FermionField source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
- tmp(env().getGrid());
+ tmp(env().getGrid());
std::string propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
PropagatorField &prop = *env().template createLattice(propName);
PropagatorField &fullSrc = *env().template getObject(par().source);
@@ -128,12 +101,12 @@ void TQuark::execute(void)
}
LOG(Message) << "Inverting using solver '" << par().solver
- << "' on source '" << par().source << "'" << std::endl;
+ << "' on source '" << par().source << "'" << std::endl;
for (unsigned int s = 0; s < Ns; ++s)
for (unsigned int c = 0; c < Nc; ++c)
{
LOG(Message) << "Inversion for spin= " << s << ", color= " << c
- << std::endl;
+ << std::endl;
// source conversion for 4D sources
if (!env().isObject5d(par().source))
{
@@ -170,7 +143,7 @@ void TQuark::execute(void)
if (Ls_ > 1)
{
PropagatorField &p4d =
- *env().template getObject(getName());
+ *env().template getObject(getName());
axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
@@ -180,6 +153,8 @@ void TQuark::execute(void)
}
}
+END_MODULE_NAMESPACE
+
END_HADRONS_NAMESPACE
-#endif // Hadrons_Quark_hpp_
+#endif // Hadrons_MFermion_GaugeProp_hpp_
diff --git a/extras/Hadrons/Modules/MGauge/Load.cc b/extras/Hadrons/Modules/MGauge/Load.cc
index e5ee8abb..062e7e98 100644
--- a/extras/Hadrons/Modules/MGauge/Load.cc
+++ b/extras/Hadrons/Modules/MGauge/Load.cc
@@ -65,7 +65,7 @@ void TLoad::setup(void)
// execution ///////////////////////////////////////////////////////////////////
void TLoad::execute(void)
{
- NerscField header;
+ FieldMetaData header;
std::string fileName = par().file + "."
+ std::to_string(env().getTrajectory());
@@ -74,5 +74,5 @@ void TLoad::execute(void)
LatticeGaugeField &U = *env().createLattice(getName());
NerscIO::readConfiguration(U, header, fileName);
LOG(Message) << "NERSC header:" << std::endl;
- dump_nersc_header(header, LOG(Message));
+ dump_meta_data(header, LOG(Message));
}
diff --git a/extras/Hadrons/modules.inc b/extras/Hadrons/modules.inc
index e324d3ec..29d63f58 100644
--- a/extras/Hadrons/modules.inc
+++ b/extras/Hadrons/modules.inc
@@ -21,6 +21,7 @@ modules_hpp =\
Modules/MContraction/WeakHamiltonianEye.hpp \
Modules/MContraction/WeakHamiltonianNonEye.hpp \
Modules/MContraction/WeakNeutral4ptDisc.hpp \
+ Modules/MFermion/GaugeProp.hpp \
Modules/MGauge/Load.hpp \
Modules/MGauge/Random.hpp \
Modules/MGauge/StochEm.hpp \
@@ -35,6 +36,5 @@ modules_hpp =\
Modules/MSource/Point.hpp \
Modules/MSource/SeqGamma.hpp \
Modules/MSource/Wall.hpp \
- Modules/MSource/Z2.hpp \
- Modules/Quark.hpp
+ Modules/MSource/Z2.hpp
diff --git a/lib/Grid.h b/lib/Grid.h
index 543b0330..9dcc207b 100644
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -41,7 +41,9 @@ Author: paboyle
#include
#include
#include
+#include
#include
+#include
#include
#endif
diff --git a/lib/GridStd.h b/lib/GridStd.h
index fb5e5b21..097e62ab 100644
--- a/lib/GridStd.h
+++ b/lib/GridStd.h
@@ -7,6 +7,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -18,6 +19,7 @@
#include
#include
#include
+#include
///////////////////
// Grid config
diff --git a/lib/Makefile.am b/lib/Makefile.am
index fac622ca..6dd7899e 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_base.cc
endif
-if BUILD_COMMS_MPI3L
- extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+ extra_sources+=communicator/Communicator_mpit.cc
extra_sources+=communicator/Communicator_base.cc
endif
diff --git a/lib/algorithms/densematrix/DenseMatrix.h b/lib/algorithms/densematrix/DenseMatrix.h
deleted file mode 100644
index d86add21..00000000
--- a/lib/algorithms/densematrix/DenseMatrix.h
+++ /dev/null
@@ -1,137 +0,0 @@
- /*************************************************************************************
-
- Grid physics library, www.github.com/paboyle/Grid
-
- Source file: ./lib/algorithms/iterative/DenseMatrix.h
-
- Copyright (C) 2015
-
-Author: Peter Boyle
-Author: paboyle
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License along
- with this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
- See the full license in the file "LICENSE" in the top level distribution directory
- *************************************************************************************/
- /* END LEGAL */
-#ifndef GRID_DENSE_MATRIX_H
-#define GRID_DENSE_MATRIX_H
-
-namespace Grid {
- /////////////////////////////////////////////////////////////
- // Matrix untils
- /////////////////////////////////////////////////////////////
-
-template using DenseVector = std::vector;
-template using DenseMatrix = DenseVector >;
-
-template void Size(DenseVector & vec, int &N)
-{
- N= vec.size();
-}
-template void Size(DenseMatrix & mat, int &N,int &M)
-{
- N= mat.size();
- M= mat[0].size();
-}
-
-template void SizeSquare(DenseMatrix & mat, int &N)
-{
- int M; Size(mat,N,M);
- assert(N==M);
-}
-
-template void Resize(DenseVector & mat, int N) {
- mat.resize(N);
-}
-template void Resize(DenseMatrix & mat, int N, int M) {
- mat.resize(N);
- for(int i=0;i void Fill(DenseMatrix & mat, T&val) {
- int N,M;
- Size(mat,N,M);
- for(int i=0;i DenseMatrix Transpose(DenseMatrix & mat){
- int N,M;
- Size(mat,N,M);
- DenseMatrix C; Resize(C,M,N);
- for(int i=0;i void Unity(DenseMatrix &A){
- int N; SizeSquare(A,N);
- for(int i=0;i
-void PlusUnit(DenseMatrix & A,T c){
- int dim; SizeSquare(A,dim);
- for(int i=0;i
-DenseMatrix HermitianConj(DenseMatrix &mat){
-
- int dim; SizeSquare(mat,dim);
-
- DenseMatrix C; Resize(C,dim,dim);
-
- for(int i=0;i
-DenseMatrix GetSubMtx(DenseMatrix