diff --git a/.travis.yml b/.travis.yml
index 64dae823..7d8203ce 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,68 +9,6 @@ matrix:
- os: osx
osx_image: xcode8.3
compiler: clang
- - compiler: gcc
- dist: trusty
- sudo: required
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.9
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: VERSION=-4.9
- - compiler: gcc
- dist: trusty
- sudo: required
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-5
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: VERSION=-5
- - compiler: clang
- dist: trusty
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.8
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- - compiler: clang
- dist: trusty
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.8
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install:
- export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- - echo make clean
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-
diff --git a/README.md b/README.md
index 9432abe1..13dd6996 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,44 @@
-# Grid
-
-
- Last stable release |
-
-
- |
-
-
- Development branch |
-
-
- |
-
-
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
**Data parallel C++ mathematical object library.**
License: GPL v2.
-Last update Nov 2016.
+Last update June 2017.
_Please do not send pull requests to the `master` branch which is reserved for releases._
+
+
+### Description
+This library provides data parallel C++ container classes with internal memory layout
+that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
+are provided, similar to HPF and cmfortran, and user control is given over the mapping of
+array indices to both MPI tasks and SIMD processing elements.
+
+* Identically shaped arrays can then be processed with perfect data parallelisation.
+* Such identically shaped arrays are called conformable arrays.
+
+The transformation is based on the observation that Cartesian array processing involves
+identical processing to be performed on different regions of the Cartesian array.
+
+The library geometrically decomposes arrays both across MPI tasks and across SIMD lanes.
+Local vector loops are parallelised with OpenMP pragmas.
+
+Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
+optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
+for most programmers.
+
+The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
+Presently SSE4, ARM NEON (128 bits), AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
+
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+
+MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
+
+
### Compilers
Intel ICPC v16.0.3 and later
@@ -56,35 +73,25 @@ When you file an issue, please go though the following checklist:
6. Attach the output of `make V=1`.
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
+### Required libraries
+Grid requires:
+[GMP](https://gmplib.org/),
-### Description
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
+[MPFR](http://www.mpfr.org/)
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shaped arrays are called conformable arrays.
+Bootstrapping Grid downloads the Eigen library, which is used internally for dense matrix (non-QCD) operations.
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
+Grid optionally uses:
-The library will both geometrically decompose into MPI tasks and across SIMD lanes.
-Local vector loops are parallelised with OpenMP pragmas.
+[HDF5](https://support.hdfgroup.org/HDF5/)
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
+[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support.
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
+[FFTW](http://www.fftw.org), either the generic version or via the Intel MKL library.
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+LAPACK, either the generic version or the Intel MKL library.
-MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see https://arxiv.org/abs/1512.03487 for more detail.
### Quick start
First, start by cloning the repository:
@@ -155,7 +162,6 @@ The following options can be use with the `--enable-comms=` option to target dif
| `none` | no communications |
| `mpi[-auto]` | MPI communications |
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
-| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
| `shmem ` | Cray SHMEM communications |
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
@@ -173,7 +179,8 @@ The following options can be use with the `--enable-simd=` option to target diff
| `AVXFMA4` | AVX (256 bit) + FMA4 |
| `AVX2` | AVX 2 (256 bit) |
| `AVX512` | AVX 512 bit |
-| `QPX` | QPX (256 bit) |
+| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
+| `QPX` | IBM QPX (256 bit) |
Alternatively, some CPU codenames can be directly used:
@@ -195,21 +202,205 @@ The following configuration is recommended for the Intel Knights Landing platfor
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
- --enable-comms=mpi-auto \
- --with-gmp=<path> \
- --with-mpfr=<path> \
+ --enable-comms=mpi-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
-where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi \
- --with-gmp=<path> \
- --with-mpfr=<path> \
--enable-mkl \
CXX=CC CC=cc
-```
\ No newline at end of file
+```
+
+If GMP and MPFR are NOT in standard places (`/usr/`), these flags may be needed:
+``` bash
+ --with-gmp=<path> \
+ --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Knights Landing nodes with Intel Omni-Path adapters (two adapters per node)
+presently perform better with more than one MPI rank per node, using shared memory
+for intra-node communication. This is the mpi3 communications implementation.
+We recommend four ranks per node for best performance, but the optimum is local-volume dependent.
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=KNL \
+ --enable-comms=mpi3-auto \
+ --enable-mkl \
+ CXX=icpc MPICXX=mpiicpc
+```
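+
+For illustration only (not part of this patch), a four-rank-per-node job on a single KNL node might then be launched along the following lines; the `--mpi` decomposition and thread count are placeholders to be tuned to the local volume:
+``` bash
+# hypothetical launch: four ranks on the node, communicating through mpi3 shared memory
+mpirun -n 4 ./benchmarks/Benchmark_dwf --mpi 1.1.2.2 --threads 16
+```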
+
+### Build setup for Intel Haswell Xeon platform
+
+The following configuration is recommended for the Intel Haswell platform:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX2 \
+ --enable-comms=mpi3-auto \
+ --enable-mkl \
+ CXX=icpc MPICXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
+
+If GMP and MPFR are NOT in standard places (`/usr/`), these flags may be needed:
+``` bash
+ --with-gmp=<path> \
+ --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX2 \
+ --enable-comms=mpi3 \
+ --enable-mkl \
+ CXX=CC CC=cc
+```
+Since dual-socket nodes are commonplace, we recommend MPI-3 as the default, with
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+ export I_MPI_PIN=1
+```
+This is the default.
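+
+For illustration only, a dual socket Haswell node would then be driven as two ranks, one per socket; the thread count below is a placeholder for the number of cores per socket:
+``` bash
+export I_MPI_PIN=1   # pin threads to NUMA domains (the default with Intel MPI)
+mpirun -n 2 ./benchmarks/Benchmark_dwf --mpi 2.1.1.1 --threads 16
+```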
+
+### Build setup for Intel Skylake Xeon platform
+
+The following configuration is recommended for the Intel Skylake platform:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX512 \
+ --enable-comms=mpi3 \
+ --enable-mkl \
+ CXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
+
+If GMP and MPFR are NOT in standard places (`/usr/`), these flags may be needed:
+``` bash
+ --with-gmp=<path> \
+ --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX512 \
+ --enable-comms=mpi3 \
+ --enable-mkl \
+ CXX=CC CC=cc
+```
+Since dual-socket nodes are commonplace, we recommend MPI-3 as the default, with
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+ export I_MPI_PIN=1
+```
+This is the default.
+
+#### Expected Skylake Gold 6148 dual socket (single prec, single node, 20+20 cores) performance using NUMA MPI mapping:
+
+``` bash
+mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
+```
+
+TBA
+
+
+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips, each with 8 cores.
+Even a single-socket node is therefore a quad-chip module; dual-socket nodes with 64 cores in total
+are common. Each chip within the module exposes a separate NUMA domain,
+giving four NUMA domains per socket. We recommend one MPI rank per NUMA domain,
+i.e. MPI-3 with four ranks per socket and 8 threads per rank.
+
+The following configuration is recommended for the AMD EPYC platform.
+
+``` bash
+../configure --enable-precision=double\
+ --enable-simd=AVX2 \
+ --enable-comms=mpi3 \
+ CXX=mpicxx
+```
+
+If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+ --with-gmp= \
+ --with-mpfr= \
+```
+where `` is the UNIX prefix where GMP and MPFR are installed.
+
+Using MPICH and g++ v4.9.2, the best performance can be obtained by setting an explicit GOMP_CPU_AFFINITY for each MPI rank.
+This can be done by launching the ranks through a wrapper script, `omp_bind.sh`, which sets the affinity before invoking the executable.
+
+It is recommended to run 8 MPI ranks on a single dual-socket AMD EPYC node, with 8 threads per rank, using MPI-3 and
+shared memory to communicate within the node:
+
+``` bash
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+```
+
+where `omp_bind.sh` does the following:
+```
+#!/bin/bash
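+# Pin this rank's 8 OpenMP threads to one of 8 NUMA domains (selected by PMI_RANK modulo 8),
+# using every other hardware-thread ID within the domain to avoid SMT siblings (ID layout assumed).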
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
+Performance:
+
+#### Expected AMD EPYC 7601 dual socket (single prec, single node, 32+32 cores) performance using NUMA MPI mapping:
+
+``` bash
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
+```
+
+TBA
+
+### Build setup for BlueGene/Q
+
+To be written...
+
+### Build setup for ARM Neon
+
+To be written...
+
+### Build setup for laptops, other compilers, non-cluster builds
+
+Many versions of g++ and clang++ work with Grid; simply substitute the appropriate compiler for CXX (and MPICXX)
+and omit the `--enable-mkl` flag.
+
+Single node builds are enabled with
+```
+ --enable-comms=none
+```
+
+FFTW support that is not in the default search path may then be enabled with
+```
+ --with-fftw=<path>
+```
+
+BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
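+
+As a concrete sketch (not prescriptive), a single node build with g++ on an AVX2-capable laptop could be configured as:
+``` bash
+../configure --enable-precision=double \
+             --enable-simd=AVX2 \
+             --enable-comms=none \
+             CXX=g++
+```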
+
diff --git a/TODO b/TODO
index a778f476..83bfda5e 100644
--- a/TODO
+++ b/TODO
@@ -9,10 +9,8 @@ Large item work list:
3a)- RNG I/O in ILDG/SciDAC (minor)
3b)- Precision conversion and sort out localConvert <-- partial/easy
3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
---
4)- Physical propagator interface
5)- Conserved currents
---
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
7)- HDCR resume
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
new file mode 100644
index 00000000..666e4830
--- /dev/null
+++ b/benchmarks/Benchmark_ITT.cc
@@ -0,0 +1,800 @@
+ /*************************************************************************************
+
+ Grid physics library, www.github.com/paboyle/Grid
+
+ Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
+
+ Copyright (C) 2015
+
+Author: Peter Boyle
+Author: paboyle
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+#include
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+typedef WilsonFermion5D WilsonFermion5DR;
+typedef WilsonFermion5D WilsonFermion5DF;
+typedef WilsonFermion5D WilsonFermion5DD;
+
+
+std::vector L_list;
+std::vector Ls_list;
+std::vector mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
+
+struct time_statistics{
+ double mean;
+ double err;
+ double min;
+ double max;
+
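+  // compute mean, standard error of the mean, and min/max of the timing samples (microseconds)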
+ void statistics(std::vector<double> v){
+ double sum = std::accumulate(v.begin(), v.end(), 0.0);
+ mean = sum / v.size();
+
+ std::vector diff(v.size());
+ std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+ double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+ err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
+
+ auto result = std::minmax_element(v.begin(), v.end());
+ min = *result.first;
+ max = *result.second;
+}
+};
+
+void comms_header(){
+ std::cout < simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+ std::vector mpi_layout = GridDefaultMpi();
+
+ for(int mu=0;mu1) nmu++;
+
+ std::vector t_time(Nloop);
+ time_statistics timestat;
+
+ std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
+
+ std::vector xbuf(8);
+ std::vector rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
+
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ int ncomm;
+ double dbytes;
+ std::vector times(Nloop);
+ for(int i=0;i1 ) {
+
+ int xmit_to_rank;
+ int recv_from_rank;
+ if ( dir == mu ) {
+ int comm_proc=1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ } else {
+ int comm_proc = mpi_layout[mu]-1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ }
+ tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+ (void *)&rbuf[dir][0], recv_from_rank,
+ bytes,dir);
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
+ ncomm++;
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
+ dbytes+=tbytes;
+ }
+ }
+ Grid.Barrier();
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+ }
+
+ timestat.statistics(t_time);
+ // for(int i=0;i > LatticeVec;
+ typedef iVector Vec;
+
+ std::vector simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
+ std::vector mpi_layout = GridDefaultMpi();
+
+ std::cout<({45,12,81,9}));
+ for(int lat=8;lat<=lmax;lat+=4){
+
+ std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+ NP= Grid.RankCount();
+ NN =Grid.NodeCount();
+
+ Vec rn ; random(sRNG,rn);
+
+ LatticeVec z(&Grid); z=rn;
+ LatticeVec x(&Grid); x=rn;
+ LatticeVec y(&Grid); y=rn;
+ double a=2.0;
+
+ uint64_t Nloop=NLOOP;
+
+ double start=usecond();
+ for(int i=0;i mflops_all;
+
+ ///////////////////////////////////////////////////////
+ // Set/Get the layout & grid size
+ ///////////////////////////////////////////////////////
+ int threads = GridThread::GetThreads();
+ std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4);
+ std::vector local({L,L,L,L});
+
+ GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}),
+ GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+ uint64_t NP = TmpGrid->RankCount();
+ uint64_t NN = TmpGrid->NodeCount();
+ NN_global=NN;
+ uint64_t SHM=NP/NN;
+
+ std::vector internal;
+ if ( SHM == 1 ) internal = std::vector({1,1,1,1});
+ else if ( SHM == 2 ) internal = std::vector({2,1,1,1});
+ else if ( SHM == 4 ) internal = std::vector({2,2,1,1});
+ else if ( SHM == 8 ) internal = std::vector({2,2,2,1});
+ else assert(0);
+
+ std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+ std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+ ///////// Welcome message ////////////
+ std::cout< seeds4({1,2,3,4});
+ std::vector seeds5({5,6,7,8});
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ ///////// Source preparation ////////////
+ LatticeFermion src (sFGrid); random(RNG5,src);
+ LatticeFermion tmp (sFGrid);
+
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+
+ WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+ LatticeFermion src_e (sFrbGrid);
+ LatticeFermion src_o (sFrbGrid);
+ LatticeFermion r_e (sFrbGrid);
+ LatticeFermion r_o (sFrbGrid);
+ LatticeFermion r_eo (sFGrid);
+ LatticeFermion err (sFGrid);
+ {
+
+ pickCheckerboard(Even,src_e,src);
+ pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512)
+ const int num_cases = 6;
+ std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
+#else
+ const int num_cases = 4;
+ std::string fmt("U/S ; U/O ; G/S ; G/O ");
+#endif
+ controls Cases [] = {
+#ifdef AVX512
+ { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+#endif
+ { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
+ };
+
+ for(int c=0;cBarrier();
+ for(int i=0;iBarrier();
+ double t1=usecond();
+
+ sDw.ZeroCounters();
+ time_statistics timestat;
+ std::vector t_time(ncall);
+ for(uint64_t i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops;
+ if ( mflops mflops_all;
+
+ ///////////////////////////////////////////////////////
+ // Set/Get the layout & grid size
+ ///////////////////////////////////////////////////////
+ int threads = GridThread::GetThreads();
+ std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4);
+ std::vector local({L,L,L,L});
+
+ GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}),
+ GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+ uint64_t NP = TmpGrid->RankCount();
+ uint64_t NN = TmpGrid->NodeCount();
+ NN_global=NN;
+ uint64_t SHM=NP/NN;
+
+ std::vector internal;
+ if ( SHM == 1 ) internal = std::vector({1,1,1,1});
+ else if ( SHM == 2 ) internal = std::vector({2,1,1,1});
+ else if ( SHM == 4 ) internal = std::vector({2,2,1,1});
+ else if ( SHM == 8 ) internal = std::vector({2,2,2,1});
+ else assert(0);
+
+ std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+ std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+ ///////// Welcome message ////////////
+ std::cout< seeds4({1,2,3,4});
+ std::vector seeds5({5,6,7,8});
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ ///////// Source preparation ////////////
+ LatticeFermion src (FGrid); random(RNG5,src);
+ LatticeFermion ref (FGrid);
+ LatticeFermion tmp (FGrid);
+
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+
+ DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+ ////////////////////////////////////
+ // Naive wilson implementation
+ ////////////////////////////////////
+ {
+ LatticeGaugeField Umu5d(FGrid);
+ std::vector U(4,FGrid);
+ for(int ss=0;ssoSites();ss++){
+ for(int s=0;s(Umu5d,mu);
+ }
+ for(int mu=0;muBarrier();
+ for(int i=0;iBarrier();
+ double t1=usecond();
+ // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+ // if (ncall < 500) ncall = 500;
+ uint64_t ncall = 1000;
+
+ FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+ // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall);
+ for(uint64_t i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops;
+ if ( mflops({8,2,2,2});
+#else
+ LebesgueOrder::Block = std::vector({2,2,2,2});
+#endif
+ Benchmark::Decomposition();
+
+ int do_memory=1;
+ int do_comms =1;
+ int do_su3 =0;
+ int do_wilson=1;
+ int do_dwf =1;
+
+ if ( do_su3 ) {
+ // empty for now
+ }
+#if 1
+ int sel=2;
+ std::vector L_list({8,12,16,24});
+#else
+ int sel=1;
+ std::vector L_list({8,12});
+#endif
+ int selm1=sel-1;
+ std::vector robust_list;
+
+ std::vector wilson;
+ std::vector dwf4;
+ std::vector dwf5;
+
+ if ( do_wilson ) {
+ int Ls=1;
+ std::cout<1) ) {
+ std::cout<1) nmu++;
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
std::cout< latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
- std::vector > xbuf(8,std::vector(lat*lat*lat*Ls));
- std::vector > rbuf(8,std::vector(lat*lat*lat*Ls));
+ std::vector > xbuf(8);
+ std::vector > rbuf(8);
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ for(int mu=0;mu<8;mu++){
+ xbuf[mu].resize(lat*lat*lat*Ls);
+ rbuf[mu].resize(lat*lat*lat*Ls);
+ // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat,lat,lat,lat});
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
- std::vector > xbuf(8,std::vector(lat*lat*lat*Ls));
- std::vector > rbuf(8,std::vector(lat*lat*lat*Ls));
+ std::vector > xbuf(8);
+ std::vector > rbuf(8);
+ for(int mu=0;mu<8;mu++){
+ xbuf[mu].resize(lat*lat*lat*Ls);
+ rbuf[mu].resize(lat*lat*lat*Ls);
+ // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
- bytes);
+ bytes,mu);
comm_proc = mpi_layout[mu]-1;
@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
- bytes);
+ bytes,mu+4);
}
}
- Grid.StencilSendToRecvFromComplete(requests);
+ Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
@@ -346,7 +355,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
- for(int Ls=8;Ls<=32;Ls*=2){
+ for(int Ls=8;Ls<=8;Ls*=2){
std::vector latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -393,8 +402,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
- bytes);
- Grid.StencilSendToRecvFromComplete(requests);
+ bytes,mu);
+ Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
comm_proc = mpi_layout[mu]-1;
@@ -406,8 +415,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
- bytes);
- Grid.StencilSendToRecvFromComplete(requests);
+ bytes,mu+4);
+ Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
}
@@ -436,5 +445,97 @@ int main (int argc, char ** argv)
}
}
+
+
+ std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
+
+ std::vector xbuf(8);
+ std::vector rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
+
+ int ncomm;
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ double dbytes;
+ for(int i=0;i requests;
+ dbytes=0;
+ ncomm=0;
+
+ parallel_for(int dir=0;dir<8;dir++){
+
+ double tbytes;
+ int mu =dir % 4;
+
+ if (mpi_layout[mu]>1 ) {
+
+ ncomm++;
+ int xmit_to_rank;
+ int recv_from_rank;
+ if ( dir == mu ) {
+ int comm_proc=1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ } else {
+ int comm_proc = mpi_layout[mu]-1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ }
+
+ tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+ (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+
+#pragma omp atomic
+ dbytes+=tbytes;
+ }
+ }
+ Grid.Barrier();
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+ }
+
+ timestat.statistics(t_time);
+
+ dbytes=dbytes*ppn;
+ double xbytes = dbytes*0.5;
+ double rbytes = dbytes*0.5;
+ double bidibytes = dbytes;
+
+
+ std::cout< latt4 = GridDefaultLatt();
- const int Ls=16;
+ int Ls=16;
+ for(int i=0;i> Ls;
+ }
+
+
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -165,7 +171,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*****************************************************************" <Barrier();
Dw.ZeroCounters();
@@ -302,6 +308,7 @@ int main (int argc, char ** argv)
std::cout<< "sD ERR \n " << err <
+#include
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template
+struct scal {
+ d internal;
+};
+
+ Gamma::Algebra Gmu [] = {
+ Gamma::Algebra::GammaX,
+ Gamma::Algebra::GammaY,
+ Gamma::Algebra::GammaZ,
+ Gamma::Algebra::GammaT
+ };
+
+typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
+typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
+
+
+
+int main (int argc, char ** argv)
+{
+ Grid_init(&argc,&argv);
+
+ int Ls=16;
+ for(int i=0;i> Ls;
+ }
+
+
+ int threads = GridThread::GetThreads();
+ std::cout< latt4 = GridDefaultLatt();
+
+ GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+ GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+ GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+ GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+ std::vector seeds4({1,2,3,4});
+ std::vector seeds5({5,6,7,8});
+
+ std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+ GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ GparityLatticeFermionF src (FGrid); random(RNG5,src);
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ GparityLatticeFermionF result(FGrid); result=zero;
+ GparityLatticeFermionF ref(FGrid); ref=zero;
+ GparityLatticeFermionF tmp(FGrid);
+ GparityLatticeFermionF err(FGrid);
+
+ std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+ LatticeGaugeFieldF Umu(UGrid);
+ SU3::HotConfiguration(RNG4,Umu);
+ std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+
+ RealD mass=0.1;
+ RealD M5 =1.8;
+
+ RealD NP = UGrid->_Nprocessors;
+ RealD NN = UGrid->NodeCount();
+
+ std::cout << GridLogMessage<< "*****************************************************************" <Barrier();
+ Dw.ZeroCounters();
+ Dw.Dhop(src,result,0);
+ std::cout<Barrier();
+
+ double volume=Ls; for(int mu=0;muBarrier();
+ DwH.ZeroCounters();
+ DwH.Dhop(src,result,0);
+ double t0=usecond();
+ for(int i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;muBarrier();
+ DwD.ZeroCounters();
+ DwD.Dhop(src_d,result_d,0);
+ std::cout<Barrier();
+
+ double volume=Ls; for(int mu=0;mu latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
uint64_t Nloop=NLOOP;
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
double a=2.0;
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
- double bytes=3*vol*Nvec*sizeof(Real);
+ double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
double a=2.0;
uint64_t Nloop=NLOOP;
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
- double bytes=3*vol*Nvec*sizeof(Real);
+ double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
RealD a=2.0;
@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;
- double bytes=2*vol*Nvec*sizeof(Real);
+ double bytes=2.0*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*1;// mul
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
- LatticeVec z(&Grid); //random(pRNG,z);
- LatticeVec x(&Grid); //random(pRNG,x);
- LatticeVec y(&Grid); //random(pRNG,y);
+ // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
+ LatticeVec z(&Grid);// random(pRNG,z);
+ LatticeVec x(&Grid);// random(pRNG,x);
+ LatticeVec y(&Grid);// random(pRNG,y);
RealD a=2.0;
Real nn;
double start=usecond();
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;
- double bytes=vol*Nvec*sizeof(Real);
+ double bytes=1.0*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*2;// mul,add
std::cout< simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector mpi_layout = GridDefaultMpi();
- int threads = GridThread::GetThreads();
+ int64_t threads = GridThread::GetThreads();
std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid);// random(pRNG,z);
- LatticeColourMatrix x(&Grid);// random(pRNG,x);
- LatticeColourMatrix y(&Grid);// random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid); //random(pRNG,z);
- LatticeColourMatrix x(&Grid); //random(pRNG,x);
- LatticeColourMatrix y(&Grid); //random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid); //random(pRNG,z);
- LatticeColourMatrix x(&Grid); //random(pRNG,x);
- LatticeColourMatrix y(&Grid); //random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
- int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9});
+ GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9}));
- LatticeColourMatrix z(&Grid); //random(pRNG,z);
- LatticeColourMatrix x(&Grid); //random(pRNG,x);
- LatticeColourMatrix y(&Grid); //random(pRNG,y);
+ LatticeColourMatrix z(&Grid); random(pRNG,z);
+ LatticeColourMatrix x(&Grid); random(pRNG,x);
+ LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
- for(int i=0;i]])
AC_CHECK_DECLS([be64toh],[], [], [[#include ]])
+############## Standard libraries
+AC_CHECK_LIB([m],[cos])
+AC_CHECK_LIB([stdc++],[abort])
+
############### GMP and MPFR
AC_ARG_WITH([gmp],
[AS_HELP_STRING([--with-gmp=prefix],
@@ -186,9 +194,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
AC_SEARCH_LIBS([crc32], [z],
[AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
- [have_zlib=true],
+ [have_zlib=true] [LIBS="${LIBS} -lz"],
[AC_MSG_ERROR(zlib library was not found in your system.)])
+AC_SEARCH_LIBS([move_pages], [numa],
+ [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+ [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+ [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
+
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
[have_hdf5=true]
@@ -241,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
SIMD_FLAGS='';;
KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+ AC_DEFINE([KNL],[1],[Knights landing processor])
SIMD_FLAGS='-march=knl';;
GEN)
AC_DEFINE([GEN],[1],[generic vector code])
@@ -248,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
[generic SIMD vector width (in bytes)])
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
SIMD_FLAGS='';;
+ NEONv8)
+ AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
+ SIMD_FLAGS='-march=armv8-a';;
QPX|BGQ)
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
SIMD_FLAGS='';;
@@ -276,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
SIMD_FLAGS='';;
KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
+ AC_DEFINE([KNL],[1],[Knights landing processor])
SIMD_FLAGS='-xmic-avx512';;
GEN)
AC_DEFINE([GEN],[1],[generic vector code])
@@ -313,8 +331,41 @@ case ${ac_PRECISION} in
double)
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
;;
+ *)
+ AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
+ ;;
esac
+###################### Shared memory allocation technique under MPI3
+AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
+ [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
+
+case ${ac_SHM} in
+
+ shmget)
+ AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
+ ;;
+
+ shmopen)
+ AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
+ ;;
+
+ hugetlbfs)
+ AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
+ ;;
+
+ *)
+ AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
+ ;;
+esac
+
+###################### Shared base path for SHMMMAP
+AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
+ [Select SHM mmap base path for hugetlbfs])],
+ [ac_SHMPATH=${enable_shmpath}],
+ [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
+AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
+
############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
@@ -324,14 +375,14 @@ case ${ac_COMMS} in
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
comms_type='none'
;;
- mpi3l*)
- AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
- comms_type='mpi3l'
- ;;
mpi3*)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
comms_type='mpi3'
;;
+ mpit)
+ AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
+ comms_type='mpit'
+ ;;
mpi*)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
comms_type='mpi'
@@ -359,7 +410,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT, [ test "${comms_type}X" == "mpitX" ] )
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
############### RNG selection
@@ -464,6 +515,8 @@ compiler version : ${ax_cv_gxx_version}
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
Communications type : ${comms_type}
+Shared memory allocator : ${ac_SHM}
+Shared memory mmap path : ${ac_SHMPATH}
Default precision : ${ac_PRECISION}
Software FP16 conversion : ${ac_SFW_FP16}
RNG choice : ${ac_RNG}
diff --git a/extras/Hadrons/Environment.cc b/extras/Hadrons/Environment.cc
index 37f2a3d7..0e7a4326 100644
--- a/extras/Hadrons/Environment.cc
+++ b/extras/Hadrons/Environment.cc
@@ -41,9 +41,10 @@ using namespace Hadrons;
// constructor /////////////////////////////////////////////////////////////////
Environment::Environment(void)
{
- nd_ = GridDefaultLatt().size();
+ dim_ = GridDefaultLatt();
+ nd_ = dim_.size();
grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
- GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
+ dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
GridDefaultMpi()));
gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
auto loc = getGrid()->LocalDimensions();
@@ -132,6 +133,16 @@ unsigned int Environment::getNd(void) const
return nd_;
}
+std::vector Environment::getDim(void) const
+{
+ return dim_;
+}
+
+int Environment::getDim(const unsigned int mu) const
+{
+ return dim_[mu];
+}
+
// random number generator /////////////////////////////////////////////////////
void Environment::setSeed(const std::vector &seed)
{
@@ -271,6 +282,21 @@ std::string Environment::getModuleType(const std::string name) const
return getModuleType(getModuleAddress(name));
}
+std::string Environment::getModuleNamespace(const unsigned int address) const
+{
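+    // module type names are fully qualified (e.g. "...::MContraction::TMeson<...>");
+    // return the component just before the class name (e.g. "MSource" or "MSink")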
+ std::string type = getModuleType(address), ns;
+
+ auto pos2 = type.rfind("::");
+ auto pos1 = type.rfind("::", pos2 - 2);
+
+ return type.substr(pos1 + 2, pos2 - pos1 - 2);
+}
+
+std::string Environment::getModuleNamespace(const std::string name) const
+{
+ return getModuleNamespace(getModuleAddress(name));
+}
+
bool Environment::hasModule(const unsigned int address) const
{
return (address < module_.size());
@@ -492,7 +518,14 @@ std::string Environment::getObjectType(const unsigned int address) const
{
if (hasRegisteredObject(address))
{
- return typeName(object_[address].type);
+ if (object_[address].type)
+ {
+ return typeName(object_[address].type);
+ }
+ else
+ {
+ return "";
+ }
}
else if (hasObject(address))
{
@@ -532,6 +565,23 @@ Environment::Size Environment::getObjectSize(const std::string name) const
return getObjectSize(getObjectAddress(name));
}
+unsigned int Environment::getObjectModule(const unsigned int address) const
+{
+ if (hasObject(address))
+ {
+ return object_[address].module;
+ }
+ else
+ {
+ HADRON_ERROR("no object with address " + std::to_string(address));
+ }
+}
+
+unsigned int Environment::getObjectModule(const std::string name) const
+{
+ return getObjectModule(getObjectAddress(name));
+}
+
unsigned int Environment::getObjectLs(const unsigned int address) const
{
if (hasRegisteredObject(address))
diff --git a/extras/Hadrons/Environment.hpp b/extras/Hadrons/Environment.hpp
index 2628e5a0..13264bd5 100644
--- a/extras/Hadrons/Environment.hpp
+++ b/extras/Hadrons/Environment.hpp
@@ -106,6 +106,8 @@ public:
void createGrid(const unsigned int Ls);
GridCartesian * getGrid(const unsigned int Ls = 1) const;
GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
+ std::vector getDim(void) const;
+ int getDim(const unsigned int mu) const;
unsigned int getNd(void) const;
// random number generator
void setSeed(const std::vector &seed);
@@ -131,6 +133,8 @@ public:
std::string getModuleName(const unsigned int address) const;
std::string getModuleType(const unsigned int address) const;
std::string getModuleType(const std::string name) const;
+ std::string getModuleNamespace(const unsigned int address) const;
+ std::string getModuleNamespace(const std::string name) const;
bool hasModule(const unsigned int address) const;
bool hasModule(const std::string name) const;
Graph makeModuleGraph(void) const;
@@ -171,6 +175,8 @@ public:
std::string getObjectType(const std::string name) const;
Size getObjectSize(const unsigned int address) const;
Size getObjectSize(const std::string name) const;
+ unsigned int getObjectModule(const unsigned int address) const;
+ unsigned int getObjectModule(const std::string name) const;
unsigned int getObjectLs(const unsigned int address) const;
unsigned int getObjectLs(const std::string name) const;
bool hasObject(const unsigned int address) const;
@@ -181,6 +187,10 @@ public:
bool hasCreatedObject(const std::string name) const;
bool isObject5d(const unsigned int address) const;
bool isObject5d(const std::string name) const;
+ template
+ bool isObjectOfType(const unsigned int address) const;
+ template
+ bool isObjectOfType(const std::string name) const;
Environment::Size getTotalSize(void) const;
void addOwnership(const unsigned int owner,
const unsigned int property);
@@ -197,6 +207,7 @@ private:
bool dryRun_{false};
unsigned int traj_, locVol_;
// grids
+ std::vector dim_;
GridPt grid4d_;
std::map grid5d_;
GridRbPt gridRb4d_;
@@ -343,7 +354,7 @@ T * Environment::getObject(const unsigned int address) const
else
{
HADRON_ERROR("object with address " + std::to_string(address) +
- " does not have type '" + typeid(T).name() +
+ " does not have type '" + typeName(&typeid(T)) +
"' (has type '" + getObjectType(address) + "')");
}
}
@@ -380,6 +391,37 @@ T * Environment::createLattice(const std::string name)
return createLattice(getObjectAddress(name));
}
+template
+bool Environment::isObjectOfType(const unsigned int address) const
+{
+ if (hasRegisteredObject(address))
+ {
+ if (auto h = dynamic_cast *>(object_[address].data.get()))
+ {
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ else if (hasObject(address))
+ {
+ HADRON_ERROR("object with address " + std::to_string(address) +
+ " exists but is not registered");
+ }
+ else
+ {
+ HADRON_ERROR("no object with address " + std::to_string(address));
+ }
+}
+
+template
+bool Environment::isObjectOfType(const std::string name) const
+{
+ return isObjectOfType(getObjectAddress(name));
+}
+
END_HADRONS_NAMESPACE
#endif // Hadrons_Environment_hpp_
diff --git a/extras/Hadrons/Global.hpp b/extras/Hadrons/Global.hpp
index 3e11ddf8..9de01623 100644
--- a/extras/Hadrons/Global.hpp
+++ b/extras/Hadrons/Global.hpp
@@ -51,23 +51,43 @@ using Grid::operator<<;
* error with GCC 5 (clang & GCC 6 compile fine without it).
*/
-// FIXME: find a way to do that in a more general fashion
#ifndef FIMPL
#define FIMPL WilsonImplR
#endif
+#ifndef SIMPL
+#define SIMPL ScalarImplCR
+#endif
BEGIN_HADRONS_NAMESPACE
// type aliases
-#define TYPE_ALIASES(FImpl, suffix)\
+#define FERM_TYPE_ALIASES(FImpl, suffix)\
typedef FermionOperator FMat##suffix; \
typedef typename FImpl::FermionField FermionField##suffix; \
typedef typename FImpl::PropagatorField PropagatorField##suffix; \
typedef typename FImpl::SitePropagator SitePropagator##suffix; \
-typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
-typedef std::function \
+ SlicedPropagator##suffix;
+
+#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
+typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
+
+#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
+typedef typename SImpl::Field ScalarField##suffix;\
+typedef typename SImpl::Field PropagatorField##suffix;
+
+#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
+typedef std::function SolverFn##suffix;
+#define SINK_TYPE_ALIASES(suffix)\
+typedef std::function SinkFn##suffix;
+
+#define FGS_TYPE_ALIASES(FImpl, suffix)\
+FERM_TYPE_ALIASES(FImpl, suffix)\
+GAUGE_TYPE_ALIASES(FImpl, suffix)\
+SOLVER_TYPE_ALIASES(FImpl, suffix)
+
// logger
class HadronsLogger: public Logger
{
diff --git a/extras/Hadrons/Modules.hpp b/extras/Hadrons/Modules.hpp
index 05ad1697..c27254aa 100644
--- a/extras/Hadrons/Modules.hpp
+++ b/extras/Hadrons/Modules.hpp
@@ -1,31 +1,3 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: extras/Hadrons/Modules.hpp
-
-Copyright (C) 2015
-Copyright (C) 2016
-
-Author: Antonin Portelli
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
#include
#include
#include
@@ -36,13 +8,18 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include
#include
#include
+#include
#include
#include
+#include
#include
#include
+#include
+#include
+#include
+#include
#include
#include
#include
#include
#include
-#include
diff --git a/extras/Hadrons/Modules/MAction/DWF.hpp b/extras/Hadrons/Modules/MAction/DWF.hpp
index 880fe7b9..78e0916c 100644
--- a/extras/Hadrons/Modules/MAction/DWF.hpp
+++ b/extras/Hadrons/Modules/MAction/DWF.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
-#ifndef Hadrons_DWF_hpp_
-#define Hadrons_DWF_hpp_
+#ifndef Hadrons_MAction_DWF_hpp_
+#define Hadrons_MAction_DWF_hpp_
#include
#include
@@ -56,7 +56,7 @@ template
class TDWF: public Module
{
public:
- TYPE_ALIASES(FImpl,);
+ FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TDWF(const std::string name);
@@ -137,4 +137,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
-#endif // Hadrons_DWF_hpp_
+#endif // Hadrons_MAction_DWF_hpp_
diff --git a/extras/Hadrons/Modules/MAction/Wilson.hpp b/extras/Hadrons/Modules/MAction/Wilson.hpp
index 4b84bda5..aab54245 100644
--- a/extras/Hadrons/Modules/MAction/Wilson.hpp
+++ b/extras/Hadrons/Modules/MAction/Wilson.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
-#ifndef Hadrons_Wilson_hpp_
-#define Hadrons_Wilson_hpp_
+#ifndef Hadrons_MAction_Wilson_hpp_
+#define Hadrons_MAction_Wilson_hpp_
#include
#include
@@ -54,7 +54,7 @@ template
class TWilson: public Module
{
public:
- TYPE_ALIASES(FImpl,);
+ FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TWilson(const std::string name);
diff --git a/extras/Hadrons/Modules/MContraction/Baryon.hpp b/extras/Hadrons/Modules/MContraction/Baryon.hpp
index be7d919c..78bde5a2 100644
--- a/extras/Hadrons/Modules/MContraction/Baryon.hpp
+++ b/extras/Hadrons/Modules/MContraction/Baryon.hpp
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
-#ifndef Hadrons_Baryon_hpp_
-#define Hadrons_Baryon_hpp_
+#ifndef Hadrons_MContraction_Baryon_hpp_
+#define Hadrons_MContraction_Baryon_hpp_
#include
#include
@@ -55,9 +55,9 @@ template
class TBaryon: public Module
{
public:
- TYPE_ALIASES(FImpl1, 1);
- TYPE_ALIASES(FImpl2, 2);
- TYPE_ALIASES(FImpl3, 3);
+ FERM_TYPE_ALIASES(FImpl1, 1);
+ FERM_TYPE_ALIASES(FImpl2, 2);
+ FERM_TYPE_ALIASES(FImpl3, 3);
class Result: Serializable
{
public:
@@ -121,11 +121,11 @@ void TBaryon::execute(void)
// FIXME: do contractions
- write(writer, "meson", result);
+ // write(writer, "meson", result);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
-#endif // Hadrons_Baryon_hpp_
+#endif // Hadrons_MContraction_Baryon_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/DiscLoop.hpp b/extras/Hadrons/Modules/MContraction/DiscLoop.hpp
index 4ad12e90..4f782cd3 100644
--- a/extras/Hadrons/Modules/MContraction/DiscLoop.hpp
+++ b/extras/Hadrons/Modules/MContraction/DiscLoop.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
-#ifndef Hadrons_DiscLoop_hpp_
-#define Hadrons_DiscLoop_hpp_
+#ifndef Hadrons_MContraction_DiscLoop_hpp_
+#define Hadrons_MContraction_DiscLoop_hpp_
#include
#include
@@ -52,7 +52,7 @@ public:
template
class TDiscLoop: public Module
{
- TYPE_ALIASES(FImpl,);
+ FERM_TYPE_ALIASES(FImpl,);
class Result: Serializable
{
public:
@@ -141,4 +141,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
-#endif // Hadrons_DiscLoop_hpp_
+#endif // Hadrons_MContraction_DiscLoop_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp b/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
index e5e73fa6..7f643d49 100644
--- a/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
+++ b/extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
-#ifndef Hadrons_Gamma3pt_hpp_
-#define Hadrons_Gamma3pt_hpp_
+#ifndef Hadrons_MContraction_Gamma3pt_hpp_
+#define Hadrons_MContraction_Gamma3pt_hpp_
#include
#include
@@ -72,9 +72,9 @@ public:
template
class TGamma3pt: public Module
{
- TYPE_ALIASES(FImpl1, 1);
- TYPE_ALIASES(FImpl2, 2);
- TYPE_ALIASES(FImpl3, 3);
+ FERM_TYPE_ALIASES(FImpl1, 1);
+ FERM_TYPE_ALIASES(FImpl2, 2);
+ FERM_TYPE_ALIASES(FImpl3, 3);
class Result: Serializable
{
public:
@@ -167,4 +167,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
-#endif // Hadrons_Gamma3pt_hpp_
+#endif // Hadrons_MContraction_Gamma3pt_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/Meson.hpp b/extras/Hadrons/Modules/MContraction/Meson.hpp
index 09c2a6e1..7810326a 100644
--- a/extras/Hadrons/Modules/MContraction/Meson.hpp
+++ b/extras/Hadrons/Modules/MContraction/Meson.hpp
@@ -29,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
-#ifndef Hadrons_Meson_hpp_
-#define Hadrons_Meson_hpp_
+#ifndef Hadrons_MContraction_Meson_hpp_
+#define Hadrons_MContraction_Meson_hpp_
#include
#include
@@ -69,7 +69,7 @@ public:
std::string, q1,
std::string, q2,
std::string, gammas,
- std::string, mom,
+ std::string, sink,
std::string, output);
};
@@ -77,8 +77,10 @@ template
class TMeson: public Module
{
public:
- TYPE_ALIASES(FImpl1, 1);
- TYPE_ALIASES(FImpl2, 2);
+ FERM_TYPE_ALIASES(FImpl1, 1);
+ FERM_TYPE_ALIASES(FImpl2, 2);
+ FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
+ SINK_TYPE_ALIASES(Scalar);
class Result: Serializable
{
public:
@@ -115,7 +117,7 @@ TMeson::TMeson(const std::string name)
template
std::vector TMeson::getInput(void)
{
- std::vector input = {par().q1, par().q2};
+ std::vector input = {par().q1, par().q2, par().sink};
return input;
}
@@ -154,6 +156,9 @@ void TMeson::parseGammaString(std::vector &gammaList)
// execution ///////////////////////////////////////////////////////////////////
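+// connected meson contraction kernel: (g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2);
+// the trace over spin and colour is taken where the macro is used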
+#define mesonConnected(q1, q2, gSnk, gSrc) \
+(g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2)
+
template
void TMeson::execute(void)
{
@@ -161,43 +166,72 @@ void TMeson::execute(void)
<< " quarks '" << par().q1 << "' and '" << par().q2 << "'"
<< std::endl;
- CorrWriter writer(par().output);
- PropagatorField1 &q1 = *env().template getObject(par().q1);
- PropagatorField2 &q2 = *env().template getObject(par().q2);
- LatticeComplex c(env().getGrid());
- Gamma g5(Gamma::Algebra::Gamma5);
- std::vector gammaList;
+ CorrWriter writer(par().output);
std::vector buf;
std::vector result;
- std::vector p;
-
- p = strToVec(par().mom);
- LatticeComplex ph(env().getGrid()), coor(env().getGrid());
- Complex i(0.0,1.0);
- ph = zero;
- for(unsigned int mu = 0; mu < env().getNd(); mu++)
- {
- LatticeCoordinate(coor, mu);
- ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
- }
- ph = exp((Real)(2*M_PI)*i*ph);
+ Gamma g5(Gamma::Algebra::Gamma5);
+ std::vector gammaList;
+ int nt = env().getDim(Tp);
parseGammaString(gammaList);
-
result.resize(gammaList.size());
for (unsigned int i = 0; i < result.size(); ++i)
{
- Gamma gSnk(gammaList[i].first);
- Gamma gSrc(gammaList[i].second);
- c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
- sliceSum(c, buf, Tp);
-
result[i].gamma_snk = gammaList[i].first;
result[i].gamma_src = gammaList[i].second;
- result[i].corr.resize(buf.size());
- for (unsigned int t = 0; t < buf.size(); ++t)
+ result[i].corr.resize(nt);
+ }
+ if (env().template isObjectOfType(par().q1) and
+ env().template isObjectOfType(par().q2))
+ {
+ SlicedPropagator1 &q1 = *env().template getObject(par().q1);
+ SlicedPropagator2 &q2 = *env().template getObject(par().q2);
+
+ LOG(Message) << "(propagator already sinked)" << std::endl;
+ for (unsigned int i = 0; i < result.size(); ++i)
{
- result[i].corr[t] = TensorRemove(buf[t]);
+ Gamma gSnk(gammaList[i].first);
+ Gamma gSrc(gammaList[i].second);
+
+            for (unsigned int t = 0; t < nt; ++t)
+ {
+ result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
+ }
+ }
+ }
+ else
+ {
+ PropagatorField1 &q1 = *env().template getObject(par().q1);
+ PropagatorField2 &q2 = *env().template getObject(par().q2);
+ LatticeComplex c(env().getGrid());
+
+ LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
+ for (unsigned int i = 0; i < result.size(); ++i)
+ {
+ Gamma gSnk(gammaList[i].first);
+ Gamma gSrc(gammaList[i].second);
+ std::string ns;
+
+ ns = env().getModuleNamespace(env().getObjectModule(par().sink));
+ if (ns == "MSource")
+ {
+ PropagatorField1 &sink =
+ *env().template getObject(par().sink);
+
+ c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
+ sliceSum(c, buf, Tp);
+ }
+ else if (ns == "MSink")
+ {
+ SinkFnScalar &sink = *env().template getObject(par().sink);
+
+ c = trace(mesonConnected(q1, q2, gSnk, gSrc));
+ buf = sink(c);
+ }
+ for (unsigned int t = 0; t < buf.size(); ++t)
+ {
+ result[i].corr[t] = TensorRemove(buf[t]);
+ }
}
}
write(writer, "meson", result);
@@ -207,4 +241,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
-#endif // Hadrons_Meson_hpp_
+#endif // Hadrons_MContraction_Meson_hpp_
diff --git a/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp b/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
index 23482feb..0a3c2e31 100644
--- a/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
+++ b/extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
-#ifndef Hadrons_WeakHamiltonian_hpp_
-#define Hadrons_WeakHamiltonian_hpp_
+#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
+#define Hadrons_MContraction_WeakHamiltonian_hpp_
#include
#include
@@ -83,7 +83,7 @@ public:
class T##modname: public Module