mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Merge branch 'develop' into feature/multi-communicator
This commit is contained in:
		
							
								
								
									
										261
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										261
									
								
								README.md
									
									
									
									
									
								
							@@ -18,10 +18,41 @@
 | 
			
		||||
 | 
			
		||||
License: GPL v2.
 | 
			
		||||
 | 
			
		||||
Last update Nov 2016.
 | 
			
		||||
Last update June 2017.
 | 
			
		||||
 | 
			
		||||
_Please do not send pull requests to the `master` branch which is reserved for releases._
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Description
 | 
			
		||||
This library provides data parallel C++ container classes with internal memory layout
 | 
			
		||||
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 | 
			
		||||
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 | 
			
		||||
array indices to both MPI tasks and SIMD processing elements.
 | 
			
		||||
 | 
			
		||||
* Identically shaped arrays then be processed with perfect data parallelisation.
 | 
			
		||||
* Such identically shaped arrays are called conformable arrays.
 | 
			
		||||
 | 
			
		||||
The transformation is based on the observation that Cartesian array processing involves
 | 
			
		||||
identical processing to be performed on different regions of the Cartesian array.
 | 
			
		||||
 | 
			
		||||
The library will both geometrically decompose into MPI tasks and across SIMD lanes.
 | 
			
		||||
Local vector loops are parallelised with OpenMP pragmas.
 | 
			
		||||
 | 
			
		||||
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
 | 
			
		||||
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
 | 
			
		||||
for most programmers.
 | 
			
		||||
 | 
			
		||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
 | 
			
		||||
Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
 | 
			
		||||
 | 
			
		||||
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
 | 
			
		||||
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 | 
			
		||||
 | 
			
		||||
MPI, OpenMP, and SIMD parallelism are present in the library.
 | 
			
		||||
Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Compilers
 | 
			
		||||
 | 
			
		||||
Intel ICPC v16.0.3 and later
 | 
			
		||||
@@ -56,35 +87,25 @@ When you file an issue, please go though the following checklist:
 | 
			
		||||
6. Attach the output of `make V=1`.
 | 
			
		||||
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
 | 
			
		||||
 | 
			
		||||
### Required libraries
 | 
			
		||||
Grid requires:
 | 
			
		||||
 | 
			
		||||
[GMP](https://gmplib.org/), 
 | 
			
		||||
 | 
			
		||||
### Description
 | 
			
		||||
This library provides data parallel C++ container classes with internal memory layout
 | 
			
		||||
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 | 
			
		||||
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 | 
			
		||||
array indices to both MPI tasks and SIMD processing elements.
 | 
			
		||||
[MPFR](http://www.mpfr.org/) 
 | 
			
		||||
 | 
			
		||||
* Identically shaped arrays then be processed with perfect data parallelisation.
 | 
			
		||||
* Such identically shaped arrays are called conformable arrays.
 | 
			
		||||
Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library.
 | 
			
		||||
 | 
			
		||||
The transformation is based on the observation that Cartesian array processing involves
 | 
			
		||||
identical processing to be performed on different regions of the Cartesian array.
 | 
			
		||||
Grid optionally uses:
 | 
			
		||||
 | 
			
		||||
The library will both geometrically decompose into MPI tasks and across SIMD lanes.
 | 
			
		||||
Local vector loops are parallelised with OpenMP pragmas.
 | 
			
		||||
[HDF5](https://support.hdfgroup.org/HDF5/)  
 | 
			
		||||
 | 
			
		||||
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
 | 
			
		||||
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
 | 
			
		||||
for most programmers.
 | 
			
		||||
[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support. 
 | 
			
		||||
 | 
			
		||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
 | 
			
		||||
Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
 | 
			
		||||
[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
 | 
			
		||||
 | 
			
		||||
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
 | 
			
		||||
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 | 
			
		||||
LAPACK either generic version or Intel MKL library.
 | 
			
		||||
 | 
			
		||||
MPI, OpenMP, and SIMD parallelism are present in the library.
 | 
			
		||||
Please see https://arxiv.org/abs/1512.03487 for more detail.
 | 
			
		||||
 | 
			
		||||
### Quick start
 | 
			
		||||
First, start by cloning the repository:
 | 
			
		||||
@@ -155,7 +176,6 @@ The following options can be use with the `--enable-comms=` option to target dif
 | 
			
		||||
| `none`         | no communications                                             |
 | 
			
		||||
| `mpi[-auto]`   | MPI communications                                            |
 | 
			
		||||
| `mpi3[-auto]`  | MPI communications using MPI 3 shared memory                  |
 | 
			
		||||
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | 
			
		||||
| `shmem `       | Cray SHMEM communications                                     |
 | 
			
		||||
 | 
			
		||||
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
 | 
			
		||||
@@ -173,7 +193,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | 
			
		||||
| `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | 
			
		||||
| `AVX2`      | AVX 2 (256 bit)                        |
 | 
			
		||||
| `AVX512`    | AVX 512 bit                            |
 | 
			
		||||
| `QPX`       | QPX (256 bit)                          |
 | 
			
		||||
| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     |
 | 
			
		||||
| `QPX`       | IBM QPX (256 bit)                      |
 | 
			
		||||
 | 
			
		||||
Alternatively, some CPU codenames can be directly used:
 | 
			
		||||
 | 
			
		||||
@@ -196,20 +217,204 @@ The following configuration is recommended for the Intel Knights Landing platfor
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=KNL        \
 | 
			
		||||
             --enable-comms=mpi-auto  \
 | 
			
		||||
             --with-gmp=<path>        \
 | 
			
		||||
             --with-mpfr=<path>       \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=icpc MPICXX=mpiicpc
 | 
			
		||||
```
 | 
			
		||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 | 
			
		||||
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=KNL        \
 | 
			
		||||
             --enable-comms=mpi       \
 | 
			
		||||
             --with-gmp=<path>        \
 | 
			
		||||
             --with-mpfr=<path>       \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=CC CC=cc
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
Knight's Landing with Intel Omnipath adapters with two adapters per node 
 | 
			
		||||
presently performs better with use of more than one rank per node, using shared memory 
 | 
			
		||||
for interior communication. This is the mpi3 communications implementation. 
 | 
			
		||||
We recommend four ranks per node for best performance, but optimum is local volume dependent.
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=KNL        \
 | 
			
		||||
             --enable-comms=mpi3-auto \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CC=icpc MPICXX=mpiicpc 
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Build setup for Intel Haswell Xeon platform
 | 
			
		||||
 | 
			
		||||
The following configuration is recommended for the Intel Haswell platform:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX2       \
 | 
			
		||||
             --enable-comms=mpi3-auto \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=icpc MPICXX=mpiicpc
 | 
			
		||||
```
 | 
			
		||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX2       \
 | 
			
		||||
             --enable-comms=mpi3      \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=CC CC=cc
 | 
			
		||||
```
 | 
			
		||||
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 | 
			
		||||
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
 | 
			
		||||
```
 | 
			
		||||
        export I_MPI_PIN=1
 | 
			
		||||
```
 | 
			
		||||
This is the default.
 | 
			
		||||
 | 
			
		||||
### Build setup for Intel Skylake Xeon platform
 | 
			
		||||
 | 
			
		||||
The following configuration is recommended for the Intel Skylake platform:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX512     \
 | 
			
		||||
             --enable-comms=mpi3      \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=mpiicpc
 | 
			
		||||
```
 | 
			
		||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX512     \
 | 
			
		||||
             --enable-comms=mpi3      \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=CC CC=cc
 | 
			
		||||
```
 | 
			
		||||
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 | 
			
		||||
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
 | 
			
		||||
``` 
 | 
			
		||||
        export I_MPI_PIN=1
 | 
			
		||||
```
 | 
			
		||||
This is the default. 
 | 
			
		||||
 | 
			
		||||
#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): 
 | 
			
		||||
 | 
			
		||||
mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
 | 
			
		||||
 | 
			
		||||
TBA
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Build setup for AMD EPYC / RYZEN
 | 
			
		||||
 | 
			
		||||
The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
 | 
			
		||||
So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
 | 
			
		||||
are common. Each chip within the module exposes a separate NUMA domain.
 | 
			
		||||
There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
 | 
			
		||||
MPI-3 is recommended with the use of four ranks per socket,
 | 
			
		||||
and 8 threads per rank. 
 | 
			
		||||
 | 
			
		||||
The following configuration is recommended for the AMD EPYC platform.
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX2       \
 | 
			
		||||
             --enable-comms=mpi3 \
 | 
			
		||||
             CXX=mpicxx 
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
 | 
			
		||||
This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
 | 
			
		||||
 | 
			
		||||
It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
 | 
			
		||||
shared memory to communicate within this node:
 | 
			
		||||
 | 
			
		||||
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 
 | 
			
		||||
 | 
			
		||||
Where omp_bind.sh does the following:
 | 
			
		||||
```
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
numanode=` expr $PMI_RANK % 8 `
 | 
			
		||||
basecore=`expr $numanode \* 16`
 | 
			
		||||
core0=`expr $basecore + 0 `
 | 
			
		||||
core1=`expr $basecore + 2 `
 | 
			
		||||
core2=`expr $basecore + 4 `
 | 
			
		||||
core3=`expr $basecore + 6 `
 | 
			
		||||
core4=`expr $basecore + 8 `
 | 
			
		||||
core5=`expr $basecore + 10 `
 | 
			
		||||
core6=`expr $basecore + 12 `
 | 
			
		||||
core7=`expr $basecore + 14 `
 | 
			
		||||
 | 
			
		||||
export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
 | 
			
		||||
echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
 | 
			
		||||
 | 
			
		||||
$@
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Performance:
 | 
			
		||||
 | 
			
		||||
#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): 
 | 
			
		||||
 | 
			
		||||
mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
 | 
			
		||||
 | 
			
		||||
TBA
 | 
			
		||||
 | 
			
		||||
### Build setup for BlueGene/Q
 | 
			
		||||
 | 
			
		||||
To be written...
 | 
			
		||||
 | 
			
		||||
### Build setup for ARM Neon
 | 
			
		||||
 | 
			
		||||
To be written...
 | 
			
		||||
 | 
			
		||||
### Build setup for laptops, other compilers, non-cluster builds
 | 
			
		||||
 | 
			
		||||
Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX),
 | 
			
		||||
and omit the enable-mkl flag. 
 | 
			
		||||
 | 
			
		||||
Single node builds are enabled with 
 | 
			
		||||
```
 | 
			
		||||
            --enable-comms=none
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
FFTW support that is not in the default search path may then enabled with
 | 
			
		||||
```
 | 
			
		||||
    --with-fftw=<installpath>
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -165,7 +165,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
  int ncall =1000;
 | 
			
		||||
  int ncall =500;
 | 
			
		||||
  if (1) {
 | 
			
		||||
    FGrid->Barrier();
 | 
			
		||||
    Dw.ZeroCounters();
 | 
			
		||||
@@ -303,6 +303,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
    }
 | 
			
		||||
    assert(sum < 1.0e-4);
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    if(1){
 | 
			
		||||
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
 | 
			
		||||
@@ -381,7 +382,22 @@ int main (int argc, char ** argv)
 | 
			
		||||
      }
 | 
			
		||||
      assert(error<1.0e-4);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  if(0){
 | 
			
		||||
    std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
 | 
			
		||||
    for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      sDw.Dhop(ssrc,sresult,0);
 | 
			
		||||
      PerformanceCounter Counter(i);
 | 
			
		||||
      Counter.Start();
 | 
			
		||||
      sDw.Dhop(ssrc,sresult,0);
 | 
			
		||||
      Counter.Stop();
 | 
			
		||||
      Counter.Report();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  { // Naive wilson dag implementation
 | 
			
		||||
 
 | 
			
		||||
@@ -55,21 +55,21 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
			
		||||
  uint64_t lmax=64;
 | 
			
		||||
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
 | 
			
		||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
			
		||||
  uint64_t lmax=96;
 | 
			
		||||
#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
 | 
			
		||||
  for(int lat=8;lat<=lmax;lat+=8){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeVec z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeVec z(&Grid);// random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid);// random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid);// random(pRNG,y);
 | 
			
		||||
      double a=2.0;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
      
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double bytes=3.0*vol*Nvec*sizeof(Real);
 | 
			
		||||
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -94,17 +94,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
			
		||||
  
 | 
			
		||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
			
		||||
  for(int lat=8;lat<=lmax;lat+=8){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeVec z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeVec z(&Grid);// random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid);// random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid);// random(pRNG,y);
 | 
			
		||||
      double a=2.0;
 | 
			
		||||
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
     
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double bytes=3.0*vol*Nvec*sizeof(Real);
 | 
			
		||||
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -129,20 +129,20 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
			
		||||
  for(int lat=8;lat<=lmax;lat+=8){
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeVec z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeVec z(&Grid);// random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid);// random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid);// random(pRNG,y);
 | 
			
		||||
      RealD a=2.0;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=2*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double bytes=2.0*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=vol*Nvec*1;// mul
 | 
			
		||||
      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 | 
			
		||||
 | 
			
		||||
@@ -166,17 +166,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
			
		||||
  for(int lat=8;lat<=lmax;lat+=8){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      LatticeVec z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid); //random(pRNG,y);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
      LatticeVec z(&Grid);// random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid);// random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid);// random(pRNG,y);
 | 
			
		||||
      RealD a=2.0;
 | 
			
		||||
      Real nn;      
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
      
 | 
			
		||||
      double bytes=vol*Nvec*sizeof(Real);
 | 
			
		||||
      double bytes=1.0*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -37,12 +37,12 @@ int main (int argc, char ** argv)
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
#define LMAX (64)
 | 
			
		||||
 | 
			
		||||
  int Nloop=20;
 | 
			
		||||
  int64_t Nloop=20;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int64_t threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
@@ -54,16 +54,16 @@ int main (int argc, char ** argv)
 | 
			
		||||
  for(int lat=2;lat<=LMAX;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeColourMatrix z(&Grid);// random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid);// random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid);// random(pRNG,y);
 | 
			
		||||
      LatticeColourMatrix z(&Grid); random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid); random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid); random(pRNG,y);
 | 
			
		||||
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
      for(int64_t i=0;i<Nloop;i++){
 | 
			
		||||
	x=x*y;
 | 
			
		||||
      }
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
@@ -86,17 +86,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
  for(int lat=2;lat<=LMAX;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeColourMatrix z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeColourMatrix z(&Grid); random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid); random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid); random(pRNG,y);
 | 
			
		||||
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
      for(int64_t i=0;i<Nloop;i++){
 | 
			
		||||
	z=x*y;
 | 
			
		||||
      }
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
@@ -117,17 +117,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
  for(int lat=2;lat<=LMAX;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeColourMatrix z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeColourMatrix z(&Grid); random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid); random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid); random(pRNG,y);
 | 
			
		||||
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
      for(int64_t i=0;i<Nloop;i++){
 | 
			
		||||
	mult(z,x,y);
 | 
			
		||||
      }
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
@@ -148,17 +148,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
  for(int lat=2;lat<=LMAX;lat+=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeColourMatrix z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeColourMatrix z(&Grid); random(pRNG,z);
 | 
			
		||||
      LatticeColourMatrix x(&Grid); random(pRNG,x);
 | 
			
		||||
      LatticeColourMatrix y(&Grid); random(pRNG,y);
 | 
			
		||||
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
      for(int64_t i=0;i<Nloop;i++){
 | 
			
		||||
	mac(z,x,y);
 | 
			
		||||
      }
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										22
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										22
									
								
								configure.ac
									
									
									
									
									
								
							@@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 | 
			
		||||
################ Get git info
 | 
			
		||||
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
 | 
			
		||||
 | 
			
		||||
################ Set flags
 | 
			
		||||
# do not move!
 | 
			
		||||
CXXFLAGS="-O3 $CXXFLAGS"
 | 
			
		||||
 | 
			
		||||
############### Checks for programs
 | 
			
		||||
AC_PROG_CXX
 | 
			
		||||
AC_PROG_RANLIB
 | 
			
		||||
@@ -27,7 +31,6 @@ AX_GXX_VERSION
 | 
			
		||||
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
 | 
			
		||||
      [version of g++ that will compile the code])
 | 
			
		||||
 | 
			
		||||
CXXFLAGS="-g $CXXFLAGS"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
############### Checks for typedefs, structures, and compiler characteristics
 | 
			
		||||
@@ -51,9 +54,14 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 | 
			
		||||
AC_CHECK_HEADERS(malloc.h)
 | 
			
		||||
AC_CHECK_HEADERS(endian.h)
 | 
			
		||||
AC_CHECK_HEADERS(execinfo.h)
 | 
			
		||||
AC_CHECK_HEADERS(numaif.h)
 | 
			
		||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 | 
			
		||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 | 
			
		||||
 | 
			
		||||
############## Standard libraries
 | 
			
		||||
AC_CHECK_LIB([m],[cos])
 | 
			
		||||
AC_CHECK_LIB([stdc++],[abort])
 | 
			
		||||
 | 
			
		||||
############### GMP and MPFR
 | 
			
		||||
AC_ARG_WITH([gmp],
 | 
			
		||||
    [AS_HELP_STRING([--with-gmp=prefix],
 | 
			
		||||
@@ -186,9 +194,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 | 
			
		||||
 | 
			
		||||
AC_SEARCH_LIBS([crc32], [z],
 | 
			
		||||
               [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
 | 
			
		||||
               [have_zlib=true],
 | 
			
		||||
               [have_zlib=true] [LIBS="${LIBS} -lz"],
 | 
			
		||||
	       [AC_MSG_ERROR(zlib library was not found in your system.)])
 | 
			
		||||
 | 
			
		||||
AC_SEARCH_LIBS([move_pages], [numa],
 | 
			
		||||
               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
 | 
			
		||||
               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
 | 
			
		||||
	       [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
 | 
			
		||||
 | 
			
		||||
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
 | 
			
		||||
               [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
 | 
			
		||||
               [have_hdf5=true]
 | 
			
		||||
@@ -241,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
 | 
			
		||||
        SIMD_FLAGS='';;
 | 
			
		||||
      KNL)
 | 
			
		||||
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
 | 
			
		||||
        AC_DEFINE([KNL],[1],[Knights landing processor])
 | 
			
		||||
        SIMD_FLAGS='-march=knl';;
 | 
			
		||||
      GEN)
 | 
			
		||||
        AC_DEFINE([GEN],[1],[generic vector code])
 | 
			
		||||
@@ -248,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
 | 
			
		||||
                           [generic SIMD vector width (in bytes)])
 | 
			
		||||
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
 | 
			
		||||
        SIMD_FLAGS='';;
 | 
			
		||||
      NEONv8)
 | 
			
		||||
        AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
 | 
			
		||||
        SIMD_FLAGS='-march=armv8-a';;
 | 
			
		||||
      QPX|BGQ)
 | 
			
		||||
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
 | 
			
		||||
        SIMD_FLAGS='';;
 | 
			
		||||
@@ -276,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
 | 
			
		||||
        SIMD_FLAGS='';;
 | 
			
		||||
      KNL)
 | 
			
		||||
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
 | 
			
		||||
        AC_DEFINE([KNL],[1],[Knights landing processor])
 | 
			
		||||
        SIMD_FLAGS='-xmic-avx512';;
 | 
			
		||||
      GEN)
 | 
			
		||||
        AC_DEFINE([GEN],[1],[generic vector code])
 | 
			
		||||
 
 | 
			
		||||
@@ -102,7 +102,14 @@ public:
 | 
			
		||||
#else
 | 
			
		||||
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    // First touch optimise in threaded loop
 | 
			
		||||
    uint8_t *cp = (uint8_t *)ptr;
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
#endif
 | 
			
		||||
    for(size_type n=0;n<bytes;n+=4096){
 | 
			
		||||
      cp[n]=0;
 | 
			
		||||
    }
 | 
			
		||||
    return ptr;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -190,6 +197,13 @@ public:
 | 
			
		||||
#else
 | 
			
		||||
    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
 | 
			
		||||
#endif
 | 
			
		||||
    size_type bytes = __n*sizeof(_Tp);
 | 
			
		||||
    uint8_t *cp = (uint8_t *)ptr;
 | 
			
		||||
    // One touch per 4k page, static OMP loop to catch same loop order
 | 
			
		||||
#pragma omp parallel for schedule(static)
 | 
			
		||||
    for(size_type n=0;n<bytes;n+=4096){
 | 
			
		||||
      cp[n]=0;
 | 
			
		||||
    }
 | 
			
		||||
    return ptr;
 | 
			
		||||
  }
 | 
			
		||||
  void deallocate(pointer __p, size_type) { 
 | 
			
		||||
 
 | 
			
		||||
@@ -37,7 +37,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <sys/ipc.h>
 | 
			
		||||
#include <sys/shm.h>
 | 
			
		||||
#include <sys/mman.h>
 | 
			
		||||
//#include <zlib.h>
 | 
			
		||||
#include <zlib.h>
 | 
			
		||||
#ifdef HAVE_NUMAIF_H
 | 
			
		||||
#include <numaif.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef SHM_HUGETLB
 | 
			
		||||
#define SHM_HUGETLB 04000
 | 
			
		||||
#endif
 | 
			
		||||
@@ -214,6 +217,25 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 | 
			
		||||
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
 | 
			
		||||
      assert(((uint64_t)ptr&0x3F)==0);
 | 
			
		||||
 | 
			
		||||
      // Try to force numa domain on the shm segment if we have numaif.h
 | 
			
		||||
#ifdef HAVE_NUMAIF_H
 | 
			
		||||
	int status;
 | 
			
		||||
	int flags=MPOL_MF_MOVE;
 | 
			
		||||
#ifdef KNL
 | 
			
		||||
	int nodes=1; // numa domain == MCDRAM
 | 
			
		||||
	// Find out if in SNC2,SNC4 mode ?
 | 
			
		||||
#else
 | 
			
		||||
	int nodes=r; // numa domain == MPI ID
 | 
			
		||||
#endif
 | 
			
		||||
	unsigned long count=1;
 | 
			
		||||
	for(uint64_t page=0;page<size;page+=4096){
 | 
			
		||||
	  void *pages = (void *) ( page + (uint64_t)ptr );
 | 
			
		||||
	  uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1;
 | 
			
		||||
	  ierr= move_pages(0,count, &pages,&nodes,&status,flags);
 | 
			
		||||
	  if (ierr && (page==0)) perror("numa relocate command failed");
 | 
			
		||||
	}
 | 
			
		||||
#endif
 | 
			
		||||
      ShmCommBufs[r] =ptr;
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -540,7 +540,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
      for(int i=0;i<Nblock;i++){
 | 
			
		||||
      for(int j=0;j<Nblock;j++){
 | 
			
		||||
	auto tmp = innerProduct(Left[i],Right[j]);
 | 
			
		||||
	vector_typeD rtmp = TensorRemove(tmp);
 | 
			
		||||
	//	vector_typeD rtmp = TensorRemove(tmp);
 | 
			
		||||
	auto rtmp = TensorRemove(tmp);
 | 
			
		||||
	mat_thread(i,j) += Reduce(rtmp);
 | 
			
		||||
      }}
 | 
			
		||||
    }}
 | 
			
		||||
 
 | 
			
		||||
@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
 | 
			
		||||
    // 4
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
#ifdef KNL
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
 | 
			
		||||
 
 | 
			
		||||
@@ -93,6 +93,8 @@ class ScalarImplTypes {
 | 
			
		||||
  class ScalarAdjMatrixImplTypes {
 | 
			
		||||
  public:
 | 
			
		||||
    typedef S Simd;
 | 
			
		||||
    typedef QCD::SU<N> Group;
 | 
			
		||||
    
 | 
			
		||||
    template <typename vtype>
 | 
			
		||||
    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
 | 
			
		||||
    template <typename vtype>
 | 
			
		||||
@@ -108,7 +110,7 @@ class ScalarImplTypes {
 | 
			
		||||
    typedef Field                PropagatorField;
 | 
			
		||||
 | 
			
		||||
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
 | 
			
		||||
      QCD::SU<N>::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
 | 
			
		||||
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline Field projectForce(Field& P) {return P;}
 | 
			
		||||
@@ -122,11 +124,11 @@ class ScalarImplTypes {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
 | 
			
		||||
      QCD::SU<N>::LieRandomize(pRNG, U);
 | 
			
		||||
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
 | 
			
		||||
      QCD::SU<N>::LieRandomize(pRNG, U, 0.01);
 | 
			
		||||
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
 | 
			
		||||
 
 | 
			
		||||
@@ -81,7 +81,7 @@ namespace Grid {
 | 
			
		||||
      phiStencil.HaloExchange(p, compressor);
 | 
			
		||||
      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
 | 
			
		||||
      phisquared = p*p;
 | 
			
		||||
      action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared;
 | 
			
		||||
      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
 | 
			
		||||
      for (int mu = 0; mu < Ndim; mu++) {
 | 
			
		||||
	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
 | 
			
		||||
	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
 | 
			
		||||
@@ -98,7 +98,7 @@ namespace Grid {
 | 
			
		||||
	      permute(temp2, *temp, permute_type);
 | 
			
		||||
	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
 | 
			
		||||
	    } else {
 | 
			
		||||
	      action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp);
 | 
			
		||||
	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
 | 
			
		||||
	    }
 | 
			
		||||
	  } else {
 | 
			
		||||
	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
 | 
			
		||||
@@ -113,7 +113,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    virtual void deriv(const Field &p, Field &force) {
 | 
			
		||||
      assert(p._grid->Nd() == Ndim);
 | 
			
		||||
      force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p;
 | 
			
		||||
      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
 | 
			
		||||
      // move this outside
 | 
			
		||||
      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
 | 
			
		||||
      phiStencil.HaloExchange(p, compressor);
 | 
			
		||||
 
 | 
			
		||||
@@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
 | 
			
		||||
 | 
			
		||||
  template < class ReaderClass > 
 | 
			
		||||
  void initialize(Reader<ReaderClass> &TheReader){
 | 
			
		||||
  	std::cout << "Reading HMC\n";
 | 
			
		||||
  	std::cout << GridLogMessage << "Reading HMC\n";
 | 
			
		||||
  	read(TheReader, "HMC", *this);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -253,6 +253,7 @@ class HMCResourceManager {
 | 
			
		||||
  template<class T, class... Types>
 | 
			
		||||
  void AddObservable(Types&&... Args){
 | 
			
		||||
    ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
 | 
			
		||||
    ObservablesList.back()->print_parameters();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
 | 
			
		||||
 
 | 
			
		||||
@@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
 | 
			
		||||
  typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
 | 
			
		||||
  using ObsBase::ObsBase; // for constructors
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // acquire resource
 | 
			
		||||
  virtual void initialize(){
 | 
			
		||||
    this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
 | 
			
		||||
@@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
 | 
			
		||||
  PlaquetteMod(): ObsBase(NoParameters()){}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template < class Impl >
 | 
			
		||||
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
 | 
			
		||||
  typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
 | 
			
		||||
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
 | 
			
		||||
  typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
 | 
			
		||||
  using ObsBase::ObsBase; // for constructors
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // acquire resource
 | 
			
		||||
  virtual void initialize(){
 | 
			
		||||
    this->ObservablePtr.reset(new TopologicalCharge<Impl>());
 | 
			
		||||
    this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
 | 
			
		||||
  }
 | 
			
		||||
  public:
 | 
			
		||||
  TopologicalChargeMod(): ObsBase(NoParameters()){}
 | 
			
		||||
  TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
 | 
			
		||||
  TopologicalChargeMod(): ObsBase(){}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}// QCD temporarily here
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -33,9 +33,45 @@ directory
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
struct TopologySmearingParameters : Serializable {
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
 | 
			
		||||
    int, steps,
 | 
			
		||||
    float, step_size,
 | 
			
		||||
    int, meas_interval,
 | 
			
		||||
    float, maxTau);
 | 
			
		||||
 | 
			
		||||
    TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
 | 
			
		||||
        steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
 | 
			
		||||
 | 
			
		||||
    template < class ReaderClass >
 | 
			
		||||
    TopologySmearingParameters(Reader<ReaderClass>& Reader){
 | 
			
		||||
        read(Reader, "Smearing", *this);  
 | 
			
		||||
    }  
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
struct TopologyObsParameters : Serializable {
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
 | 
			
		||||
      int, interval,
 | 
			
		||||
      bool, do_smearing,
 | 
			
		||||
      TopologySmearingParameters, Smearing);  
 | 
			
		||||
 | 
			
		||||
    TopologyObsParameters(int interval = 1, bool smearing = false):
 | 
			
		||||
        interval(interval), Smearing(smearing){}
 | 
			
		||||
 | 
			
		||||
    template <class ReaderClass >
 | 
			
		||||
      TopologyObsParameters(Reader<ReaderClass>& Reader){
 | 
			
		||||
        read(Reader, "TopologyMeasurement", *this);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// this is only defined for a gauge theory
 | 
			
		||||
template <class Impl>
 | 
			
		||||
class TopologicalCharge : public HmcObservable<typename Impl::Field> {
 | 
			
		||||
    TopologyObsParameters Pars;
 | 
			
		||||
 | 
			
		||||
 public:
 | 
			
		||||
    // here forces the Impl to be of gauge fields
 | 
			
		||||
    // if not the compiler will complain
 | 
			
		||||
@@ -44,21 +80,40 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
 | 
			
		||||
    // necessary for HmcObservable compatibility
 | 
			
		||||
    typedef typename Impl::Field Field;
 | 
			
		||||
 | 
			
		||||
    TopologicalCharge(int interval = 1, bool do_smearing = false):
 | 
			
		||||
        Pars(interval, do_smearing){}
 | 
			
		||||
    
 | 
			
		||||
    TopologicalCharge(TopologyObsParameters P):Pars(P){
 | 
			
		||||
        std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void TrajectoryComplete(int traj,
 | 
			
		||||
                            Field &U,
 | 
			
		||||
                            GridSerialRNG &sRNG,
 | 
			
		||||
                            GridParallelRNG &pRNG) {
 | 
			
		||||
 | 
			
		||||
    Real q = WilsonLoops<Impl>::TopologicalCharge(U);
 | 
			
		||||
 | 
			
		||||
    if (traj%Pars.interval == 0){
 | 
			
		||||
        // Smearing
 | 
			
		||||
        Field Usmear = U;
 | 
			
		||||
        int def_prec = std::cout.precision();
 | 
			
		||||
        
 | 
			
		||||
        if (Pars.do_smearing){
 | 
			
		||||
            // using wilson flow by default here
 | 
			
		||||
            WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
 | 
			
		||||
            WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
 | 
			
		||||
            Real T0   = WF.energyDensityPlaquette(Usmear);
 | 
			
		||||
            std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
 | 
			
		||||
                      << "T0                : [ " << traj << " ] "<< T0 << std::endl;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear);
 | 
			
		||||
        std::cout << GridLogMessage
 | 
			
		||||
            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
 | 
			
		||||
            << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
 | 
			
		||||
 | 
			
		||||
        std::cout.precision(def_prec);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
 | 
			
		||||
    if (maxTau - taus < epsilon){
 | 
			
		||||
        epsilon = maxTau-taus;
 | 
			
		||||
    }
 | 
			
		||||
    std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
 | 
			
		||||
    //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
 | 
			
		||||
    GaugeField Z(U._grid);
 | 
			
		||||
    GaugeField Zprime(U._grid);
 | 
			
		||||
    GaugeField tmp(U._grid), Uprime(U._grid);
 | 
			
		||||
@@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
 | 
			
		||||
    // adjust integration step
 | 
			
		||||
    
 | 
			
		||||
    taus += epsilon;
 | 
			
		||||
    std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
 | 
			
		||||
    //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
 | 
			
		||||
    
 | 
			
		||||
    epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
 | 
			
		||||
    std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 | 
			
		||||
    //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
 | 
			
		||||
    out = in;
 | 
			
		||||
    for (unsigned int step = 1; step <= Nstep; step++) {
 | 
			
		||||
        auto start = std::chrono::high_resolution_clock::now();
 | 
			
		||||
        std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
 | 
			
		||||
        evolve_step(out);
 | 
			
		||||
        auto end = std::chrono::high_resolution_clock::now();
 | 
			
		||||
        std::chrono::duration<double> diff = end - start;
 | 
			
		||||
@@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
 | 
			
		||||
    unsigned int step = 0;
 | 
			
		||||
    do{
 | 
			
		||||
        step++;
 | 
			
		||||
        std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
 | 
			
		||||
        //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
 | 
			
		||||
        evolve_step_adaptive(out, maxTau);
 | 
			
		||||
        std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
 | 
			
		||||
            << step << "  "
 | 
			
		||||
 
 | 
			
		||||
@@ -26,8 +26,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
//#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
template <class Gimpl> 
 | 
			
		||||
class FourierAcceleratedGaugeFixer  : public Gimpl {
 | 
			
		||||
@@ -186,3 +186,5 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
 | 
			
		||||
  }  
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>
 | 
			
		||||
 | 
			
		||||
    for (int a = 0; a < AdjointDimension; a++) {
 | 
			
		||||
      generator(a, Ta);
 | 
			
		||||
      auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
 | 
			
		||||
      pokeColour(h_out, tmp, a);
 | 
			
		||||
      pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName)
 | 
			
		||||
                      Hdf5Type<unsigned int>::type());
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Hdf5Reader::push(const std::string &s)
 | 
			
		||||
bool Hdf5Reader::push(const std::string &s)
 | 
			
		||||
{
 | 
			
		||||
  group_ = group_.openGroup(s);
 | 
			
		||||
  path_.push_back(s);
 | 
			
		||||
  
 | 
			
		||||
  return true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void Hdf5Reader::pop(void)
 | 
			
		||||
 
 | 
			
		||||
@@ -54,7 +54,7 @@ namespace Grid
 | 
			
		||||
  public:
 | 
			
		||||
    Hdf5Reader(const std::string &fileName);
 | 
			
		||||
    virtual ~Hdf5Reader(void) = default;
 | 
			
		||||
    void push(const std::string &s);
 | 
			
		||||
    bool push(const std::string &s);
 | 
			
		||||
    void pop(void);
 | 
			
		||||
    template <typename U>
 | 
			
		||||
    void readDefault(const std::string &s, U &output);
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
@@ -6,8 +6,9 @@
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: neo <cossu@post.kek.jp>
 | 
			
		||||
    Author: Nils Meyer <nils.meyer@ur.de>
 | 
			
		||||
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    Author: neo <cossu@post.kek.jp>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
@@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
/*! @file Grid_sse4.h
 | 
			
		||||
  @brief Optimization libraries for NEON (ARM) instructions set ARMv8
 | 
			
		||||
 | 
			
		||||
  Experimental - Using intrinsics - DEVELOPING! 
 | 
			
		||||
/*
 | 
			
		||||
 | 
			
		||||
  ARMv8 NEON intrinsics layer by
 | 
			
		||||
 | 
			
		||||
  Nils Meyer <nils.meyer@ur.de>,
 | 
			
		||||
  University of Regensburg, Germany
 | 
			
		||||
  SFB/TRR55
 | 
			
		||||
 | 
			
		||||
*/
 | 
			
		||||
// Time-stamp: <2015-07-10 17:45:09 neo>
 | 
			
		||||
//----------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
#ifndef GEN_SIMD_WIDTH
 | 
			
		||||
#define GEN_SIMD_WIDTH 16u
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#include "Grid_generic_types.h"
 | 
			
		||||
#include <arm_neon.h>
 | 
			
		||||
 | 
			
		||||
// ARMv8 supports double precision
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  template<class vtype>
 | 
			
		||||
@@ -46,14 +53,18 @@ namespace Optimization {
 | 
			
		||||
    float32x4_t f;
 | 
			
		||||
    vtype v;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  union u128f {
 | 
			
		||||
    float32x4_t v;
 | 
			
		||||
    float f[4];
 | 
			
		||||
  };
 | 
			
		||||
  union u128d {
 | 
			
		||||
    float64x2_t v;
 | 
			
		||||
    double f[4];
 | 
			
		||||
    double f[2];
 | 
			
		||||
  };
 | 
			
		||||
  // half precision
 | 
			
		||||
  union u128h {
 | 
			
		||||
    float16x8_t v;
 | 
			
		||||
    uint16_t f[8];
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vsplat{
 | 
			
		||||
@@ -64,20 +75,20 @@ namespace Optimization {
 | 
			
		||||
    }
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float32x4_t operator()(float a){
 | 
			
		||||
      return vld1q_dup_f32(&a);
 | 
			
		||||
      return vdupq_n_f32(a);
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    inline float32x4_t operator()(double a, double b){
 | 
			
		||||
      float tmp[4]={(float)a,(float)b,(float)a,(float)b};
 | 
			
		||||
      return vld1q_f32(tmp);
 | 
			
		||||
    inline float64x2_t operator()(double a, double b){
 | 
			
		||||
      double tmp[2]={a,b};
 | 
			
		||||
      return vld1q_f64(tmp);
 | 
			
		||||
    }
 | 
			
		||||
    //Real double
 | 
			
		||||
    inline float32x4_t operator()(double a){
 | 
			
		||||
      return vld1q_dup_f32(&a);
 | 
			
		||||
    //Real double // N:tbc
 | 
			
		||||
    inline float64x2_t operator()(double a){
 | 
			
		||||
      return vdupq_n_f64(a);
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    //Integer // N:tbc
 | 
			
		||||
    inline uint32x4_t operator()(Integer a){
 | 
			
		||||
      return vld1q_dup_u32(&a);
 | 
			
		||||
      return vdupq_n_u32(a);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
@@ -87,8 +98,8 @@ namespace Optimization {
 | 
			
		||||
      vst1q_f32(F, a);
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(float32x4_t a, double* D){
 | 
			
		||||
      vst1q_f32((float*)D, a);
 | 
			
		||||
    inline void operator()(float64x2_t a, double* D){
 | 
			
		||||
      vst1q_f64(D, a);
 | 
			
		||||
    }
 | 
			
		||||
    //Integer
 | 
			
		||||
    inline void operator()(uint32x4_t a, Integer* I){
 | 
			
		||||
@@ -97,49 +108,49 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Vstream{
 | 
			
		||||
    //Float
 | 
			
		||||
  struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
 | 
			
		||||
    //Float // N:generic
 | 
			
		||||
    inline void operator()(float * a, float32x4_t b){
 | 
			
		||||
    
 | 
			
		||||
      memcpy(a,&b,4*sizeof(float));
 | 
			
		||||
    }
 | 
			
		||||
    //Double
 | 
			
		||||
    inline void operator()(double * a, float32x4_t b){
 | 
			
		||||
  
 | 
			
		||||
    //Double // N:generic
 | 
			
		||||
    inline void operator()(double * a, float64x2_t b){
 | 
			
		||||
      memcpy(a,&b,2*sizeof(double));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // Nils: Vset untested; not used currently in Grid at all;
 | 
			
		||||
  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
 | 
			
		||||
  struct Vset{
 | 
			
		||||
    // Complex float 
 | 
			
		||||
    // Complex float // N:ok
 | 
			
		||||
    inline float32x4_t operator()(Grid::ComplexF *a){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
 | 
			
		||||
      return vld1q_f32(tmp);
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double 
 | 
			
		||||
    inline float32x4_t operator()(Grid::ComplexD *a){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
    // Complex double // N:ok
 | 
			
		||||
    inline float64x2_t operator()(Grid::ComplexD *a){
 | 
			
		||||
      double tmp[2]={a[0].imag(),a[0].real()};
 | 
			
		||||
      return vld1q_f64(tmp);
 | 
			
		||||
    }
 | 
			
		||||
    // Real float 
 | 
			
		||||
    // Real float // N:ok
 | 
			
		||||
    inline float32x4_t operator()(float *a){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      float tmp[4]={a[3],a[2],a[1],a[0]};
 | 
			
		||||
      return vld1q_f32(tmp);
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline float32x4_t operator()(double *a){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
    // Real double // N:ok
 | 
			
		||||
    inline float64x2_t operator()(double *a){
 | 
			
		||||
      double tmp[2]={a[1],a[0]};
 | 
			
		||||
      return vld1q_f64(tmp);
 | 
			
		||||
    }
 | 
			
		||||
    // Integer
 | 
			
		||||
    // Integer // N:ok
 | 
			
		||||
    inline uint32x4_t operator()(Integer *a){
 | 
			
		||||
      uint32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
      return vld1q_dup_u32(a);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // N:leaving as is
 | 
			
		||||
  template <typename Out_type, typename In_type>
 | 
			
		||||
  struct Reduce{
 | 
			
		||||
    //Need templated class to overload output type
 | 
			
		||||
@@ -184,26 +195,98 @@ namespace Optimization {
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultRealPart{
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
      float32x4_t re = vtrn1q_f32(a, a);
 | 
			
		||||
      return vmulq_f32(re, b);
 | 
			
		||||
    }
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
 | 
			
		||||
      float64x2_t re = vzip1q_f64(a, a);
 | 
			
		||||
      return vmulq_f64(re, b);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MaddRealPart{
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
 | 
			
		||||
      float32x4_t re = vtrn1q_f32(a, a);
 | 
			
		||||
      return vfmaq_f32(c, re, b);
 | 
			
		||||
    }
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
 | 
			
		||||
      float64x2_t re = vzip1q_f64(a, a);
 | 
			
		||||
      return vfmaq_f64(c, re, b);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Div{
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
      return vdivq_f32(a, b);
 | 
			
		||||
    }
 | 
			
		||||
    // Real double
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
 | 
			
		||||
      return vdivq_f64(a, b);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct MultComplex{
 | 
			
		||||
    // Complex float
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
 | 
			
		||||
      float32x4_t r0, r1, r2, r3, r4;
 | 
			
		||||
 | 
			
		||||
      // a = ar ai Ar Ai
 | 
			
		||||
      // b = br bi Br Bi
 | 
			
		||||
      // collect real/imag part, negate bi and Bi
 | 
			
		||||
      r0 = vtrn1q_f32(b, b);       //  br  br  Br  Br
 | 
			
		||||
      r1 = vnegq_f32(b);           // -br -bi -Br -Bi
 | 
			
		||||
      r2 = vtrn2q_f32(b, r1);      //  bi -bi  Bi -Bi
 | 
			
		||||
 | 
			
		||||
      // the fun part
 | 
			
		||||
      r3 = vmulq_f32(r2, a);       //  bi*ar -bi*ai ...
 | 
			
		||||
      r4 = vrev64q_f32(r3);        // -bi*ai  bi*ar ...
 | 
			
		||||
 | 
			
		||||
      // fma(a,b,c) = a+b*c
 | 
			
		||||
      return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...
 | 
			
		||||
 | 
			
		||||
      // no fma, use mul and add
 | 
			
		||||
      //float32x4_t r5;
 | 
			
		||||
      //r5 = vmulq_f32(r0, a);
 | 
			
		||||
      //return vaddq_f32(r4, r5);
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
 | 
			
		||||
      float32x4_t foo;
 | 
			
		||||
      return foo;
 | 
			
		||||
 | 
			
		||||
      float64x2_t r0, r1, r2, r3, r4;
 | 
			
		||||
 | 
			
		||||
      // b = br bi
 | 
			
		||||
      // collect real/imag part, negate bi
 | 
			
		||||
      r0 = vtrn1q_f64(b, b);       //  br  br
 | 
			
		||||
      r1 = vnegq_f64(b);           // -br -bi
 | 
			
		||||
      r2 = vtrn2q_f64(b, r1);      //  bi -bi
 | 
			
		||||
 | 
			
		||||
      // the fun part
 | 
			
		||||
      r3 = vmulq_f64(r2, a);       //  bi*ar -bi*ai
 | 
			
		||||
      r4 = vextq_f64(r3,r3,1);     // -bi*ai  bi*ar
 | 
			
		||||
 | 
			
		||||
      // fma(a,b,c) = a+b*c
 | 
			
		||||
      return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi
 | 
			
		||||
 | 
			
		||||
      // no fma, use mul and add
 | 
			
		||||
      //float64x2_t r5;
 | 
			
		||||
      //r5 = vmulq_f64(r0, a);
 | 
			
		||||
      //return vaddq_f64(r4, r5);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Mult{
 | 
			
		||||
    // Real float
 | 
			
		||||
    inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
 | 
			
		||||
      return vaddq_f32(vmulq_f32(b,c),a);
 | 
			
		||||
      //return vaddq_f32(vmulq_f32(b,c),a);
 | 
			
		||||
      return vfmaq_f32(a, b, c);
 | 
			
		||||
    }
 | 
			
		||||
    inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
 | 
			
		||||
      return vaddq_f64(vmulq_f64(b,c),a);
 | 
			
		||||
      //return vaddq_f64(vmulq_f64(b,c),a);
 | 
			
		||||
      return vfmaq_f64(a, b, c);
 | 
			
		||||
    }
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
			
		||||
      return vmulq_f32(a,b);
 | 
			
		||||
@@ -221,74 +304,259 @@ namespace Optimization {
 | 
			
		||||
  struct Conj{
 | 
			
		||||
    // Complex single
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t in){
 | 
			
		||||
      return in;
 | 
			
		||||
      // ar ai br bi -> ar -ai br -bi
 | 
			
		||||
      float32x4_t r0, r1;
 | 
			
		||||
      r0 = vnegq_f32(in);        // -ar -ai -br -bi
 | 
			
		||||
      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
 | 
			
		||||
      return vtrn1q_f32(in, r1); //  ar -ai  br -bi
 | 
			
		||||
    }
 | 
			
		||||
    // Complex double
 | 
			
		||||
    //inline float32x4_t operator()(float32x4_t in){
 | 
			
		||||
    // return 0;
 | 
			
		||||
    //}
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t in){
 | 
			
		||||
 | 
			
		||||
      float64x2_t r0, r1;
 | 
			
		||||
      r0 = vextq_f64(in, in, 1);    //  ai  ar
 | 
			
		||||
      r1 = vnegq_f64(r0);           // -ai -ar
 | 
			
		||||
      return vextq_f64(r0, r1, 1);  //  ar -ai
 | 
			
		||||
    }
 | 
			
		||||
    // do not define for integer input
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesMinusI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
 | 
			
		||||
      return in;
 | 
			
		||||
      // ar ai br bi -> ai -ar ai -br
 | 
			
		||||
      float32x4_t r0, r1;
 | 
			
		||||
      r0 = vnegq_f32(in);        // -ar -ai -br -bi
 | 
			
		||||
      r1 = vrev64q_f32(in);      //  ai  ar  bi  br
 | 
			
		||||
      return vtrn1q_f32(r1, r0); //  ar -ai  br -bi
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
 | 
			
		||||
    //  return in;
 | 
			
		||||
    //}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
 | 
			
		||||
      // a ib -> b -ia
 | 
			
		||||
      float64x2_t tmp;
 | 
			
		||||
      tmp = vnegq_f64(in);
 | 
			
		||||
      return vextq_f64(in, tmp, 1);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct TimesI{
 | 
			
		||||
    //Complex single
 | 
			
		||||
    inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
 | 
			
		||||
      //need shuffle
 | 
			
		||||
      return in;
 | 
			
		||||
      // ar ai br bi -> -ai ar -bi br
 | 
			
		||||
      float32x4_t r0, r1;
 | 
			
		||||
      r0 = vnegq_f32(in);        // -ar -ai -br -bi
 | 
			
		||||
      r1 = vrev64q_f32(r0);      // -ai -ar -bi -br
 | 
			
		||||
      return vtrn1q_f32(r1, in); // -ai  ar -bi  br
 | 
			
		||||
    }
 | 
			
		||||
    //Complex double
 | 
			
		||||
    //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
 | 
			
		||||
    //  return 0;
 | 
			
		||||
    //}
 | 
			
		||||
    inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
 | 
			
		||||
      // a ib -> -b ia
 | 
			
		||||
      float64x2_t tmp;
 | 
			
		||||
      tmp = vnegq_f64(in);
 | 
			
		||||
      return vextq_f64(tmp, in, 1);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Permute{
 | 
			
		||||
 | 
			
		||||
    static inline float32x4_t Permute0(float32x4_t in){ // N:ok
 | 
			
		||||
      // AB CD -> CD AB
 | 
			
		||||
      return vextq_f32(in, in, 2);
 | 
			
		||||
    };
 | 
			
		||||
    static inline float32x4_t Permute1(float32x4_t in){ // N:ok
 | 
			
		||||
      // AB CD -> BA DC
 | 
			
		||||
      return vrev64q_f32(in);
 | 
			
		||||
    };
 | 
			
		||||
    static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline float64x2_t Permute0(float64x2_t in){ // N:ok
 | 
			
		||||
      // AB -> BA
 | 
			
		||||
      return vextq_f64(in, in, 1);
 | 
			
		||||
    };
 | 
			
		||||
    static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
    static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
    static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: // AB CD -> AB CD
 | 
			
		||||
        return tRotate<0>(in);
 | 
			
		||||
        break;
 | 
			
		||||
      case 1: // AB CD -> BC DA
 | 
			
		||||
        return tRotate<1>(in);
 | 
			
		||||
        break;
 | 
			
		||||
      case 2: // AB CD -> CD AB
 | 
			
		||||
        return tRotate<2>(in);
 | 
			
		||||
        break;
 | 
			
		||||
      case 3: // AB CD -> DA BC
 | 
			
		||||
        return tRotate<3>(in);
 | 
			
		||||
        break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
 | 
			
		||||
      switch(n){
 | 
			
		||||
      case 0: // AB -> AB
 | 
			
		||||
        return tRotate<0>(in);
 | 
			
		||||
        break;
 | 
			
		||||
      case 1: // AB -> BA
 | 
			
		||||
        return tRotate<1>(in);
 | 
			
		||||
        break;
 | 
			
		||||
      default: assert(0);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
// working, but no restriction on n
 | 
			
		||||
//    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
 | 
			
		||||
//    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
 | 
			
		||||
 | 
			
		||||
// restriction on n
 | 
			
		||||
    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
 | 
			
		||||
    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct PrecisionChange {
 | 
			
		||||
 | 
			
		||||
    static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
 | 
			
		||||
      float16x4_t h = vcvt_f16_f32(a);
 | 
			
		||||
      return vcvt_high_f16_f32(h, b);
 | 
			
		||||
    }
 | 
			
		||||
    static inline void  HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
 | 
			
		||||
      sb = vcvt_high_f32_f16(h);
 | 
			
		||||
      // there is no direct conversion from lower float32x4_t to float64x2_t
 | 
			
		||||
      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
 | 
			
		||||
      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
 | 
			
		||||
      // workaround for clang
 | 
			
		||||
      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
 | 
			
		||||
      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
 | 
			
		||||
      sa = vcvt_high_f32_f16(h1);
 | 
			
		||||
    }
 | 
			
		||||
    static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
 | 
			
		||||
      float32x2_t s = vcvt_f32_f64(a);
 | 
			
		||||
      return vcvt_high_f32_f64(s, b);
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
 | 
			
		||||
      b = vcvt_high_f64_f32(s);
 | 
			
		||||
      // there is no direct conversion from lower float32x4_t to float64x2_t
 | 
			
		||||
      float32x4_t s1 = vextq_f32(s, s, 2);
 | 
			
		||||
      a = vcvt_high_f64_f32(s1);
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
    static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
 | 
			
		||||
      float32x4_t s1 = DtoS(a, b);
 | 
			
		||||
      float32x4_t s2 = DtoS(c, d);
 | 
			
		||||
      return StoH(s1, s2);
 | 
			
		||||
    }
 | 
			
		||||
    static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
 | 
			
		||||
      float32x4_t s1, s2;
 | 
			
		||||
      HtoS(h, s1, s2);
 | 
			
		||||
      StoD(s1, a, b);
 | 
			
		||||
      StoD(s2, c, d);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Exchange support
 | 
			
		||||
 | 
			
		||||
  struct Exchange{
 | 
			
		||||
    static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
 | 
			
		||||
      // in1: ABCD -> out1: ABEF
 | 
			
		||||
      // in2: EFGH -> out2: CDGH
 | 
			
		||||
 | 
			
		||||
      // z: CDAB
 | 
			
		||||
      float32x4_t z = vextq_f32(in1, in1, 2);
 | 
			
		||||
      // out1: ABEF
 | 
			
		||||
      out1 = vextq_f32(z, in2, 2);
 | 
			
		||||
 | 
			
		||||
      // z: GHEF
 | 
			
		||||
      z = vextq_f32(in2, in2, 2);
 | 
			
		||||
      // out2: CDGH
 | 
			
		||||
      out2 = vextq_f32(in1, z, 2);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
 | 
			
		||||
      // in1: ABCD -> out1: AECG
 | 
			
		||||
      // in2: EFGH -> out2: BFDH
 | 
			
		||||
      out1 = vtrn1q_f32(in1, in2);
 | 
			
		||||
      out2 = vtrn2q_f32(in1, in2);
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    // double precision
 | 
			
		||||
    static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
 | 
			
		||||
      // in1: AB -> out1: AC
 | 
			
		||||
      // in2: CD -> out2: BD
 | 
			
		||||
      out1 = vzip1q_f64(in1, in2);
 | 
			
		||||
      out2 = vzip2q_f64(in1, in2);
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
  // Some Template specialization
 | 
			
		||||
  template < typename vtype > 
 | 
			
		||||
    void permute(vtype &a, vtype b, int perm) {
 | 
			
		||||
 | 
			
		||||
  }; 
 | 
			
		||||
 | 
			
		||||
  //Complex float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
 | 
			
		||||
    return 0;
 | 
			
		||||
    float32x4_t v1; // two complex
 | 
			
		||||
    v1 = Optimization::Permute::Permute0(in);
 | 
			
		||||
    v1 = vaddq_f32(v1,in);
 | 
			
		||||
    u128f conv;    conv.v=v1;
 | 
			
		||||
    return Grid::ComplexF(conv.f[0],conv.f[1]);
 | 
			
		||||
  }
 | 
			
		||||
  //Real float Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
 | 
			
		||||
    float32x2_t high = vget_high_f32(in);
 | 
			
		||||
    float32x2_t low = vget_low_f32(in);
 | 
			
		||||
    float32x2_t tmp = vadd_f32(low, high);
 | 
			
		||||
    float32x2_t sum = vpadd_f32(tmp, tmp);
 | 
			
		||||
    return vget_lane_f32(sum,0);
 | 
			
		||||
    return vaddvq_f32(in);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //Complex double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  template<> // N:by Boyle
 | 
			
		||||
  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
 | 
			
		||||
    return 0;
 | 
			
		||||
    u128d conv; conv.v = in;
 | 
			
		||||
    return Grid::ComplexD(conv.f[0],conv.f[1]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //Real double Reduce
 | 
			
		||||
  template<>
 | 
			
		||||
  inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
 | 
			
		||||
    float64x2_t sum = vpaddq_f64(in, in);
 | 
			
		||||
    return vgetq_lane_f64(sum,0);
 | 
			
		||||
    return vaddvq_f64(in);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //Integer Reduce
 | 
			
		||||
@@ -302,8 +570,9 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
 | 
			
		||||
  typedef float16x8_t  SIMD_Htype; // Half precision type
 | 
			
		||||
  typedef float32x4_t  SIMD_Ftype; // Single precision type
 | 
			
		||||
  typedef float64x2_t  SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef uint32x4_t   SIMD_Itype; // Integer type
 | 
			
		||||
@@ -312,13 +581,6 @@ namespace Grid {
 | 
			
		||||
  inline void prefetch_HINT_T0(const char *ptr){};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Gpermute function
 | 
			
		||||
  template < typename VectorSIMD > 
 | 
			
		||||
    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
 | 
			
		||||
    Optimization::permute(y.v,b.v,perm);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Function name aliases
 | 
			
		||||
  typedef Optimization::Vsplat   VsplatSIMD;
 | 
			
		||||
  typedef Optimization::Vstore   VstoreSIMD;
 | 
			
		||||
@@ -332,8 +594,11 @@ namespace Grid {
 | 
			
		||||
  // Arithmetic operations
 | 
			
		||||
  typedef Optimization::Sum         SumSIMD;
 | 
			
		||||
  typedef Optimization::Sub         SubSIMD;
 | 
			
		||||
  typedef Optimization::Div         DivSIMD;
 | 
			
		||||
  typedef Optimization::Mult        MultSIMD;
 | 
			
		||||
  typedef Optimization::MultComplex MultComplexSIMD;
 | 
			
		||||
  typedef Optimization::MultRealPart MultRealPartSIMD;
 | 
			
		||||
  typedef Optimization::MaddRealPart MaddRealPartSIMD;
 | 
			
		||||
  typedef Optimization::Conj        ConjSIMD;
 | 
			
		||||
  typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
			
		||||
  typedef Optimization::TimesI      TimesISIMD;
 | 
			
		||||
 
 | 
			
		||||
@@ -53,7 +53,7 @@ directory
 | 
			
		||||
#if defined IMCI
 | 
			
		||||
#include "Grid_imci.h"
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef NEONv8
 | 
			
		||||
#ifdef NEONV8
 | 
			
		||||
#include "Grid_neon.h"
 | 
			
		||||
#endif
 | 
			
		||||
#if defined QPX
 | 
			
		||||
 
 | 
			
		||||
@@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
int LebesgueOrder::UseLebesgueOrder;
 | 
			
		||||
#ifdef KNL
 | 
			
		||||
std::vector<int> LebesgueOrder::Block({8,2,2,2});
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
std::vector<int> LebesgueOrder::Block({2,2,2,2});
 | 
			
		||||
#endif
 | 
			
		||||
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
 | 
			
		||||
  n--;           // 1000 0011 --> 1000 0010
 | 
			
		||||
  n |= n >> 1;   // 1000 0010 | 0100 0001 = 1100 0011
 | 
			
		||||
@@ -51,8 +54,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
 | 
			
		||||
  if ( Block[0]==0) ZGraph();
 | 
			
		||||
  else if ( Block[1]==0) NoBlocking();
 | 
			
		||||
  else CartesianBlocking();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
  if (0) {
 | 
			
		||||
    std::cout << "Thread Interleaving"<<std::endl;
 | 
			
		||||
    ThreadInterleave();
 | 
			
		||||
  } 
 | 
			
		||||
}
 | 
			
		||||
void LebesgueOrder::ThreadInterleave(void)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<IndexInteger> reorder = _LebesgueReorder;
 | 
			
		||||
  std::vector<IndexInteger> throrder;
 | 
			
		||||
  int vol = _LebesgueReorder.size();
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int blockbits=3;
 | 
			
		||||
  int blocklen = 8;
 | 
			
		||||
  int msk      = 0x7;
 | 
			
		||||
 | 
			
		||||
  for(int t=0;t<threads;t++){
 | 
			
		||||
    for(int ss=0;ss<vol;ss++){
 | 
			
		||||
       if ( ( ss >> blockbits) % threads == t ) { 
 | 
			
		||||
         throrder.push_back(reorder[ss]);
 | 
			
		||||
       }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  _LebesgueReorder = throrder;
 | 
			
		||||
}
 | 
			
		||||
void LebesgueOrder::NoBlocking(void) 
 | 
			
		||||
{
 | 
			
		||||
  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -70,6 +70,8 @@ namespace Grid {
 | 
			
		||||
		  std::vector<IndexInteger> & xi,
 | 
			
		||||
		  std::vector<IndexInteger> &dims);
 | 
			
		||||
 | 
			
		||||
    void ThreadInterleave(void);
 | 
			
		||||
 | 
			
		||||
  private:
 | 
			
		||||
    std::vector<IndexInteger> _LebesgueReorder;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
 | 
			
		||||
strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
 | 
			
		||||
                 const iVector<vtype,N> * __restrict__ rhs,
 | 
			
		||||
                 const iScalar<mtype> * __restrict__ lhs){
 | 
			
		||||
    mult(ret,lhs,rhs);
 | 
			
		||||
    for(int c1=0;c1<N;c1++){
 | 
			
		||||
        mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
 | 
			
		||||
    }                 
 | 
			
		||||
}
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -45,7 +45,7 @@ using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
class MagLogger : public HmcObservable<typename Impl::Field> {
 | 
			
		||||
class MagMeas : public HmcObservable<typename Impl::Field> {
 | 
			
		||||
public:
 | 
			
		||||
  typedef typename Impl::Field Field;
 | 
			
		||||
  typedef typename Impl::Simd::scalar_type Trace;
 | 
			
		||||
@@ -72,13 +72,13 @@ private:
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
class MagMod: public ObservableModule<MagLogger<Impl>, NoParameters>{
 | 
			
		||||
  typedef ObservableModule<MagLogger<Impl>, NoParameters> ObsBase;
 | 
			
		||||
class MagMod: public ObservableModule<MagMeas<Impl>, NoParameters>{
 | 
			
		||||
  typedef ObservableModule<MagMeas<Impl>, NoParameters> ObsBase;
 | 
			
		||||
  using ObsBase::ObsBase; // for constructors
 | 
			
		||||
  
 | 
			
		||||
  // acquire resource
 | 
			
		||||
  virtual void initialize(){
 | 
			
		||||
    this->ObservablePtr.reset(new MagLogger<Impl>());
 | 
			
		||||
    this->ObservablePtr.reset(new MagMeas<Impl>());
 | 
			
		||||
  }
 | 
			
		||||
public:
 | 
			
		||||
  MagMod(): ObsBase(NoParameters()){}
 | 
			
		||||
 
 | 
			
		||||
@@ -66,7 +66,14 @@ int main(int argc, char **argv) {
 | 
			
		||||
  typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
 | 
			
		||||
  typedef TopologicalChargeMod<HMCWrapper::ImplPolicy> QObs;
 | 
			
		||||
  TheHMC.Resources.AddObservable<PlaqObs>();
 | 
			
		||||
  TheHMC.Resources.AddObservable<QObs>();
 | 
			
		||||
  TopologyObsParameters TopParams;
 | 
			
		||||
  TopParams.interval = 5;
 | 
			
		||||
  TopParams.do_smearing = true;
 | 
			
		||||
  TopParams.Smearing.steps = 200;
 | 
			
		||||
  TopParams.Smearing.step_size = 0.01;
 | 
			
		||||
  TopParams.Smearing.meas_interval = 50;
 | 
			
		||||
  TopParams.Smearing.maxTau = 2.0; 
 | 
			
		||||
  TheHMC.Resources.AddObservable<QObs>(TopParams);
 | 
			
		||||
  //////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user