mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-15 14:27:06 +01:00
Compare commits
2 Commits
feature/CG
...
v0.6.0
Author | SHA1 | Date | |
---|---|---|---|
c363bdd784 | |||
446c768cd3 |
4
.gitignore
vendored
4
.gitignore
vendored
@ -47,9 +47,7 @@ Config.h.in
|
|||||||
config.log
|
config.log
|
||||||
config.status
|
config.status
|
||||||
.deps
|
.deps
|
||||||
Make.inc
|
*.inc
|
||||||
eigen.inc
|
|
||||||
Eigen.inc
|
|
||||||
|
|
||||||
# http://www.gnu.org/software/autoconf #
|
# http://www.gnu.org/software/autoconf #
|
||||||
########################################
|
########################################
|
||||||
|
@ -1,12 +1,10 @@
|
|||||||
# additional include paths necessary to compile the C++ library
|
# additional include paths necessary to compile the C++ library
|
||||||
SUBDIRS = lib benchmarks tests
|
SUBDIRS = lib benchmarks tests
|
||||||
|
|
||||||
include $(top_srcdir)/doxygen.inc
|
.PHONY: tests
|
||||||
|
|
||||||
tests: all
|
tests: all
|
||||||
$(MAKE) -C tests tests
|
$(MAKE) -C tests tests
|
||||||
|
|
||||||
.PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
|
|
||||||
|
|
||||||
AM_CXXFLAGS += -I$(top_builddir)/include
|
AM_CXXFLAGS += -I$(top_builddir)/include
|
||||||
ACLOCAL_AMFLAGS = -I m4
|
ACLOCAL_AMFLAGS = -I m4
|
||||||
|
@ -116,15 +116,13 @@ If you want to build all the tests at once just use `make tests`.
|
|||||||
- `--with-fftw=<path>`: look for FFTW in the UNIX prefix `<path>`
|
- `--with-fftw=<path>`: look for FFTW in the UNIX prefix `<path>`
|
||||||
- `--enable-lapack[=<path>]`: enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
|
- `--enable-lapack[=<path>]`: enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
|
||||||
- `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
|
- `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
|
||||||
- `--enable-numa`: enable NUMA first touch optimisation
|
- `--enable-numa`: ???
|
||||||
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
|
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
|
||||||
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
|
|
||||||
- `--enable-precision={single|double}`: set the default precision (default: `double`).
|
- `--enable-precision={single|double}`: set the default precision (default: `double`).
|
||||||
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
|
- `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
|
||||||
- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
|
- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
|
||||||
- `--disable-timers`: disable system dependent high-resolution timers.
|
- `--disable-timers`: disable system dependent high-resolution timers.
|
||||||
- `--enable-chroma`: enable Chroma regression tests.
|
- `--enable-chroma`: enable Chroma regression tests.
|
||||||
- `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
|
|
||||||
|
|
||||||
### Possible communication interfaces
|
### Possible communication interfaces
|
||||||
|
|
||||||
@ -138,7 +136,7 @@ The following options can be use with the `--enable-comms=` option to target dif
|
|||||||
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
|
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
|
||||||
| `shmem ` | Cray SHMEM communications |
|
| `shmem ` | Cray SHMEM communications |
|
||||||
|
|
||||||
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
|
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names).
|
||||||
|
|
||||||
### Possible SIMD types
|
### Possible SIMD types
|
||||||
|
|
||||||
@ -167,7 +165,6 @@ Alternatively, some CPU codenames can be directly used:
|
|||||||
- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
|
- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
|
||||||
- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
|
- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
|
||||||
- BG/Q performances are currently rather poor. This is being investigated for future versions.
|
- BG/Q performances are currently rather poor. This is being investigated for future versions.
|
||||||
- The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.
|
|
||||||
|
|
||||||
### Build setup for Intel Knights Landing platform
|
### Build setup for Intel Knights Landing platform
|
||||||
|
|
||||||
|
@ -1,183 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./benchmarks/Benchmark_dwf.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
using namespace Grid::QCD;
|
|
||||||
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc,&argv);
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
|
||||||
|
|
||||||
std::vector<int> latt4 = GridDefaultLatt();
|
|
||||||
const int Ls=16;
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Making Vec5d innermost grids"<<std::endl;
|
|
||||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
|
||||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
std::cout << GridLogMessage << "Seeded"<<std::endl;
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "made random gauge fields"<<std::endl;
|
|
||||||
|
|
||||||
RealD mass=0.1;
|
|
||||||
RealD M5 =1.8;
|
|
||||||
RealD NP = UGrid->_Nprocessors;
|
|
||||||
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{
|
|
||||||
const int ncall=1000;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
|
||||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
|
||||||
|
|
||||||
GridParallelRNG RNG5(FGrid);
|
|
||||||
LatticeFermion src(FGrid); random(RNG5,src);
|
|
||||||
LatticeFermion result(FGrid);
|
|
||||||
|
|
||||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
|
||||||
double t0,t1;
|
|
||||||
|
|
||||||
LatticeFermion r_eo(FGrid);
|
|
||||||
LatticeFermion src_e (FrbGrid);
|
|
||||||
LatticeFermion src_o (FrbGrid);
|
|
||||||
LatticeFermion r_e (FrbGrid);
|
|
||||||
LatticeFermion r_o (FrbGrid);
|
|
||||||
|
|
||||||
pickCheckerboard(Even,src_e,src);
|
|
||||||
pickCheckerboard(Odd,src_o,src);
|
|
||||||
|
|
||||||
setCheckerboard(r_eo,src_o);
|
|
||||||
setCheckerboard(r_eo,src_e);
|
|
||||||
|
|
||||||
r_e = zero;
|
|
||||||
r_o = zero;
|
|
||||||
|
|
||||||
|
|
||||||
#define BENCH_DW(A,in,out) \
|
|
||||||
Dw.CayleyZeroCounters(); \
|
|
||||||
Dw. A (in,out); \
|
|
||||||
FGrid->Barrier(); \
|
|
||||||
t0=usecond(); \
|
|
||||||
for(int i=0;i<ncall;i++){ \
|
|
||||||
Dw. A (in,out); \
|
|
||||||
} \
|
|
||||||
t1=usecond(); \
|
|
||||||
FGrid->Barrier(); \
|
|
||||||
Dw.CayleyReport(); \
|
|
||||||
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
|
|
||||||
std::cout<<GridLogMessage << "******************"<<std::endl;
|
|
||||||
|
|
||||||
#define BENCH_DW_MEO(A,in,out) \
|
|
||||||
Dw.CayleyZeroCounters(); \
|
|
||||||
Dw. A (in,out,0); \
|
|
||||||
FGrid->Barrier(); \
|
|
||||||
t0=usecond(); \
|
|
||||||
for(int i=0;i<ncall;i++){ \
|
|
||||||
Dw. A (in,out,0); \
|
|
||||||
} \
|
|
||||||
t1=usecond(); \
|
|
||||||
FGrid->Barrier(); \
|
|
||||||
Dw.CayleyReport(); \
|
|
||||||
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
|
|
||||||
std::cout<<GridLogMessage << "******************"<<std::endl;
|
|
||||||
|
|
||||||
BENCH_DW_MEO(Dhop ,src,result);
|
|
||||||
BENCH_DW_MEO(DhopEO ,src_o,r_e);
|
|
||||||
BENCH_DW(Meooe ,src_o,r_e);
|
|
||||||
BENCH_DW(Mooee ,src_o,r_o);
|
|
||||||
BENCH_DW(MooeeInv,src_o,r_o);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if (1)
|
|
||||||
{
|
|
||||||
const int ncall=1000;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
|
||||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
|
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
|
||||||
|
|
||||||
GridParallelRNG RNG5(sFGrid);
|
|
||||||
LatticeFermion src(sFGrid); random(RNG5,src);
|
|
||||||
LatticeFermion sref(sFGrid);
|
|
||||||
LatticeFermion result(sFGrid);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
|
|
||||||
DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
|
|
||||||
FGrid->Barrier();
|
|
||||||
|
|
||||||
double t0,t1;
|
|
||||||
|
|
||||||
LatticeFermion r_eo(sFGrid);
|
|
||||||
LatticeFermion src_e (sFrbGrid);
|
|
||||||
LatticeFermion src_o (sFrbGrid);
|
|
||||||
LatticeFermion r_e (sFrbGrid);
|
|
||||||
LatticeFermion r_o (sFrbGrid);
|
|
||||||
|
|
||||||
pickCheckerboard(Even,src_e,src);
|
|
||||||
pickCheckerboard(Odd,src_o,src);
|
|
||||||
|
|
||||||
setCheckerboard(r_eo,src_o);
|
|
||||||
setCheckerboard(r_eo,src_e);
|
|
||||||
|
|
||||||
r_e = zero;
|
|
||||||
r_o = zero;
|
|
||||||
|
|
||||||
BENCH_DW_MEO(Dhop ,src,result);
|
|
||||||
BENCH_DW_MEO(DhopEO ,src_o,r_e);
|
|
||||||
BENCH_DW(Meooe ,src_o,r_e);
|
|
||||||
BENCH_DW(Mooee ,src_o,r_o);
|
|
||||||
BENCH_DW(MooeeInv,src_o,r_o);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,11 +1 @@
|
|||||||
include Make.inc
|
include Make.inc
|
||||||
|
|
||||||
simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o
|
|
||||||
|
|
||||||
EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a
|
|
||||||
|
|
||||||
libsimple_su3_test_a_SOURCES = simple_su3_test.cc
|
|
||||||
|
|
||||||
libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc
|
|
||||||
|
|
||||||
libsimple_simd_test_a_SOURCES = simple_simd_test.cc
|
|
||||||
|
@ -1,11 +0,0 @@
|
|||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
Grid::vRealD add(const Grid::vRealD &x, const Grid::vRealD &y)
|
|
||||||
{
|
|
||||||
return x+y;
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid::vRealD sub(const Grid::vRealD &x, const Grid::vRealD &y)
|
|
||||||
{
|
|
||||||
return x-y;
|
|
||||||
}
|
|
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid/Grid.h>
|
#include <Grid.h>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid/Grid.h>
|
#include <Grid.h>
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
46
configure.ac
46
configure.ac
@ -149,15 +149,9 @@ CXXFLAGS=$CXXFLAGS_CPY
|
|||||||
LDFLAGS=$LDFLAGS_CPY
|
LDFLAGS=$LDFLAGS_CPY
|
||||||
|
|
||||||
############### SIMD instruction selection
|
############### SIMD instruction selection
|
||||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
|
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=<code>],
|
||||||
[select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
|
[select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
|
||||||
|
|
||||||
AC_ARG_ENABLE([gen-simd-width],
|
|
||||||
[AS_HELP_STRING([--enable-gen-simd-width=size],
|
|
||||||
[size (in bytes) of the generic SIMD vectors (default: 32)])],
|
|
||||||
[ac_gen_simd_width=$enable_gen_simd_width],
|
|
||||||
[ac_gen_simd_width=32])
|
|
||||||
|
|
||||||
case ${ax_cv_cxx_compiler_vendor} in
|
case ${ax_cv_cxx_compiler_vendor} in
|
||||||
clang|gnu)
|
clang|gnu)
|
||||||
case ${ac_SIMD} in
|
case ${ac_SIMD} in
|
||||||
@ -186,10 +180,7 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
|
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
|
||||||
SIMD_FLAGS='-march=knl';;
|
SIMD_FLAGS='-march=knl';;
|
||||||
GEN)
|
GEN)
|
||||||
AC_DEFINE([GEN],[1],[generic vector code])
|
AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
|
||||||
AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
|
|
||||||
[generic SIMD vector width (in bytes)])
|
|
||||||
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
QPX|BGQ)
|
QPX|BGQ)
|
||||||
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
|
||||||
@ -206,8 +197,8 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
AC_DEFINE([AVX1],[1],[AVX intrinsics])
|
AC_DEFINE([AVX1],[1],[AVX intrinsics])
|
||||||
SIMD_FLAGS='-mavx -xavx';;
|
SIMD_FLAGS='-mavx -xavx';;
|
||||||
AVXFMA)
|
AVXFMA)
|
||||||
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
|
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
|
||||||
SIMD_FLAGS='-mavx -fma';;
|
SIMD_FLAGS='-mavx -mfma';;
|
||||||
AVX2)
|
AVX2)
|
||||||
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
|
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
|
||||||
SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
|
SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
|
||||||
@ -221,10 +212,7 @@ case ${ax_cv_cxx_compiler_vendor} in
|
|||||||
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
|
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
|
||||||
SIMD_FLAGS='-xmic-avx512';;
|
SIMD_FLAGS='-xmic-avx512';;
|
||||||
GEN)
|
GEN)
|
||||||
AC_DEFINE([GEN],[1],[generic vector code])
|
AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
|
||||||
AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
|
|
||||||
[generic SIMD vector width (in bytes)])
|
|
||||||
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
|
|
||||||
SIMD_FLAGS='';;
|
SIMD_FLAGS='';;
|
||||||
*)
|
*)
|
||||||
AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
|
AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
|
||||||
@ -290,7 +278,7 @@ esac
|
|||||||
case ${ac_COMMS} in
|
case ${ac_COMMS} in
|
||||||
*-auto)
|
*-auto)
|
||||||
LX_FIND_MPI
|
LX_FIND_MPI
|
||||||
if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi
|
if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
|
||||||
AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
|
AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
|
||||||
AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
|
AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
|
||||||
AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
|
AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
|
||||||
@ -354,17 +342,12 @@ esac
|
|||||||
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
|
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
|
||||||
|
|
||||||
############### Doxygen
|
############### Doxygen
|
||||||
DX_DOXYGEN_FEATURE([OFF])
|
AC_PROG_DOXYGEN
|
||||||
DX_DOT_FEATURE([OFF])
|
|
||||||
DX_HTML_FEATURE([ON])
|
if test -n "$DOXYGEN"
|
||||||
DX_CHM_FEATURE([OFF])
|
then
|
||||||
DX_CHI_FEATURE([OFF])
|
AC_CONFIG_FILES([docs/doxy.cfg])
|
||||||
DX_MAN_FEATURE([OFF])
|
fi
|
||||||
DX_RTF_FEATURE([OFF])
|
|
||||||
DX_XML_FEATURE([OFF])
|
|
||||||
DX_PDF_FEATURE([OFF])
|
|
||||||
DX_PS_FEATURE([OFF])
|
|
||||||
DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
|
|
||||||
|
|
||||||
############### Ouput
|
############### Ouput
|
||||||
cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
|
cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
|
||||||
@ -399,7 +382,7 @@ os (target) : $target_os
|
|||||||
compiler vendor : ${ax_cv_cxx_compiler_vendor}
|
compiler vendor : ${ax_cv_cxx_compiler_vendor}
|
||||||
compiler version : ${ax_cv_gxx_version}
|
compiler version : ${ax_cv_gxx_version}
|
||||||
----- BUILD OPTIONS -----------------------------------
|
----- BUILD OPTIONS -----------------------------------
|
||||||
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
|
SIMD : ${ac_SIMD}
|
||||||
Threading : ${ac_openmp}
|
Threading : ${ac_openmp}
|
||||||
Communications type : ${comms_type}
|
Communications type : ${comms_type}
|
||||||
Default precision : ${ac_PRECISION}
|
Default precision : ${ac_PRECISION}
|
||||||
@ -407,7 +390,8 @@ RNG choice : ${ac_RNG}
|
|||||||
GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
|
GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
|
||||||
LAPACK : ${ac_LAPACK}
|
LAPACK : ${ac_LAPACK}
|
||||||
FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
|
FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
|
||||||
build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
|
build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
|
||||||
|
graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
|
||||||
----- BUILD FLAGS -------------------------------------
|
----- BUILD FLAGS -------------------------------------
|
||||||
CXXFLAGS:
|
CXXFLAGS:
|
||||||
`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
|
`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
|
||||||
|
File diff suppressed because it is too large
Load Diff
2305
docs/doxy.cfg.in
Normal file
2305
docs/doxy.cfg.in
Normal file
File diff suppressed because it is too large
Load Diff
184
doxygen.inc
184
doxygen.inc
@ -1,184 +0,0 @@
|
|||||||
# Copyright (C) 2004 Oren Ben-Kiki
|
|
||||||
# This file is distributed under the same terms as the Automake macro files.
|
|
||||||
|
|
||||||
# Generate automatic documentation using Doxygen. Goals and variables values
|
|
||||||
# are controlled by the various DX_COND_??? conditionals set by autoconf.
|
|
||||||
#
|
|
||||||
# The provided goals are:
|
|
||||||
# doxygen-doc: Generate all doxygen documentation.
|
|
||||||
# doxygen-run: Run doxygen, which will generate some of the documentation
|
|
||||||
# (HTML, CHM, CHI, MAN, RTF, XML) but will not do the post
|
|
||||||
# processing required for the rest of it (PS, PDF, and some MAN).
|
|
||||||
# doxygen-man: Rename some doxygen generated man pages.
|
|
||||||
# doxygen-ps: Generate doxygen PostScript documentation.
|
|
||||||
# doxygen-pdf: Generate doxygen PDF documentation.
|
|
||||||
#
|
|
||||||
# Note that by default these are not integrated into the automake goals. If
|
|
||||||
# doxygen is used to generate man pages, you can achieve this integration by
|
|
||||||
# setting man3_MANS to the list of man pages generated and then adding the
|
|
||||||
# dependency:
|
|
||||||
#
|
|
||||||
# $(man3_MANS): doxygen-doc
|
|
||||||
#
|
|
||||||
# This will cause make to run doxygen and generate all the documentation.
|
|
||||||
#
|
|
||||||
# The following variable is intended for use in Makefile.am:
|
|
||||||
#
|
|
||||||
# DX_CLEANFILES = everything to clean.
|
|
||||||
#
|
|
||||||
# This is usually added to MOSTLYCLEANFILES.
|
|
||||||
|
|
||||||
## --------------------------------- ##
|
|
||||||
## Format-independent Doxygen rules. ##
|
|
||||||
## --------------------------------- ##
|
|
||||||
|
|
||||||
if DX_COND_doc
|
|
||||||
|
|
||||||
## ------------------------------- ##
|
|
||||||
## Rules specific for HTML output. ##
|
|
||||||
## ------------------------------- ##
|
|
||||||
|
|
||||||
if DX_COND_html
|
|
||||||
|
|
||||||
DX_CLEAN_HTML = @DX_DOCDIR@/html
|
|
||||||
|
|
||||||
endif DX_COND_html
|
|
||||||
|
|
||||||
## ------------------------------ ##
|
|
||||||
## Rules specific for CHM output. ##
|
|
||||||
## ------------------------------ ##
|
|
||||||
|
|
||||||
if DX_COND_chm
|
|
||||||
|
|
||||||
DX_CLEAN_CHM = @DX_DOCDIR@/chm
|
|
||||||
|
|
||||||
if DX_COND_chi
|
|
||||||
|
|
||||||
DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi
|
|
||||||
|
|
||||||
endif DX_COND_chi
|
|
||||||
|
|
||||||
endif DX_COND_chm
|
|
||||||
|
|
||||||
## ------------------------------ ##
|
|
||||||
## Rules specific for MAN output. ##
|
|
||||||
## ------------------------------ ##
|
|
||||||
|
|
||||||
if DX_COND_man
|
|
||||||
|
|
||||||
DX_CLEAN_MAN = @DX_DOCDIR@/man
|
|
||||||
|
|
||||||
endif DX_COND_man
|
|
||||||
|
|
||||||
## ------------------------------ ##
|
|
||||||
## Rules specific for RTF output. ##
|
|
||||||
## ------------------------------ ##
|
|
||||||
|
|
||||||
if DX_COND_rtf
|
|
||||||
|
|
||||||
DX_CLEAN_RTF = @DX_DOCDIR@/rtf
|
|
||||||
|
|
||||||
endif DX_COND_rtf
|
|
||||||
|
|
||||||
## ------------------------------ ##
|
|
||||||
## Rules specific for XML output. ##
|
|
||||||
## ------------------------------ ##
|
|
||||||
|
|
||||||
if DX_COND_xml
|
|
||||||
|
|
||||||
DX_CLEAN_XML = @DX_DOCDIR@/xml
|
|
||||||
|
|
||||||
endif DX_COND_xml
|
|
||||||
|
|
||||||
## ----------------------------- ##
|
|
||||||
## Rules specific for PS output. ##
|
|
||||||
## ----------------------------- ##
|
|
||||||
|
|
||||||
if DX_COND_ps
|
|
||||||
|
|
||||||
DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps
|
|
||||||
|
|
||||||
DX_PS_GOAL = doxygen-ps
|
|
||||||
|
|
||||||
doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps
|
|
||||||
|
|
||||||
@DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag
|
|
||||||
cd @DX_DOCDIR@/latex; \
|
|
||||||
rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
|
|
||||||
$(DX_LATEX) refman.tex; \
|
|
||||||
$(MAKEINDEX_PATH) refman.idx; \
|
|
||||||
$(DX_LATEX) refman.tex; \
|
|
||||||
countdown=5; \
|
|
||||||
while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
|
|
||||||
refman.log > /dev/null 2>&1 \
|
|
||||||
&& test $$countdown -gt 0; do \
|
|
||||||
$(DX_LATEX) refman.tex; \
|
|
||||||
countdown=`expr $$countdown - 1`; \
|
|
||||||
done; \
|
|
||||||
$(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi
|
|
||||||
|
|
||||||
endif DX_COND_ps
|
|
||||||
|
|
||||||
## ------------------------------ ##
|
|
||||||
## Rules specific for PDF output. ##
|
|
||||||
## ------------------------------ ##
|
|
||||||
|
|
||||||
if DX_COND_pdf
|
|
||||||
|
|
||||||
DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf
|
|
||||||
|
|
||||||
DX_PDF_GOAL = doxygen-pdf
|
|
||||||
|
|
||||||
doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf
|
|
||||||
|
|
||||||
@DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag
|
|
||||||
cd @DX_DOCDIR@/latex; \
|
|
||||||
rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
|
|
||||||
$(DX_PDFLATEX) refman.tex; \
|
|
||||||
$(DX_MAKEINDEX) refman.idx; \
|
|
||||||
$(DX_PDFLATEX) refman.tex; \
|
|
||||||
countdown=5; \
|
|
||||||
while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
|
|
||||||
refman.log > /dev/null 2>&1 \
|
|
||||||
&& test $$countdown -gt 0; do \
|
|
||||||
$(DX_PDFLATEX) refman.tex; \
|
|
||||||
countdown=`expr $$countdown - 1`; \
|
|
||||||
done; \
|
|
||||||
mv refman.pdf ../@PACKAGE@.pdf
|
|
||||||
|
|
||||||
endif DX_COND_pdf
|
|
||||||
|
|
||||||
## ------------------------------------------------- ##
|
|
||||||
## Rules specific for LaTeX (shared for PS and PDF). ##
|
|
||||||
## ------------------------------------------------- ##
|
|
||||||
|
|
||||||
if DX_COND_latex
|
|
||||||
|
|
||||||
DX_CLEAN_LATEX = @DX_DOCDIR@/latex
|
|
||||||
|
|
||||||
endif DX_COND_latex
|
|
||||||
|
|
||||||
.INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
|
|
||||||
|
|
||||||
doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag
|
|
||||||
|
|
||||||
doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
|
|
||||||
|
|
||||||
@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
|
|
||||||
rm -rf @DX_DOCDIR@
|
|
||||||
$(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
|
|
||||||
|
|
||||||
DX_CLEANFILES = \
|
|
||||||
@DX_DOCDIR@/@PACKAGE@.tag \
|
|
||||||
-r \
|
|
||||||
$(DX_CLEAN_HTML) \
|
|
||||||
$(DX_CLEAN_CHM) \
|
|
||||||
$(DX_CLEAN_CHI) \
|
|
||||||
$(DX_CLEAN_MAN) \
|
|
||||||
$(DX_CLEAN_RTF) \
|
|
||||||
$(DX_CLEAN_XML) \
|
|
||||||
$(DX_CLEAN_PS) \
|
|
||||||
$(DX_CLEAN_PDF) \
|
|
||||||
$(DX_CLEAN_LATEX)
|
|
||||||
|
|
||||||
endif DX_COND_doc
|
|
@ -1,49 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/Bitwise.cc
|
|
||||||
|
|
||||||
Copyright (C) 2016
|
|
||||||
|
|
||||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <iostream>
|
|
||||||
#include <Bitwise.h>
|
|
||||||
#include <bitset>
|
|
||||||
#include <climits>
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
void show_binaryrep(const unsigned char* a, size_t size) {
|
|
||||||
const unsigned char* beg = a;
|
|
||||||
const unsigned char* end = a + size;
|
|
||||||
unsigned int ctr = 0;
|
|
||||||
while (beg != end) {
|
|
||||||
std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
|
|
||||||
ctr++;
|
|
||||||
if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
|
|
||||||
}
|
|
||||||
std::cout << '\n';
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
@ -1,76 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/Bitwise.h
|
|
||||||
|
|
||||||
Copyright (C) 2016
|
|
||||||
|
|
||||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef GRID_BITWISE_H
|
|
||||||
#define GRID_BITWISE_H
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cfloat>
|
|
||||||
#include <bitset>
|
|
||||||
#include <climits>
|
|
||||||
#include <Config.h>
|
|
||||||
|
|
||||||
#ifdef GRID_DEFAULT_PRECISION_SINGLE
|
|
||||||
#define GRID_REAL_BYTES 4
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
|
|
||||||
#define GRID_REAL_BYTES 8
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
void show_binaryrep(const unsigned char* a, size_t size);
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
void show_binaryrep(const T& a) {
|
|
||||||
const char* beg = reinterpret_cast<const char*>(&a);
|
|
||||||
const char* end = beg + sizeof(a);
|
|
||||||
unsigned int ctr = 0;
|
|
||||||
while (beg != end) {
|
|
||||||
std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
|
|
||||||
ctr++;
|
|
||||||
if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
|
|
||||||
}
|
|
||||||
std::cout << '\n';
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
void bitwise_xor(T& l, T& r, unsigned char* xors) {
|
|
||||||
assert(sizeof(l) == sizeof(r));
|
|
||||||
unsigned char* org = reinterpret_cast<unsigned char*>(&l);
|
|
||||||
unsigned char* cur = reinterpret_cast<unsigned char*>(&r);
|
|
||||||
int words = sizeof(l) / sizeof(*org);
|
|
||||||
unsigned char result = 0;
|
|
||||||
for (int w = 0; w < words; w++) xors[w] = (org[w] ^ cur[w]);
|
|
||||||
}
|
|
||||||
|
|
||||||
}; // namespace
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
@ -244,11 +244,8 @@ namespace Grid {
|
|||||||
pokeLocalSite(s,pgbuf,cbuf);
|
pokeLocalSite(s,pgbuf,cbuf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (p != processors[dim] - 1)
|
|
||||||
{
|
|
||||||
result = Cshift(result,dim,L);
|
result = Cshift(result,dim,L);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Loop over orthog coords
|
// Loop over orthog coords
|
||||||
int NN=pencil_g.lSites();
|
int NN=pencil_g.lSites();
|
||||||
@ -290,10 +287,10 @@ namespace Grid {
|
|||||||
cgbuf = clbuf;
|
cgbuf = clbuf;
|
||||||
cgbuf[dim] = clbuf[dim]+L*pc;
|
cgbuf[dim] = clbuf[dim]+L*pc;
|
||||||
peekLocalSite(s,pgbuf,cgbuf);
|
peekLocalSite(s,pgbuf,cgbuf);
|
||||||
|
s = s * div;
|
||||||
pokeLocalSite(s,result,clbuf);
|
pokeLocalSite(s,result,clbuf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = result*div;
|
|
||||||
|
|
||||||
// destroying plan
|
// destroying plan
|
||||||
FFTW<scalar>::fftw_destroy_plan(p);
|
FFTW<scalar>::fftw_destroy_plan(p);
|
||||||
|
@ -62,7 +62,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/serialisation/Serialisation.h>
|
#include <Grid/serialisation/Serialisation.h>
|
||||||
#include "Config.h"
|
#include "Config.h"
|
||||||
#include <Grid/Timer.h>
|
#include <Grid/Timer.h>
|
||||||
#include <Grid/Bitwise.h>
|
|
||||||
#include <Grid/PerfCount.h>
|
#include <Grid/PerfCount.h>
|
||||||
#include <Grid/Log.h>
|
#include <Grid/Log.h>
|
||||||
#include <Grid/AlignedAllocator.h>
|
#include <Grid/AlignedAllocator.h>
|
||||||
|
@ -100,7 +100,7 @@ void Grid_quiesce_nodes(void) {
|
|||||||
me = shmem_my_pe();
|
me = shmem_my_pe();
|
||||||
#endif
|
#endif
|
||||||
if (me) {
|
if (me) {
|
||||||
std::cout.setstate(std::ios::badbit);// mute all nodes except 0
|
std::cout.setstate(std::ios::badbit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,21 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#ifdef GRID_NUMA
|
#ifdef GRID_NUMA
|
||||||
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
|
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
|
||||||
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
|
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
|
||||||
#else
|
#else
|
||||||
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
|
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
|
||||||
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
|
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
|
||||||
#endif
|
#endif
|
||||||
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
||||||
#define PARALLEL_REGION _Pragma("omp parallel")
|
#define PARALLEL_REGION _Pragma("omp parallel")
|
||||||
#define PARALLEL_FOR_LOOP_STATIC _Pragma("omp parallel for schedule(static)")
|
|
||||||
#else
|
#else
|
||||||
#define PARALLEL_FOR_LOOP
|
#define PARALLEL_FOR_LOOP
|
||||||
#define PARALLEL_FOR_LOOP_INTERN
|
#define PARALLEL_FOR_LOOP_INTERN
|
||||||
#define PARALLEL_NESTED_LOOP2
|
#define PARALLEL_NESTED_LOOP2
|
||||||
#define PARALLEL_REGION
|
#define PARALLEL_REGION
|
||||||
#define PARALLEL_FOR_LOOP_STATIC
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
@ -9,7 +9,6 @@ Copyright (C) 2015
|
|||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -34,21 +33,6 @@ directory
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
struct CG_state {
|
|
||||||
bool do_repro;
|
|
||||||
std::vector<RealD> residuals;
|
|
||||||
|
|
||||||
CG_state() {reset();}
|
|
||||||
|
|
||||||
void reset(){
|
|
||||||
do_repro = false;
|
|
||||||
residuals.clear();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
enum CGexec_mode{ Default, ReproducibilityTest };
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
// Base classes for iterative processes based on operators
|
// Base classes for iterative processes based on operators
|
||||||
// single input vec, single output vec.
|
// single input vec, single output vec.
|
||||||
@ -61,30 +45,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
// Defaults true.
|
// Defaults true.
|
||||||
RealD Tolerance;
|
RealD Tolerance;
|
||||||
Integer MaxIterations;
|
Integer MaxIterations;
|
||||||
|
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
|
||||||
// Reproducibility controls
|
: Tolerance(tol),
|
||||||
bool ReproTest;
|
MaxIterations(maxit),
|
||||||
CG_state CGState; //to check reproducibility by repeating the CG
|
ErrorOnNoConverge(err_on_no_conv){};
|
||||||
ReproducibilityState<typename Field::vector_object> ReprTest; // for the inner proucts
|
|
||||||
|
|
||||||
// Constructor
|
|
||||||
ConjugateGradient(RealD tol, Integer maxit, CGexec_mode Mode = Default)
|
|
||||||
: Tolerance(tol),MaxIterations(maxit){
|
|
||||||
switch(Mode)
|
|
||||||
{
|
|
||||||
case Default :
|
|
||||||
ErrorOnNoConverge = true;
|
|
||||||
ReproTest = false;
|
|
||||||
case ReproducibilityTest :
|
|
||||||
ErrorOnNoConverge = false;
|
|
||||||
ReproTest = true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void set_reproducibility_interval(unsigned int interval){
|
|
||||||
ReprTest.interval = interval;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
|
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
|
||||||
Field &psi) {
|
Field &psi) {
|
||||||
@ -96,37 +60,34 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
Field p(src);
|
Field p(src);
|
||||||
Field mmp(src);
|
Field mmp(src);
|
||||||
Field r(src);
|
Field r(src);
|
||||||
Field psi_start(psi);// save for the repro test
|
|
||||||
|
|
||||||
if (CGState.do_repro && ReproTest)
|
|
||||||
std::cout << GridLogMessage << "Starting reproducibility test, full check every "
|
|
||||||
<< ReprTest.interval << " calls" << std::endl;
|
|
||||||
|
|
||||||
if(!ReprTest.do_check)
|
|
||||||
ReprTest.reset();
|
|
||||||
ReprTest.enable_reprocheck=ReproTest;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Initial residual computation & set up
|
// Initial residual computation & set up
|
||||||
RealD guess = norm2(psi, ReprTest);
|
RealD guess = norm2(psi);
|
||||||
assert(std::isnan(guess) == 0);
|
assert(std::isnan(guess) == 0);
|
||||||
|
|
||||||
Linop.HermOpAndNorm(psi, mmp, d, b);// eventually split this for the norm check
|
|
||||||
|
Linop.HermOpAndNorm(psi, mmp, d, b);
|
||||||
|
|
||||||
|
|
||||||
r = src - mmp;
|
r = src - mmp;
|
||||||
p = r;
|
p = r;
|
||||||
|
|
||||||
a = norm2(p, ReprTest);
|
a = norm2(p);
|
||||||
cp = a;
|
cp = a;
|
||||||
ssq = norm2(src, ReprTest);
|
ssq = norm2(src);
|
||||||
|
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: guess " << guess << std::endl;
|
std::cout << GridLogIterative << std::setprecision(4)
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: src " << ssq << std::endl;
|
<< "ConjugateGradient: guess " << guess << std::endl;
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: mp " << d << std::endl;
|
std::cout << GridLogIterative << std::setprecision(4)
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: mmp " << b << std::endl;
|
<< "ConjugateGradient: src " << ssq << std::endl;
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: cp,r " << cp << std::endl;
|
std::cout << GridLogIterative << std::setprecision(4)
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: p " << a << std::endl;
|
<< "ConjugateGradient: mp " << d << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4)
|
||||||
|
<< "ConjugateGradient: mmp " << b << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4)
|
||||||
|
<< "ConjugateGradient: cp,r " << cp << std::endl;
|
||||||
|
std::cout << GridLogIterative << std::setprecision(4)
|
||||||
|
<< "ConjugateGradient: p " << a << std::endl;
|
||||||
|
|
||||||
RealD rsq = Tolerance * Tolerance * ssq;
|
RealD rsq = Tolerance * Tolerance * ssq;
|
||||||
|
|
||||||
@ -146,10 +107,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
SolverTimer.Start();
|
SolverTimer.Start();
|
||||||
int k;
|
int k;
|
||||||
for (k = 1; k <= MaxIterations; k++) {
|
for (k = 1; k <= MaxIterations; k++) {
|
||||||
c = cp;// old residual
|
c = cp;
|
||||||
|
|
||||||
MatrixTimer.Start();
|
MatrixTimer.Start();
|
||||||
Linop.HermOpAndNorm(p, mmp, d, qq);// mmp = Ap, d=pAp
|
Linop.HermOpAndNorm(p, mmp, d, qq);
|
||||||
MatrixTimer.Stop();
|
MatrixTimer.Stop();
|
||||||
|
|
||||||
LinalgTimer.Start();
|
LinalgTimer.Start();
|
||||||
@ -157,31 +118,14 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
// ComplexD dck = innerProduct(p,mmp);
|
// ComplexD dck = innerProduct(p,mmp);
|
||||||
|
|
||||||
a = c / d;
|
a = c / d;
|
||||||
b_pred = a * (a * qq - d) / c;// a check
|
b_pred = a * (a * qq - d) / c;
|
||||||
|
|
||||||
|
cp = axpy_norm(r, -a, mmp, r);
|
||||||
axpy(r, -a, mmp, r);// new residual r = r_old - a * Ap
|
|
||||||
cp = norm2(r, ReprTest); // bookkeeping this norm
|
|
||||||
if (ReproTest && !CGState.do_repro) {
|
|
||||||
CGState.residuals.push_back(cp); // save residuals state
|
|
||||||
std::cout << GridLogIterative << "ReproTest: Saving state" << std::endl;
|
|
||||||
}
|
|
||||||
if (ReproTest && CGState.do_repro){
|
|
||||||
// check that the residual agrees with the previous run
|
|
||||||
std::cout << GridLogIterative << "ReproTest: Checking state k=" << k << std::endl;
|
|
||||||
if (cp != CGState.residuals[k-1]){
|
|
||||||
std::cout << GridLogMessage << "Failing reproducibility test";
|
|
||||||
std::cout << GridLogMessage << " at k=" << k << std::endl;
|
|
||||||
std::cout << GridLogMessage << "saved residual = " << CGState.residuals[k-1]
|
|
||||||
<< " cp = " << cp << std::endl;
|
|
||||||
exit(1); // exit after the first failure
|
|
||||||
}
|
|
||||||
}
|
|
||||||
b = cp / c;
|
b = cp / c;
|
||||||
|
|
||||||
// Fuse these loops ; should be really easy
|
// Fuse these loops ; should be really easy
|
||||||
psi = a * p + psi; // update solution
|
psi = a * p + psi;
|
||||||
p = p * b + r; // update search direction
|
p = p * b + r;
|
||||||
|
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
||||||
@ -212,22 +156,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
|
|
||||||
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
||||||
|
|
||||||
if (! (CGState.do_repro && ReproTest)){
|
|
||||||
CGState.do_repro = true;
|
|
||||||
ReprTest.do_check = true;
|
|
||||||
ReprTest.reset_counter();
|
|
||||||
this->operator()(Linop, src, psi_start);// run the repro test
|
|
||||||
if (ReprTest.success)
|
|
||||||
std::cout << GridLogMessage << "Reproducibility test passed" << std::endl;
|
|
||||||
else{
|
|
||||||
std::cout << GridLogMessage << "Reproducibility test failed" << std::endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear state
|
|
||||||
CGState.reset();
|
|
||||||
ReprTest.reset();
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -59,7 +59,7 @@ public:
|
|||||||
|
|
||||||
int Nstop; // Number of evecs checked for convergence
|
int Nstop; // Number of evecs checked for convergence
|
||||||
int Nk; // Number of converged sought
|
int Nk; // Number of converged sought
|
||||||
int Np; // Np -- Number of spare vecs in krylov space
|
int Np; // Np -- Number of spare vecs in kryloc space
|
||||||
int Nm; // Nm -- total number of vectors
|
int Nm; // Nm -- total number of vectors
|
||||||
|
|
||||||
RealD eresid;
|
RealD eresid;
|
||||||
@ -1080,10 +1080,10 @@ say con = 2
|
|||||||
**/
|
**/
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
static void Lock(DenseMatrix<T> &H, // Hess mtx
|
static void Lock(DenseMatrix<T> &H, ///Hess mtx
|
||||||
DenseMatrix<T> &Q, // Lock Transform
|
DenseMatrix<T> &Q, ///Lock Transform
|
||||||
T val, // value to be locked
|
T val, ///value to be locked
|
||||||
int con, // number already locked
|
int con, ///number already locked
|
||||||
RealD small,
|
RealD small,
|
||||||
int dfg,
|
int dfg,
|
||||||
bool herm)
|
bool herm)
|
||||||
|
@ -65,7 +65,6 @@ const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return
|
|||||||
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
|
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
|
||||||
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
|
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -68,8 +68,6 @@ class CartesianCommunicator {
|
|||||||
static MPI_Comm communicator_world;
|
static MPI_Comm communicator_world;
|
||||||
MPI_Comm communicator;
|
MPI_Comm communicator;
|
||||||
typedef MPI_Request CommsRequest_t;
|
typedef MPI_Request CommsRequest_t;
|
||||||
static char name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
|
|
||||||
static int length;
|
|
||||||
#else
|
#else
|
||||||
typedef int CommsRequest_t;
|
typedef int CommsRequest_t;
|
||||||
#endif
|
#endif
|
||||||
@ -151,7 +149,6 @@ class CartesianCommunicator {
|
|||||||
const std::vector<int> & ProcessorGrid(void) ;
|
const std::vector<int> & ProcessorGrid(void) ;
|
||||||
int ProcessorCount(void) ;
|
int ProcessorCount(void) ;
|
||||||
|
|
||||||
void PrintRankInfo(void) ;
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -35,8 +35,6 @@ namespace Grid {
|
|||||||
// Info that is setup once and indept of cartesian layout
|
// Info that is setup once and indept of cartesian layout
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
MPI_Comm CartesianCommunicator::communicator_world;
|
MPI_Comm CartesianCommunicator::communicator_world;
|
||||||
char CartesianCommunicator::name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
|
|
||||||
int CartesianCommunicator::length;
|
|
||||||
|
|
||||||
// Should error check all MPI calls.
|
// Should error check all MPI calls.
|
||||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
@ -46,8 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
|||||||
MPI_Init(argc,argv);
|
MPI_Init(argc,argv);
|
||||||
}
|
}
|
||||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||||
|
|
||||||
MPI_Get_processor_name(name, &length);
|
|
||||||
ShmInitGeneric();
|
ShmInitGeneric();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -210,10 +206,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
|||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::PrintRankInfo(){
|
|
||||||
std::cout << "Grid: Rank "<< _processor << " - Physical node name: " << name << std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}// end of namespace
|
|
||||||
|
|
||||||
|
@ -576,10 +576,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
|||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::PrintRankInfo(){
|
|
||||||
std::cout << "Grid: Rank "<< _processor << " - Physical node name: " << name << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -869,9 +869,6 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::PrintRankInfo(){
|
|
||||||
std::cout << "Grid: Rank "<< _processor << " - Physical node name: " << name << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -37,11 +37,6 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
|
|||||||
ShmInitGeneric();
|
ShmInitGeneric();
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::PrintRankInfo(){
|
|
||||||
std::cout << GridLogMessage << "No Rank Info available" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
{
|
{
|
||||||
_processors = processors;
|
_processors = processors;
|
||||||
|
@ -333,9 +333,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::PrintRankInfo(){
|
|
||||||
std::cout << GridLogMessage << "SHMEM: Rank Info not implemented yet" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -31,87 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define GRID_LATTICE_REDUCTION_H
|
#define GRID_LATTICE_REDUCTION_H
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
template <class T>
|
|
||||||
class ReproducibilityState {
|
|
||||||
public:
|
|
||||||
typedef typename T::vector_type vector_type;
|
|
||||||
typedef std::vector<vector_type, alignedAllocator<vector_type> > sum_type;
|
|
||||||
unsigned int n_call;
|
|
||||||
bool do_check;
|
|
||||||
bool enable_reprocheck;
|
|
||||||
bool success;
|
|
||||||
unsigned int interval;
|
|
||||||
std::vector<sum_type> th_states;
|
|
||||||
|
|
||||||
void reset_counter() { n_call = 0; }
|
|
||||||
void reset() {
|
|
||||||
th_states.clear();
|
|
||||||
do_check = false;
|
|
||||||
enable_reprocheck = false;
|
|
||||||
success = true;
|
|
||||||
n_call = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
ReproducibilityState():interval(1) {
|
|
||||||
reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
void check(GridBase* grid, sum_type &sumarray){
|
|
||||||
/////////////////////// Reproducibility section, not threaded on purpouse
|
|
||||||
if (enable_reprocheck) {
|
|
||||||
if (do_check && (n_call % interval) == 0) {
|
|
||||||
for (int thread = 0; thread < sumarray.size(); thread++) {
|
|
||||||
int words = sizeof(sumarray[thread])/sizeof(unsigned char);
|
|
||||||
unsigned char xors[words];
|
|
||||||
bitwise_xor(sumarray[thread], th_states[n_call][thread],xors);
|
|
||||||
// OR all words
|
|
||||||
unsigned char res = 0;
|
|
||||||
for (int w = 0; w < words; w++) res = res | xors[w];
|
|
||||||
|
|
||||||
Grid_unquiesce_nodes();
|
|
||||||
int rank = 0;
|
|
||||||
while (rank < grid->_Nprocessors){
|
|
||||||
if (rank == grid->ThisRank() ){
|
|
||||||
if ( res ) {
|
|
||||||
std::cout << "Reproducibility failure report" << std::endl;
|
|
||||||
grid->PrintRankInfo();
|
|
||||||
std::cout << "Call: "<< n_call << " Thread: " << thread << std::endl;
|
|
||||||
std::cout << "Size of states: " << th_states.size() << std::endl;
|
|
||||||
std::cout << std::setprecision(GRID_REAL_DIGITS+1) << std::scientific;
|
|
||||||
std::cout << "Saved partial sum : " << th_states[n_call][thread] << std::endl;
|
|
||||||
std::cout << "Current partial sum: " << sumarray[thread] << std::endl;
|
|
||||||
std::cout << "Saved state " << std::endl; show_binaryrep(th_states[n_call][thread]);
|
|
||||||
std::cout << "Current state" << std::endl; show_binaryrep(sumarray[thread]);
|
|
||||||
std::cout << "XOR result" << std::endl; show_binaryrep(xors, words);
|
|
||||||
//std::cout << std::defaultfloat; //not supported by some compilers
|
|
||||||
std::cout << std::setprecision(6);
|
|
||||||
success = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rank++;
|
|
||||||
grid->Barrier();
|
|
||||||
}
|
|
||||||
Grid_quiesce_nodes();
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if (!do_check)
|
|
||||||
{
|
|
||||||
std::cout << GridLogDebug << "Saving thread state for inner product. Call n. "
|
|
||||||
<< n_call << std::endl;
|
|
||||||
th_states.resize(n_call+1);
|
|
||||||
th_states[n_call].resize(grid->SumArraySize());
|
|
||||||
th_states[n_call] = sumarray; // save threads state
|
|
||||||
//n_call++;
|
|
||||||
}
|
|
||||||
n_call++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef GRID_WARN_SUBOPTIMAL
|
#ifdef GRID_WARN_SUBOPTIMAL
|
||||||
#warning "Optimisation alert all these reduction loops are NOT threaded "
|
#warning "Optimisation alert all these reduction loops are NOT threaded "
|
||||||
#endif
|
#endif
|
||||||
@ -120,25 +39,12 @@ class ReproducibilityState {
|
|||||||
// Deterministic Reduction operations
|
// Deterministic Reduction operations
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
||||||
ReproducibilityState<vobj> repr;
|
ComplexD nrm = innerProduct(arg,arg);
|
||||||
ComplexD nrm = innerProduct(arg, arg, repr);
|
|
||||||
return std::real(nrm);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class vobj>
|
|
||||||
inline RealD norm2(const Lattice<vobj> &arg, ReproducibilityState<vobj>& rep) {
|
|
||||||
ComplexD nrm = innerProduct(arg, arg, rep);
|
|
||||||
return std::real(nrm);
|
return std::real(nrm);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right){
|
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
|
||||||
ReproducibilityState<vobj> repr;
|
|
||||||
return innerProduct(left, right, repr);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right, ReproducibilityState<vobj>& repr)
|
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
@ -151,22 +57,18 @@ class ReproducibilityState {
|
|||||||
sumarray[i]=zero;
|
sumarray[i]=zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP_STATIC //request statically scheduled threads for reproducibility
|
PARALLEL_FOR_LOOP
|
||||||
for(int thr=0;thr<grid->SumArraySize();thr++){
|
for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||||
int nwork, mywork, myoff;
|
int nwork, mywork, myoff;
|
||||||
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
|
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
|
||||||
|
|
||||||
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm = zero; // private to thread; sub summation
|
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
|
||||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
|
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
|
||||||
}
|
}
|
||||||
sumarray[thr]=TensorRemove(vnrm) ;
|
sumarray[thr]=TensorRemove(vnrm) ;
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////// Reproducibility
|
|
||||||
repr.check(grid, sumarray);
|
|
||||||
///////////////////////////
|
|
||||||
|
|
||||||
vector_type vvnrm; vvnrm=zero; // sum across threads
|
vector_type vvnrm; vvnrm=zero; // sum across threads
|
||||||
for(int i=0;i<grid->SumArraySize();i++){
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
vvnrm = vvnrm+sumarray[i];
|
vvnrm = vvnrm+sumarray[i];
|
||||||
@ -212,7 +114,7 @@ class ReproducibilityState {
|
|||||||
sumarray[i]=zero;
|
sumarray[i]=zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int thr=0;thr<grid->SumArraySize();thr++){
|
for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||||
int nwork, mywork, myoff;
|
int nwork, mywork, myoff;
|
||||||
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
|
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
|
||||||
@ -244,7 +146,7 @@ class ReproducibilityState {
|
|||||||
|
|
||||||
|
|
||||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
GridBase *grid = Data._grid;
|
GridBase *grid = Data._grid;
|
||||||
assert(grid!=NULL);
|
assert(grid!=NULL);
|
||||||
|
@ -195,7 +195,6 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
|
|||||||
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
|
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
|
||||||
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
|
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
|
||||||
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
|
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
|
||||||
|
|
||||||
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
|
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
|
||||||
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
|
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
|
||||||
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
|
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
|
||||||
@ -204,20 +203,6 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
|
|||||||
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
|
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
|
||||||
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
|
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
|
||||||
|
|
||||||
// Ls vectorised
|
|
||||||
typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
|
|
||||||
typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
|
|
||||||
typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
|
|
||||||
|
|
||||||
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
|
|
||||||
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
|
|
||||||
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
|
|
||||||
|
|
||||||
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
|
|
||||||
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
|
|
||||||
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
|
|
||||||
|
|
||||||
|
|
||||||
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
|
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
|
||||||
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
|
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
|
||||||
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
|
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
|
||||||
@ -269,7 +254,6 @@ typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
|
|||||||
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
|
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}}
|
}}
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
|
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
|
||||||
|
@ -62,50 +62,6 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
|
|||||||
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
|
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
|
||||||
{
|
|
||||||
this->Report();
|
|
||||||
std::vector<int> latt = GridDefaultLatt();
|
|
||||||
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
|
||||||
RealD NP = this->_FourDimGrid->_Nprocessors;
|
|
||||||
if ( M5Dcalls > 0 ) {
|
|
||||||
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
|
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
|
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
|
||||||
RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( MooeeInvCalls > 0 ) {
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
|
||||||
|
|
||||||
// Flops = 9*12*Ls*vol/2
|
|
||||||
RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
|
|
||||||
{
|
|
||||||
this->ZeroCounters();
|
|
||||||
M5Dflops=0;
|
|
||||||
M5Dcalls=0;
|
|
||||||
M5Dtime=0;
|
|
||||||
MooeeInvFlops=0;
|
|
||||||
MooeeInvCalls=0;
|
|
||||||
MooeeInvTime=0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
|
@ -121,18 +121,6 @@ namespace Grid {
|
|||||||
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
|
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void CayleyReport(void);
|
|
||||||
void CayleyZeroCounters(void);
|
|
||||||
|
|
||||||
double M5Dflops;
|
|
||||||
double M5Dcalls;
|
|
||||||
double M5Dtime;
|
|
||||||
|
|
||||||
double MooeeInvFlops;
|
|
||||||
double MooeeInvCalls;
|
|
||||||
double MooeeInvTime;
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
|
@ -51,9 +51,6 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
GridBase *grid=psi._grid;
|
GridBase *grid=psi._grid;
|
||||||
assert(phi.checkerboard == psi.checkerboard);
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
|
||||||
M5Dcalls++;
|
|
||||||
M5Dtime-=usecond();
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
@ -79,7 +76,6 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
M5Dtime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -95,9 +91,6 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
assert(phi.checkerboard == psi.checkerboard);
|
assert(phi.checkerboard == psi.checkerboard);
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
|
||||||
M5Dcalls++;
|
|
||||||
M5Dtime-=usecond();
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
auto tmp = psi._odata[0];
|
auto tmp = psi._odata[0];
|
||||||
@ -123,7 +116,6 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
M5Dtime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -134,14 +126,10 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
|
|||||||
|
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
|
||||||
MooeeInvCalls++;
|
|
||||||
MooeeInvTime-=usecond();
|
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
auto tmp = psi._odata[0];
|
auto tmp = psi._odata[0];
|
||||||
|
|
||||||
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
|
||||||
// Apply (L^{\prime})^{-1}
|
// Apply (L^{\prime})^{-1}
|
||||||
chi[ss]=psi[ss]; // chi[0]=psi[0]
|
chi[ss]=psi[ss]; // chi[0]=psi[0]
|
||||||
for(int s=1;s<Ls;s++){
|
for(int s=1;s<Ls;s++){
|
||||||
@ -167,9 +155,6 @@ PARALLEL_FOR_LOOP
|
|||||||
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MooeeInvTime+=usecond();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -181,8 +166,6 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
|
|||||||
assert(psi.checkerboard == psi.checkerboard);
|
assert(psi.checkerboard == psi.checkerboard);
|
||||||
chi.checkerboard=psi.checkerboard;
|
chi.checkerboard=psi.checkerboard;
|
||||||
|
|
||||||
MooeeInvCalls++;
|
|
||||||
MooeeInvTime-=usecond();
|
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
|
||||||
@ -214,9 +197,6 @@ PARALLEL_FOR_LOOP
|
|||||||
chi[ss+s] = chi[ss+s] - lee[s]*tmp;
|
chi[ss+s] = chi[ss+s] - lee[s]*tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MooeeInvTime+=usecond();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CAYLEY_DPERP_CACHE
|
#ifdef CAYLEY_DPERP_CACHE
|
||||||
|
@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
GridBase *grid=psi._grid;
|
GridBase *grid=psi._grid;
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
const int nsimd= Simd::Nsimd();
|
int nsimd= Simd::Nsimd();
|
||||||
|
|
||||||
Vector<iSinglet<Simd> > u(LLs);
|
Vector<iSinglet<Simd> > u(LLs);
|
||||||
Vector<iSinglet<Simd> > l(LLs);
|
Vector<iSinglet<Simd> > l(LLs);
|
||||||
@ -86,15 +86,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
|
|||||||
d_p[ss] = diag[s];
|
d_p[ss] = diag[s];
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
|
||||||
M5Dcalls++;
|
|
||||||
M5Dtime-=usecond();
|
|
||||||
|
|
||||||
assert(Nc==3);
|
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||||
#if 0
|
|
||||||
alignas(64) SiteHalfSpinor hp;
|
alignas(64) SiteHalfSpinor hp;
|
||||||
alignas(64) SiteHalfSpinor hm;
|
alignas(64) SiteHalfSpinor hm;
|
||||||
alignas(64) SiteSpinor fp;
|
alignas(64) SiteSpinor fp;
|
||||||
@ -111,113 +105,16 @@ PARALLEL_FOR_LOOP
|
|||||||
if ( vp<=v ) rotate(hp,hp,1);
|
if ( vp<=v ) rotate(hp,hp,1);
|
||||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||||
|
|
||||||
hp=0.5*hp;
|
hp=hp*0.5;
|
||||||
hm=0.5*hm;
|
hm=hm*0.5;
|
||||||
|
|
||||||
spRecon5m(fp,hp);
|
spRecon5m(fp,hp);
|
||||||
spRecon5p(fm,hm);
|
spRecon5p(fm,hm);
|
||||||
|
|
||||||
chi[ss+v] = d[v]*phi[ss+v];
|
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||||
chi[ss+v] = chi[ss+v] +u[v]*fp;
|
|
||||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
for(int v=0;v<LLs;v++){
|
|
||||||
|
|
||||||
vprefetch(psi[ss+v+LLs]);
|
|
||||||
// vprefetch(phi[ss+v+LLs]);
|
|
||||||
|
|
||||||
int vp= (v==LLs-1) ? 0 : v+1;
|
|
||||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
|
||||||
|
|
||||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
|
||||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
|
||||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
|
||||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
|
||||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
|
||||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
|
||||||
|
|
||||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
|
||||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
|
||||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
|
||||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
|
||||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
|
||||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
|
||||||
|
|
||||||
// if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
|
|
||||||
// if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
|
|
||||||
|
|
||||||
if ( vp<=v ) {
|
|
||||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
|
||||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
|
||||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
|
||||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
|
||||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
|
||||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
|
||||||
}
|
}
|
||||||
if ( vm>=v ) {
|
|
||||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
|
||||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
|
||||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
|
||||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
|
||||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
|
||||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
|
|
||||||
if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
|
|
||||||
if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
|
|
||||||
if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
|
|
||||||
*/
|
|
||||||
Simd p_00 = d[v]()()() * phi[ss+v]()(0)(0) + l[v]()()()*hm_00;
|
|
||||||
Simd p_01 = d[v]()()() * phi[ss+v]()(0)(1) + l[v]()()()*hm_01;
|
|
||||||
Simd p_02 = d[v]()()() * phi[ss+v]()(0)(2) + l[v]()()()*hm_02;
|
|
||||||
Simd p_10 = d[v]()()() * phi[ss+v]()(1)(0) + l[v]()()()*hm_10;
|
|
||||||
Simd p_11 = d[v]()()() * phi[ss+v]()(1)(1) + l[v]()()()*hm_11;
|
|
||||||
Simd p_12 = d[v]()()() * phi[ss+v]()(1)(2) + l[v]()()()*hm_12;
|
|
||||||
Simd p_20 = d[v]()()() * phi[ss+v]()(2)(0) + u[v]()()()*hp_00;
|
|
||||||
Simd p_21 = d[v]()()() * phi[ss+v]()(2)(1) + u[v]()()()*hp_01;
|
|
||||||
Simd p_22 = d[v]()()() * phi[ss+v]()(2)(2) + u[v]()()()*hp_02;
|
|
||||||
Simd p_30 = d[v]()()() * phi[ss+v]()(3)(0) + u[v]()()()*hp_10;
|
|
||||||
Simd p_31 = d[v]()()() * phi[ss+v]()(3)(1) + u[v]()()()*hp_11;
|
|
||||||
Simd p_32 = d[v]()()() * phi[ss+v]()(3)(2) + u[v]()()()*hp_12;
|
|
||||||
|
|
||||||
|
|
||||||
// if ( ss==0){
|
|
||||||
/*
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
|
|
||||||
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
vstream(chi[ss+v]()(0)(0),p_00);
|
|
||||||
vstream(chi[ss+v]()(0)(1),p_01);
|
|
||||||
vstream(chi[ss+v]()(0)(2),p_02);
|
|
||||||
vstream(chi[ss+v]()(1)(0),p_10);
|
|
||||||
vstream(chi[ss+v]()(1)(1),p_11);
|
|
||||||
vstream(chi[ss+v]()(1)(2),p_12);
|
|
||||||
vstream(chi[ss+v]()(2)(0),p_20);
|
|
||||||
vstream(chi[ss+v]()(2)(1),p_21);
|
|
||||||
vstream(chi[ss+v]()(2)(2),p_22);
|
|
||||||
vstream(chi[ss+v]()(3)(0),p_30);
|
|
||||||
vstream(chi[ss+v]()(3)(1),p_31);
|
|
||||||
vstream(chi[ss+v]()(3)(2),p_32);
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
M5Dtime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -257,8 +154,6 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
|
|||||||
d_p[ss] = diag[s];
|
d_p[ss] = diag[s];
|
||||||
}}
|
}}
|
||||||
|
|
||||||
M5Dcalls++;
|
|
||||||
M5Dtime-=usecond();
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
|
||||||
|
|
||||||
@ -288,8 +183,8 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
M5Dtime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
||||||
{
|
{
|
||||||
@ -355,11 +250,13 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MooeeInvCalls++;
|
|
||||||
MooeeInvTime-=usecond();
|
|
||||||
// Dynamic allocate on stack to get per thread without serialised heap acces
|
// Dynamic allocate on stack to get per thread without serialised heap acces
|
||||||
#pragma omp parallel
|
PARALLEL_FOR_LOOP
|
||||||
{
|
for(auto site=0;site<vol;site++){
|
||||||
|
|
||||||
|
// SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
|
||||||
|
// SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
|
||||||
|
// SiteSpinor *SiteChi =(SiteSpinor *) alloca(LLs*sizeof(SiteSpinor));
|
||||||
|
|
||||||
Vector<SiteHalfSpinor> SitePplus(LLs);
|
Vector<SiteHalfSpinor> SitePplus(LLs);
|
||||||
Vector<SiteHalfSpinor> SitePminus(LLs);
|
Vector<SiteHalfSpinor> SitePminus(LLs);
|
||||||
@ -370,9 +267,6 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
SiteHalfSpinor BcastM;
|
SiteHalfSpinor BcastM;
|
||||||
|
|
||||||
#pragma omp for
|
|
||||||
for(auto site=0;site<vol;site++){
|
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
for(int s=0;s<LLs;s++){
|
||||||
int lex = s+LLs*site;
|
int lex = s+LLs*site;
|
||||||
spProj5p(SitePplus[s] ,psi[lex]);
|
spProj5p(SitePplus[s] ,psi[lex]);
|
||||||
@ -400,8 +294,6 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
|
|||||||
chi[lex] = SiteChi[s]*0.5;
|
chi[lex] = SiteChi[s]*0.5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
MooeeInvTime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
INSTANTIATE_DPERP(DomainWallVec5dImplD);
|
INSTANTIATE_DPERP(DomainWallVec5dImplD);
|
||||||
|
@ -194,11 +194,6 @@ void WilsonFermion5D<Impl>::Report(void)
|
|||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
|
||||||
RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( DerivCalls > 0 ) {
|
if ( DerivCalls > 0 ) {
|
||||||
@ -214,15 +209,12 @@ void WilsonFermion5D<Impl>::Report(void)
|
|||||||
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
|
||||||
|
}
|
||||||
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; }
|
|
||||||
|
|
||||||
if (DerivCalls > 0 || DhopCalls > 0){
|
if (DerivCalls > 0 || DhopCalls > 0){
|
||||||
std::cout << GridLogMessage << "WilsonFermion5D Stencil" <<std::endl; Stencil.Report();
|
std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report();
|
||||||
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||||
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,9 +66,6 @@ namespace Optimization {
|
|||||||
double f[4];
|
double f[4];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
struct Vsplat{
|
struct Vsplat{
|
||||||
//Complex float
|
//Complex float
|
||||||
inline __m256 operator()(float a, float b){
|
inline __m256 operator()(float a, float b){
|
||||||
@ -170,7 +167,7 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
//Integer
|
//Integer
|
||||||
inline __m256i operator()(__m256i a, __m256i b){
|
inline __m256i operator()(__m256i a, __m256i b){
|
||||||
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
|
#if defined (AVX1) || defined (AVXFMA4)
|
||||||
__m128i a0,a1;
|
__m128i a0,a1;
|
||||||
__m128i b0,b1;
|
__m128i b0,b1;
|
||||||
a0 = _mm256_extractf128_si256(a,0);
|
a0 = _mm256_extractf128_si256(a,0);
|
||||||
@ -198,7 +195,7 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
//Integer
|
//Integer
|
||||||
inline __m256i operator()(__m256i a, __m256i b){
|
inline __m256i operator()(__m256i a, __m256i b){
|
||||||
#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
|
#if defined (AVX1) || defined (AVXFMA4)
|
||||||
__m128i a0,a1;
|
__m128i a0,a1;
|
||||||
__m128i b0,b1;
|
__m128i b0,b1;
|
||||||
a0 = _mm256_extractf128_si256(a,0);
|
a0 = _mm256_extractf128_si256(a,0);
|
||||||
@ -236,7 +233,7 @@ namespace Optimization {
|
|||||||
a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||||
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX2) || defined (AVXFMA)
|
#if defined (AVX2)
|
||||||
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
|
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
|
||||||
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
|
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
|
||||||
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||||
@ -282,7 +279,7 @@ namespace Optimization {
|
|||||||
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||||
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX2) || defined (AVXFMA)
|
#if defined (AVX2)
|
||||||
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
|
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
|
||||||
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
|
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
|
||||||
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
|
||||||
@ -323,7 +320,7 @@ namespace Optimization {
|
|||||||
#if defined (AVXFMA4)
|
#if defined (AVXFMA4)
|
||||||
a= _mm256_macc_ps(b,c,a);
|
a= _mm256_macc_ps(b,c,a);
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX2) || defined (AVXFMA)
|
#if defined (AVX2)
|
||||||
a= _mm256_fmadd_ps( b, c, a);
|
a= _mm256_fmadd_ps( b, c, a);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -335,7 +332,7 @@ namespace Optimization {
|
|||||||
#if defined (AVXFMA4)
|
#if defined (AVXFMA4)
|
||||||
a= _mm256_macc_pd(b,c,a);
|
a= _mm256_macc_pd(b,c,a);
|
||||||
#endif
|
#endif
|
||||||
#if defined (AVX2) || defined (AVXFMA)
|
#if defined (AVX2)
|
||||||
a= _mm256_fmadd_pd( b, c, a);
|
a= _mm256_fmadd_pd( b, c, a);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -350,7 +347,7 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
// Integer
|
// Integer
|
||||||
inline __m256i operator()(__m256i a, __m256i b){
|
inline __m256i operator()(__m256i a, __m256i b){
|
||||||
#if defined (AVX1) || defined (AVXFMA)
|
#if defined (AVX1)
|
||||||
__m128i a0,a1;
|
__m128i a0,a1;
|
||||||
__m128i b0,b1;
|
__m128i b0,b1;
|
||||||
a0 = _mm256_extractf128_si256(a,0);
|
a0 = _mm256_extractf128_si256(a,0);
|
||||||
|
@ -27,6 +27,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
//----------------------------------------------------------------------
|
||||||
|
/*! @file Grid_knc.h
|
||||||
|
@brief Optimization libraries for AVX512 instructions set for KNC
|
||||||
|
|
||||||
|
Using intrinsics
|
||||||
|
*/
|
||||||
|
// Time-stamp: <2015-06-09 14:27:28 neo>
|
||||||
|
//----------------------------------------------------------------------
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
|
||||||
@ -86,13 +95,13 @@ namespace Optimization {
|
|||||||
struct Vstream{
|
struct Vstream{
|
||||||
//Float
|
//Float
|
||||||
inline void operator()(float * a, __m512 b){
|
inline void operator()(float * a, __m512 b){
|
||||||
_mm512_stream_ps(a,b);
|
//_mm512_stream_ps(a,b);
|
||||||
// _mm512_store_ps(a,b);
|
_mm512_store_ps(a,b);
|
||||||
}
|
}
|
||||||
//Double
|
//Double
|
||||||
inline void operator()(double * a, __m512d b){
|
inline void operator()(double * a, __m512d b){
|
||||||
_mm512_stream_pd(a,b);
|
//_mm512_stream_pd(a,b);
|
||||||
// _mm512_store_pd(a,b);
|
_mm512_store_pd(a,b);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -6,7 +6,8 @@
|
|||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Antonin Portelli <antonin.portelli@me.com>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: neo <cossu@post.kek.jp>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -26,352 +27,133 @@ Author: Antonin Portelli <antonin.portelli@me.com>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
|
|
||||||
|
|
||||||
//#define VECTOR_LOOPS
|
|
||||||
|
|
||||||
// playing with compiler pragmas
|
|
||||||
#ifdef VECTOR_LOOPS
|
|
||||||
#ifdef __clang__
|
|
||||||
#define VECTOR_FOR(i, w, inc)\
|
|
||||||
_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
|
|
||||||
for (unsigned int i = 0; i < w; i += inc)
|
|
||||||
#elif defined __INTEL_COMPILER
|
|
||||||
#define VECTOR_FOR(i, w, inc)\
|
|
||||||
_Pragma("simd vectorlength(w*8)")\
|
|
||||||
for (unsigned int i = 0; i < w; i += inc)
|
|
||||||
#else
|
|
||||||
#define VECTOR_FOR(i, w, inc)\
|
|
||||||
for (unsigned int i = 0; i < w; i += inc)
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#define VECTOR_FOR(i, w, inc)\
|
|
||||||
for (unsigned int i = 0; i < w; i += inc)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace Optimization {
|
namespace Optimization {
|
||||||
|
|
||||||
// type traits giving the number of elements for each vector type
|
template<class vtype>
|
||||||
template <typename T> struct W;
|
union uconv {
|
||||||
template <> struct W<double> {
|
float f;
|
||||||
constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
|
vtype v;
|
||||||
constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
|
|
||||||
};
|
|
||||||
template <> struct W<float> {
|
|
||||||
constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
|
|
||||||
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// SIMD vector types
|
union u128f {
|
||||||
template <typename T>
|
float v;
|
||||||
struct vec {
|
float f[4];
|
||||||
alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
|
};
|
||||||
|
union u128d {
|
||||||
|
double v;
|
||||||
|
double f[2];
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef vec<float> vecf;
|
|
||||||
typedef vec<double> vecd;
|
|
||||||
|
|
||||||
struct Vsplat{
|
struct Vsplat{
|
||||||
// Complex
|
//Complex float
|
||||||
template <typename T>
|
inline u128f operator()(float a, float b){
|
||||||
inline vec<T> operator()(T a, T b){
|
u128f out;
|
||||||
vec<T> out;
|
out.f[0] = a;
|
||||||
|
out.f[1] = b;
|
||||||
VECTOR_FOR(i, W<T>::r, 2)
|
out.f[2] = a;
|
||||||
{
|
out.f[3] = b;
|
||||||
out.v[i] = a;
|
|
||||||
out.v[i+1] = b;
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
// Real float
|
||||||
// Real
|
inline u128f operator()(float a){
|
||||||
template <typename T>
|
u128f out;
|
||||||
inline vec<T> operator()(T a){
|
out.f[0] = a;
|
||||||
vec<T> out;
|
out.f[1] = a;
|
||||||
|
out.f[2] = a;
|
||||||
VECTOR_FOR(i, W<T>::r, 1)
|
out.f[3] = a;
|
||||||
{
|
|
||||||
out.v[i] = a;
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
//Complex double
|
||||||
// Integer
|
inline u128d operator()(double a, double b){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = a;
|
||||||
|
out.f[1] = b;
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
//Real double
|
||||||
|
inline u128d operator()(double a){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = a;
|
||||||
|
out.f[1] = a;
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
//Integer
|
||||||
inline int operator()(Integer a){
|
inline int operator()(Integer a){
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vstore{
|
struct Vstore{
|
||||||
// Real
|
//Float
|
||||||
template <typename T>
|
inline void operator()(u128f a, float* F){
|
||||||
inline void operator()(vec<T> a, T *D){
|
memcpy(F,a.f,4*sizeof(float));
|
||||||
*((vec<T> *)D) = a;
|
}
|
||||||
|
//Double
|
||||||
|
inline void operator()(u128d a, double* D){
|
||||||
|
memcpy(D,a.f,2*sizeof(double));
|
||||||
}
|
}
|
||||||
//Integer
|
//Integer
|
||||||
inline void operator()(int a, Integer *I){
|
inline void operator()(int a, Integer* I){
|
||||||
*I = a;
|
I[0] = a;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vstream{
|
struct Vstream{
|
||||||
// Real
|
//Float
|
||||||
template <typename T>
|
inline void operator()(float * a, u128f b){
|
||||||
inline void operator()(T * a, vec<T> b){
|
memcpy(a,b.f,4*sizeof(float));
|
||||||
*((vec<T> *)a) = b;
|
|
||||||
}
|
}
|
||||||
|
//Double
|
||||||
|
inline void operator()(double * a, u128d b){
|
||||||
|
memcpy(a,b.f,2*sizeof(double));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Vset{
|
struct Vset{
|
||||||
// Complex
|
// Complex float
|
||||||
template <typename T>
|
inline u128f operator()(Grid::ComplexF *a){
|
||||||
inline vec<T> operator()(std::complex<T> *a){
|
u128f out;
|
||||||
vec<T> out;
|
out.f[0] = a[0].real();
|
||||||
|
out.f[1] = a[0].imag();
|
||||||
VECTOR_FOR(i, W<T>::c, 1)
|
out.f[2] = a[1].real();
|
||||||
{
|
out.f[3] = a[1].imag();
|
||||||
out.v[2*i] = a[i].real();
|
|
||||||
out.v[2*i+1] = a[i].imag();
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
// Complex double
|
||||||
// Real
|
inline u128d operator()(Grid::ComplexD *a){
|
||||||
template <typename T>
|
u128d out;
|
||||||
inline vec<T> operator()(T *a){
|
out.f[0] = a[0].real();
|
||||||
vec<T> out;
|
out.f[1] = a[0].imag();
|
||||||
|
return out;
|
||||||
out = *((vec<T> *)a);
|
}
|
||||||
|
// Real float
|
||||||
|
inline u128f operator()(float *a){
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = a[0];
|
||||||
|
out.f[1] = a[1];
|
||||||
|
out.f[2] = a[2];
|
||||||
|
out.f[3] = a[3];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
// Real double
|
||||||
|
inline u128d operator()(double *a){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = a[0];
|
||||||
|
out.f[1] = a[1];
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Integer
|
// Integer
|
||||||
inline int operator()(Integer *a){
|
inline int operator()(Integer *a){
|
||||||
return *a;
|
return a[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/////////////////////////////////////////////////////
|
|
||||||
// Arithmetic operations
|
|
||||||
/////////////////////////////////////////////////////
|
|
||||||
struct Sum{
|
|
||||||
// Complex/Real
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a, vec<T> b){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::r, 1)
|
|
||||||
{
|
|
||||||
out.v[i] = a.v[i] + b.v[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
//I nteger
|
|
||||||
inline int operator()(int a, int b){
|
|
||||||
return a + b;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct Sub{
|
|
||||||
// Complex/Real
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a, vec<T> b){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::r, 1)
|
|
||||||
{
|
|
||||||
out.v[i] = a.v[i] - b.v[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
//Integer
|
|
||||||
inline int operator()(int a, int b){
|
|
||||||
return a-b;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct Mult{
|
|
||||||
// Real
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a, vec<T> b){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::r, 1)
|
|
||||||
{
|
|
||||||
out.v[i] = a.v[i]*b.v[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Integer
|
|
||||||
inline int operator()(int a, int b){
|
|
||||||
return a*b;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#define cmul(a, b, c, i)\
|
|
||||||
c[i] = a[i]*b[i] - a[i+1]*b[i+1];\
|
|
||||||
c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
|
|
||||||
|
|
||||||
struct MultComplex{
|
|
||||||
// Complex
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a, vec<T> b){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::c, 1)
|
|
||||||
{
|
|
||||||
cmul(a.v, b.v, out.v, 2*i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#undef cmul
|
|
||||||
|
|
||||||
struct Div{
|
|
||||||
// Real
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a, vec<T> b){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::r, 1)
|
|
||||||
{
|
|
||||||
out.v[i] = a.v[i]/b.v[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#define conj(a, b, i)\
|
|
||||||
b[i] = a[i];\
|
|
||||||
b[i+1] = -a[i+1];
|
|
||||||
|
|
||||||
struct Conj{
|
|
||||||
// Complex
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::c, 1)
|
|
||||||
{
|
|
||||||
conj(a.v, out.v, 2*i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#undef conj
|
|
||||||
|
|
||||||
#define timesmi(a, b, i)\
|
|
||||||
b[i] = a[i+1];\
|
|
||||||
b[i+1] = -a[i];
|
|
||||||
|
|
||||||
struct TimesMinusI{
|
|
||||||
// Complex
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a, vec<T> b){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::c, 1)
|
|
||||||
{
|
|
||||||
timesmi(a.v, out.v, 2*i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#undef timesmi
|
|
||||||
|
|
||||||
#define timesi(a, b, i)\
|
|
||||||
b[i] = -a[i+1];\
|
|
||||||
b[i+1] = a[i];
|
|
||||||
|
|
||||||
struct TimesI{
|
|
||||||
// Complex
|
|
||||||
template <typename T>
|
|
||||||
inline vec<T> operator()(vec<T> a, vec<T> b){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
VECTOR_FOR(i, W<T>::c, 1)
|
|
||||||
{
|
|
||||||
timesi(a.v, out.v, 2*i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#undef timesi
|
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
|
||||||
// Some Template specialization
|
|
||||||
#define perm(a, b, n, w)\
|
|
||||||
unsigned int _mask = w >> (n + 1);\
|
|
||||||
VECTOR_FOR(i, w, 1)\
|
|
||||||
{\
|
|
||||||
b[i] = a[i^_mask];\
|
|
||||||
}
|
|
||||||
|
|
||||||
#define DECL_PERMUTE_N(n)\
|
|
||||||
template <typename T>\
|
|
||||||
static inline vec<T> Permute##n(vec<T> in) {\
|
|
||||||
vec<T> out;\
|
|
||||||
perm(in.v, out.v, n, W<T>::r);\
|
|
||||||
return out;\
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Permute{
|
|
||||||
DECL_PERMUTE_N(0);
|
|
||||||
DECL_PERMUTE_N(1);
|
|
||||||
DECL_PERMUTE_N(2);
|
|
||||||
DECL_PERMUTE_N(3);
|
|
||||||
};
|
|
||||||
|
|
||||||
#undef perm
|
|
||||||
#undef DECL_PERMUTE_N
|
|
||||||
|
|
||||||
#define rot(a, b, n, w)\
|
|
||||||
VECTOR_FOR(i, w, 1)\
|
|
||||||
{\
|
|
||||||
b[i] = a[(i + n)%w];\
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Rotate{
|
|
||||||
template <typename T>
|
|
||||||
static inline vec<T> rotate(vec<T> in, int n){
|
|
||||||
vec<T> out;
|
|
||||||
|
|
||||||
rot(in.v, out.v, n, W<T>::r);
|
|
||||||
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#undef rot
|
|
||||||
|
|
||||||
#define acc(v, a, off, step, n)\
|
|
||||||
for (unsigned int i = off; i < n; i += step)\
|
|
||||||
{\
|
|
||||||
a += v[i];\
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename Out_type, typename In_type>
|
template <typename Out_type, typename In_type>
|
||||||
struct Reduce{
|
struct Reduce{
|
||||||
//Need templated class to overload output type
|
//Need templated class to overload output type
|
||||||
@ -383,66 +165,315 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
//Complex float Reduce
|
/////////////////////////////////////////////////////
|
||||||
template <>
|
// Arithmetic operations
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
|
/////////////////////////////////////////////////////
|
||||||
float a = 0.f, b = 0.f;
|
struct Sum{
|
||||||
|
//Complex/Real float
|
||||||
acc(in.v, a, 0, 2, W<float>::r);
|
inline u128f operator()(u128f a, u128f b){
|
||||||
acc(in.v, b, 1, 2, W<float>::r);
|
u128f out;
|
||||||
|
out.f[0] = a.f[0] + b.f[0];
|
||||||
return Grid::ComplexF(a, b);
|
out.f[1] = a.f[1] + b.f[1];
|
||||||
|
out.f[2] = a.f[2] + b.f[2];
|
||||||
|
out.f[3] = a.f[3] + b.f[3];
|
||||||
|
return out;
|
||||||
}
|
}
|
||||||
|
//Complex/Real double
|
||||||
|
inline u128d operator()(u128d a, u128d b){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = a.f[0] + b.f[0];
|
||||||
|
out.f[1] = a.f[1] + b.f[1];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
//Integer
|
||||||
|
inline int operator()(int a, int b){
|
||||||
|
return a + b;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Sub{
|
||||||
|
//Complex/Real float
|
||||||
|
inline u128f operator()(u128f a, u128f b){
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = a.f[0] - b.f[0];
|
||||||
|
out.f[1] = a.f[1] - b.f[1];
|
||||||
|
out.f[2] = a.f[2] - b.f[2];
|
||||||
|
out.f[3] = a.f[3] - b.f[3];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
//Complex/Real double
|
||||||
|
inline u128d operator()(u128d a, u128d b){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = a.f[0] - b.f[0];
|
||||||
|
out.f[1] = a.f[1] - b.f[1];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
//Integer
|
||||||
|
inline int operator()(int a, int b){
|
||||||
|
return a-b;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MultComplex{
|
||||||
|
// Complex float
|
||||||
|
inline u128f operator()(u128f a, u128f b){
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
|
||||||
|
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
|
||||||
|
out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
|
||||||
|
out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
// Complex double
|
||||||
|
inline u128d operator()(u128d a, u128d b){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
|
||||||
|
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Mult{
|
||||||
|
//CK: Appear unneeded
|
||||||
|
// inline float mac(float a, float b,double c){
|
||||||
|
// return 0;
|
||||||
|
// }
|
||||||
|
// inline double mac(double a, double b,double c){
|
||||||
|
// return 0;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Real float
|
||||||
|
inline u128f operator()(u128f a, u128f b){
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = a.f[0]*b.f[0];
|
||||||
|
out.f[1] = a.f[1]*b.f[1];
|
||||||
|
out.f[2] = a.f[2]*b.f[2];
|
||||||
|
out.f[3] = a.f[3]*b.f[3];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
// Real double
|
||||||
|
inline u128d operator()(u128d a, u128d b){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = a.f[0]*b.f[0];
|
||||||
|
out.f[1] = a.f[1]*b.f[1];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
// Integer
|
||||||
|
inline int operator()(int a, int b){
|
||||||
|
return a*b;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Conj{
|
||||||
|
// Complex single
|
||||||
|
inline u128f operator()(u128f in){
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = -in.f[1];
|
||||||
|
out.f[2] = in.f[2];
|
||||||
|
out.f[3] = -in.f[3];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
// Complex double
|
||||||
|
inline u128d operator()(u128d in){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = -in.f[1];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
// do not define for integer input
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TimesMinusI{
|
||||||
|
//Complex single
|
||||||
|
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = -in.f[0];
|
||||||
|
out.f[2] = in.f[3];
|
||||||
|
out.f[3] = -in.f[2];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
//Complex double
|
||||||
|
inline u128d operator()(u128d in, u128d ret){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = -in.f[0];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TimesI{
|
||||||
|
//Complex single
|
||||||
|
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = -in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
out.f[2] = -in.f[3];
|
||||||
|
out.f[3] = in.f[2];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
//Complex double
|
||||||
|
inline u128d operator()(u128d in, u128d ret){
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = -in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
// Some Template specialization
|
||||||
|
struct Permute{
|
||||||
|
//We just have to mirror the permutes of Grid_sse4.h
|
||||||
|
static inline u128f Permute0(u128f in){ //AB CD -> CD AB
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[2];
|
||||||
|
out.f[1] = in.f[3];
|
||||||
|
out.f[2] = in.f[0];
|
||||||
|
out.f[3] = in.f[1];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128f Permute1(u128f in){ //AB CD -> BA DC
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
out.f[2] = in.f[3];
|
||||||
|
out.f[3] = in.f[2];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128f Permute2(u128f in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128f Permute3(u128f in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline u128d Permute0(u128d in){ //AB -> BA
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128d Permute1(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128d Permute2(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128d Permute3(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
template < typename vtype >
|
||||||
|
void permute(vtype &a, vtype b, int perm) {
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Rotate{
|
||||||
|
|
||||||
|
static inline u128f rotate(u128f in,int n){
|
||||||
|
u128f out;
|
||||||
|
switch(n){
|
||||||
|
case 0:
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = in.f[1];
|
||||||
|
out.f[2] = in.f[2];
|
||||||
|
out.f[3] = in.f[3];
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[2];
|
||||||
|
out.f[2] = in.f[3];
|
||||||
|
out.f[3] = in.f[0];
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
out.f[0] = in.f[2];
|
||||||
|
out.f[1] = in.f[3];
|
||||||
|
out.f[2] = in.f[0];
|
||||||
|
out.f[3] = in.f[1];
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
out.f[0] = in.f[3];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
out.f[2] = in.f[1];
|
||||||
|
out.f[3] = in.f[2];
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
static inline u128d rotate(u128d in,int n){
|
||||||
|
u128d out;
|
||||||
|
switch(n){
|
||||||
|
case 0:
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = in.f[1];
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//Complex float Reduce
|
||||||
|
template<>
|
||||||
|
inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
|
||||||
|
return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
|
||||||
|
}
|
||||||
//Real float Reduce
|
//Real float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
|
inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
|
||||||
float a = 0.;
|
return in.f[0] + in.f[1] + in.f[2] + in.f[3];
|
||||||
|
|
||||||
acc(in.v, a, 0, 1, W<float>::r);
|
|
||||||
|
|
||||||
return a;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//Complex double Reduce
|
//Complex double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
|
inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
|
||||||
double a = 0., b = 0.;
|
return Grid::ComplexD(in.f[0],in.f[1]);
|
||||||
|
|
||||||
acc(in.v, a, 0, 2, W<double>::r);
|
|
||||||
acc(in.v, b, 1, 2, W<double>::r);
|
|
||||||
|
|
||||||
return Grid::ComplexD(a, b);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Real double Reduce
|
//Real double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
|
inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
|
||||||
double a = 0.f;
|
return in.f[0] + in.f[1];
|
||||||
|
|
||||||
acc(in.v, a, 0, 1, W<double>::r);
|
|
||||||
|
|
||||||
return a;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Integer Reduce<Integer, int>::operator()(int in){
|
inline Integer Reduce<Integer, int>::operator()(int in){
|
||||||
return in;
|
// FIXME unimplemented
|
||||||
|
printf("Reduce : Missing integer implementation -> FIX\n");
|
||||||
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Here assign types
|
// Here assign types
|
||||||
|
|
||||||
typedef Optimization::vecf SIMD_Ftype; // Single precision type
|
typedef Optimization::u128f SIMD_Ftype; // Single precision type
|
||||||
typedef Optimization::vecd SIMD_Dtype; // Double precision type
|
typedef Optimization::u128d SIMD_Dtype; // Double precision type
|
||||||
typedef int SIMD_Itype; // Integer type
|
typedef int SIMD_Itype; // Integer type
|
||||||
|
|
||||||
// prefetch utilities
|
// prefetch utilities
|
||||||
inline void v_prefetch0(int size, const char *ptr){};
|
inline void v_prefetch0(int size, const char *ptr){};
|
||||||
inline void prefetch_HINT_T0(const char *ptr){};
|
inline void prefetch_HINT_T0(const char *ptr){};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Gpermute function
|
||||||
|
template < typename VectorSIMD >
|
||||||
|
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
|
||||||
|
Optimization::permute(y.v,b.v,perm);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
typedef Optimization::Vsplat VsplatSIMD;
|
typedef Optimization::Vsplat VsplatSIMD;
|
||||||
typedef Optimization::Vstore VstoreSIMD;
|
typedef Optimization::Vstore VstoreSIMD;
|
||||||
@ -450,13 +481,16 @@ namespace Optimization {
|
|||||||
typedef Optimization::Vstream VstreamSIMD;
|
typedef Optimization::Vstream VstreamSIMD;
|
||||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Arithmetic operations
|
// Arithmetic operations
|
||||||
typedef Optimization::Sum SumSIMD;
|
typedef Optimization::Sum SumSIMD;
|
||||||
typedef Optimization::Sub SubSIMD;
|
typedef Optimization::Sub SubSIMD;
|
||||||
typedef Optimization::Div DivSIMD;
|
|
||||||
typedef Optimization::Mult MultSIMD;
|
typedef Optimization::Mult MultSIMD;
|
||||||
typedef Optimization::MultComplex MultComplexSIMD;
|
typedef Optimization::MultComplex MultComplexSIMD;
|
||||||
typedef Optimization::Conj ConjSIMD;
|
typedef Optimization::Conj ConjSIMD;
|
||||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||||
typedef Optimization::TimesI TimesISIMD;
|
typedef Optimization::TimesI TimesISIMD;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -26,6 +26,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
//----------------------------------------------------------------------
|
||||||
|
/*! @file Grid_knc.h
|
||||||
|
@brief Optimization libraries for AVX512 instructions set for KNC
|
||||||
|
|
||||||
|
Using intrinsics
|
||||||
|
*/
|
||||||
|
// Time-stamp: <2015-06-09 14:27:28 neo>
|
||||||
|
//----------------------------------------------------------------------
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#include <zmmintrin.h>
|
#include <zmmintrin.h>
|
||||||
|
@ -245,21 +245,6 @@ namespace Optimization {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Div{
|
|
||||||
// Real double
|
|
||||||
inline vector4double operator()(vector4double a, vector4double b){
|
|
||||||
return vec_swdiv(a, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Real float
|
|
||||||
FLOAT_WRAP_2(operator(), inline)
|
|
||||||
|
|
||||||
// Integer
|
|
||||||
inline int operator()(int a, int b){
|
|
||||||
return a/b;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct Conj{
|
struct Conj{
|
||||||
// Complex double
|
// Complex double
|
||||||
inline vector4double operator()(vector4double v){
|
inline vector4double operator()(vector4double v){
|
||||||
@ -428,7 +413,6 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
|||||||
typedef Optimization::Sum SumSIMD;
|
typedef Optimization::Sum SumSIMD;
|
||||||
typedef Optimization::Sub SubSIMD;
|
typedef Optimization::Sub SubSIMD;
|
||||||
typedef Optimization::Mult MultSIMD;
|
typedef Optimization::Mult MultSIMD;
|
||||||
typedef Optimization::Div DivSIMD;
|
|
||||||
typedef Optimization::MultComplex MultComplexSIMD;
|
typedef Optimization::MultComplex MultComplexSIMD;
|
||||||
typedef Optimization::Conj ConjSIMD;
|
typedef Optimization::Conj ConjSIMD;
|
||||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||||
|
@ -38,13 +38,13 @@ directory
|
|||||||
#ifndef GRID_VECTOR_TYPES
|
#ifndef GRID_VECTOR_TYPES
|
||||||
#define GRID_VECTOR_TYPES
|
#define GRID_VECTOR_TYPES
|
||||||
|
|
||||||
#ifdef GEN
|
#ifdef GENERIC_VEC
|
||||||
#include "Grid_generic.h"
|
#include "Grid_generic.h"
|
||||||
#endif
|
#endif
|
||||||
#ifdef SSE4
|
#ifdef SSE4
|
||||||
#include "Grid_sse4.h"
|
#include "Grid_sse4.h"
|
||||||
#endif
|
#endif
|
||||||
#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
|
#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
|
||||||
#include "Grid_avx.h"
|
#include "Grid_avx.h"
|
||||||
#endif
|
#endif
|
||||||
#if defined AVX512
|
#if defined AVX512
|
||||||
@ -60,13 +60,6 @@ directory
|
|||||||
#include "Grid_neon.h"
|
#include "Grid_neon.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GRID_DEFAULT_PRECISION_SINGLE
|
|
||||||
#define GRID_REAL_DIGITS FLT_DIG
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
|
|
||||||
#define GRID_REAL_DIGITS DBL_DIG
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
//////////////////////////////////////
|
//////////////////////////////////////
|
||||||
@ -137,7 +130,7 @@ class Grid_simd {
|
|||||||
|
|
||||||
Vector_type v;
|
Vector_type v;
|
||||||
|
|
||||||
static inline constexpr int Nsimd(void) {
|
static inline int Nsimd(void) {
|
||||||
return sizeof(Vector_type) / sizeof(Scalar_type);
|
return sizeof(Vector_type) / sizeof(Scalar_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -153,23 +146,6 @@ class Grid_simd {
|
|||||||
Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
|
Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
|
||||||
Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
|
Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
|
||||||
|
|
||||||
/////////////////////////////
|
|
||||||
// Comparisons
|
|
||||||
/////////////////////////////
|
|
||||||
inline bool operator==(const Grid_simd& lhs){
|
|
||||||
Grid_simd::conv_t conv1;
|
|
||||||
Grid_simd::conv_t conv2;
|
|
||||||
conv1.v = v;
|
|
||||||
conv2.v = lhs.v;
|
|
||||||
return std::equal(std::begin(conv1.s), std::end(conv1.s), std::begin(conv2.s));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator!=(const Grid_simd& lhs){
|
|
||||||
return !(*this == lhs);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Constructors
|
// Constructors
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
54
m4/ac_prog_doxygen.m4
Normal file
54
m4/ac_prog_doxygen.m4
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
dnl Check for doxygen to create API docs
|
||||||
|
dnl
|
||||||
|
AC_DEFUN([AC_PROG_DOXYGEN],
|
||||||
|
[
|
||||||
|
AC_ARG_ENABLE(doxygen,
|
||||||
|
AS_HELP_STRING([--enable-doxygen],[enable documentation generation with doxygen (auto)]))
|
||||||
|
|
||||||
|
AC_ARG_ENABLE(dot,
|
||||||
|
AS_HELP_STRING([--enable-dot],[use 'dot' to generate graphs in doxygen (auto)]))
|
||||||
|
AC_ARG_ENABLE(html-docs,
|
||||||
|
AS_HELP_STRING([--enable-html-docs],[enable HTML generation with doxygen (yes)]),
|
||||||
|
[],[ enable_html_docs=yes])
|
||||||
|
AC_ARG_ENABLE(latex-docs,
|
||||||
|
AS_HELP_STRING([--enable-latex-docs],
|
||||||
|
[enable LaTeX documentation generation with doxygen (no)]),[],[enable_latex_docs=no])
|
||||||
|
|
||||||
|
if test "x$enable_doxygen" = xno; then
|
||||||
|
enable_doc=no
|
||||||
|
else
|
||||||
|
AC_CHECK_PROG(DOXYGEN, doxygen, doxygen)
|
||||||
|
if test x$DOXYGEN = x; then
|
||||||
|
if test "x$enable_doxygen" = xyes; then
|
||||||
|
AC_MSG_ERROR([could not find doxygen])
|
||||||
|
fi
|
||||||
|
enable_doc=no
|
||||||
|
else
|
||||||
|
doxy_ver=`doxygen --version`
|
||||||
|
doxy_major=`expr "$doxy_ver" : '\(@<:@0-9@:>@\)\..*'`
|
||||||
|
doxy_minor=`expr "$doxy_ver" : '@<:@0-9@:>@\.\(@<:@0-9@:>@\).*'`
|
||||||
|
if test $doxy_major -eq "1" -a $doxy_minor -ge "3" ; then
|
||||||
|
enable_doc=yes
|
||||||
|
AC_CHECK_PROG(DOT, dot, dot)
|
||||||
|
else
|
||||||
|
AC_MSG_WARN([doxygen version $doxy_ver too old, doxygen will not be used.])
|
||||||
|
enable_doc=no
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
AM_CONDITIONAL(DOXYGEN_DOC, test x$enable_doc = xyes)
|
||||||
|
|
||||||
|
if test x$DOT = x; then
|
||||||
|
if test "x$enable_dot" = xyes; then
|
||||||
|
AC_MSG_ERROR([could not find dot])
|
||||||
|
fi
|
||||||
|
enable_dot=no
|
||||||
|
else
|
||||||
|
enable_dot=yes
|
||||||
|
fi
|
||||||
|
|
||||||
|
AC_SUBST(enable_dot)
|
||||||
|
AC_SUBST(enable_html_docs)
|
||||||
|
AC_SUBST(enable_latex_docs)
|
||||||
|
])
|
@ -50,12 +50,6 @@ public:
|
|||||||
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
|
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
|
||||||
std::string name(void) const { return std::string("Times"); }
|
std::string name(void) const { return std::string("Times"); }
|
||||||
};
|
};
|
||||||
class funcDivide {
|
|
||||||
public:
|
|
||||||
funcDivide() {};
|
|
||||||
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;}
|
|
||||||
std::string name(void) const { return std::string("Divide"); }
|
|
||||||
};
|
|
||||||
class funcConj {
|
class funcConj {
|
||||||
public:
|
public:
|
||||||
funcConj() {};
|
funcConj() {};
|
||||||
@ -347,7 +341,6 @@ int main (int argc, char ** argv)
|
|||||||
Tester<RealF,vRealF>(funcPlus());
|
Tester<RealF,vRealF>(funcPlus());
|
||||||
Tester<RealF,vRealF>(funcMinus());
|
Tester<RealF,vRealF>(funcMinus());
|
||||||
Tester<RealF,vRealF>(funcTimes());
|
Tester<RealF,vRealF>(funcTimes());
|
||||||
Tester<RealF,vRealF>(funcDivide());
|
|
||||||
Tester<RealF,vRealF>(funcAdj());
|
Tester<RealF,vRealF>(funcAdj());
|
||||||
Tester<RealF,vRealF>(funcConj());
|
Tester<RealF,vRealF>(funcConj());
|
||||||
Tester<RealF,vRealF>(funcInnerProduct());
|
Tester<RealF,vRealF>(funcInnerProduct());
|
||||||
@ -378,7 +371,6 @@ int main (int argc, char ** argv)
|
|||||||
Tester<RealD,vRealD>(funcPlus());
|
Tester<RealD,vRealD>(funcPlus());
|
||||||
Tester<RealD,vRealD>(funcMinus());
|
Tester<RealD,vRealD>(funcMinus());
|
||||||
Tester<RealD,vRealD>(funcTimes());
|
Tester<RealD,vRealD>(funcTimes());
|
||||||
Tester<RealD,vRealD>(funcDivide());
|
|
||||||
Tester<RealD,vRealD>(funcAdj());
|
Tester<RealD,vRealD>(funcAdj());
|
||||||
Tester<RealD,vRealD>(funcConj());
|
Tester<RealD,vRealD>(funcConj());
|
||||||
Tester<RealD,vRealD>(funcInnerProduct());
|
Tester<RealD,vRealD>(funcInnerProduct());
|
||||||
|
@ -68,7 +68,7 @@ int main (int argc, char ** argv)
|
|||||||
for(int mu=0;mu<4;mu++){
|
for(int mu=0;mu<4;mu++){
|
||||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||||
LatticeCoordinate(coor,mu);
|
LatticeCoordinate(coor,mu);
|
||||||
C = C + (TwoPiL * p[mu]) * coor;
|
C = C - (TwoPiL * p[mu]) * coor;
|
||||||
}
|
}
|
||||||
|
|
||||||
C = exp(C*ci);
|
C = exp(C*ci);
|
||||||
@ -78,11 +78,10 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
FFT theFFT(&Fine);
|
FFT theFFT(&Fine);
|
||||||
|
|
||||||
Ctilde = C;
|
theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
|
theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
|
theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
|
theFFT.FFT_dim(Ctilde,C,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
|
|
||||||
|
|
||||||
// C=zero;
|
// C=zero;
|
||||||
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
|
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
|
||||||
@ -94,11 +93,10 @@ int main (int argc, char ** argv)
|
|||||||
C=C-Ctilde;
|
C=C-Ctilde;
|
||||||
std::cout << "diff scalar "<<norm2(C) << std::endl;
|
std::cout << "diff scalar "<<norm2(C) << std::endl;
|
||||||
|
|
||||||
Stilde = S;
|
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
|
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
|
|
||||||
|
|
||||||
SpinMatrixF Sp;
|
SpinMatrixF Sp;
|
||||||
Sp = zero; Sp = Sp+cVol;
|
Sp = zero; Sp = Sp+cVol;
|
||||||
|
@ -1,215 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./tests/Test_cayley_cg_reproducibility.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
using namespace Grid::QCD;
|
|
||||||
|
|
||||||
#define REPRODUCIBILITY_INTERVAL 1
|
|
||||||
|
|
||||||
|
|
||||||
template<class d>
|
|
||||||
struct scal {
|
|
||||||
d internal;
|
|
||||||
};
|
|
||||||
|
|
||||||
Gamma::GammaMatrix Gmu [] = {
|
|
||||||
Gamma::GammaX,
|
|
||||||
Gamma::GammaY,
|
|
||||||
Gamma::GammaZ,
|
|
||||||
Gamma::GammaT
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class What>
|
|
||||||
void TestCGinversions(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5);
|
|
||||||
template<class What>
|
|
||||||
void TestCGschur(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5);
|
|
||||||
|
|
||||||
template<class What>
|
|
||||||
void TestCGunprec(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5);
|
|
||||||
|
|
||||||
template<class What>
|
|
||||||
void TestCGprec(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5);
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc,&argv);
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "###########################################" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "This test disables the convergence alert" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "and checks whether in consecutive runs of the CG" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "the internal reductions are bit reproducible." << std::endl;
|
|
||||||
std::cout << GridLogMessage << "A warning and a report is produced for every failure" << std::endl;
|
|
||||||
std::cout << GridLogMessage << "###########################################" << std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
const int Ls=8;
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
|
||||||
SU3::HotConfiguration(RNG4,Umu);
|
|
||||||
std::vector<LatticeColourMatrix> U(4,UGrid);
|
|
||||||
|
|
||||||
RealD mass=0.1;
|
|
||||||
RealD M5 =1.8;
|
|
||||||
std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
|
|
||||||
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
|
||||||
TestCGinversions<DomainWallFermionR>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
|
|
||||||
|
|
||||||
RealD b=1.5;// Scale factor b+c=2, b-c=1
|
|
||||||
RealD c=0.5;
|
|
||||||
std::cout<<GridLogMessage <<"MobiusFermion test"<<std::endl;
|
|
||||||
MobiusFermionR Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
|
|
||||||
TestCGinversions<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
|
|
||||||
MobiusZolotarevFermionR Dzolo(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,0.1,2.0);
|
|
||||||
TestCGinversions<MobiusZolotarevFermionR>(Dzolo,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"ScaledShamirFermion test"<<std::endl;
|
|
||||||
ScaledShamirFermionR Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0);
|
|
||||||
TestCGinversions<ScaledShamirFermionR>(Dsham,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"ShamirZolotarevFermion test"<<std::endl;
|
|
||||||
ShamirZolotarevFermionR Dshamz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
|
|
||||||
TestCGinversions<ShamirZolotarevFermionR>(Dshamz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"OverlapWilsonCayleyTanhFermion test"<<std::endl;
|
|
||||||
OverlapWilsonCayleyTanhFermionR Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
|
|
||||||
TestCGinversions<OverlapWilsonCayleyTanhFermionR>(Dov,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"OverlapWilsonCayleyZolotarevFermion test"<<std::endl;
|
|
||||||
OverlapWilsonCayleyZolotarevFermionR Dovz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
|
|
||||||
TestCGinversions<OverlapWilsonCayleyZolotarevFermionR>(Dovz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
||||||
template<class What>
|
|
||||||
void TestCGinversions(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5)
|
|
||||||
{
|
|
||||||
std::cout<<GridLogMessage << "Testing unpreconditioned inverter"<<std::endl;
|
|
||||||
TestCGunprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
|
|
||||||
std::cout<<GridLogMessage << "Testing red black preconditioned inverter"<<std::endl;
|
|
||||||
TestCGprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
|
|
||||||
std::cout<<GridLogMessage << "Testing red black Schur inverter"<<std::endl;
|
|
||||||
TestCGschur<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class What>
|
|
||||||
void TestCGunprec(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5)
|
|
||||||
{
|
|
||||||
LatticeFermion src (FGrid); random(*RNG5,src);
|
|
||||||
LatticeFermion result(FGrid); result=zero;
|
|
||||||
|
|
||||||
MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
|
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
|
||||||
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
|
|
||||||
CG(HermOp,src,result);
|
|
||||||
|
|
||||||
}
|
|
||||||
template<class What>
|
|
||||||
void TestCGprec(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5)
|
|
||||||
{
|
|
||||||
LatticeFermion src (FGrid); random(*RNG5,src);
|
|
||||||
LatticeFermion src_o(FrbGrid);
|
|
||||||
LatticeFermion result_o(FrbGrid);
|
|
||||||
pickCheckerboard(Odd,src_o,src);
|
|
||||||
result_o=zero;
|
|
||||||
|
|
||||||
SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
|
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
|
||||||
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
|
|
||||||
CG(HermOpEO,src_o,result_o);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class What>
|
|
||||||
void TestCGschur(What & Ddwf,
|
|
||||||
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
|
|
||||||
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
|
|
||||||
RealD mass, RealD M5,
|
|
||||||
GridParallelRNG *RNG4,
|
|
||||||
GridParallelRNG *RNG5)
|
|
||||||
{
|
|
||||||
LatticeFermion src (FGrid); random(*RNG5,src);
|
|
||||||
LatticeFermion result(FGrid); result=zero;
|
|
||||||
|
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
|
|
||||||
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
|
|
||||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
|
|
||||||
SchurSolver(Ddwf,src,result);
|
|
||||||
}
|
|
@ -63,9 +63,6 @@ class HmcRunner : public NerscHmcRunner {
|
|||||||
Real mass = -0.77;
|
Real mass = -0.77;
|
||||||
FermionAction FermOp(U, *FGrid, *FrbGrid, mass);
|
FermionAction FermOp(U, *FGrid, *FrbGrid, mass);
|
||||||
|
|
||||||
// To enable the CG reproducibility tests use
|
|
||||||
//ConjugateGradient<FermionField> CG(1.0e-8, 10000, ReproducibilityTest);
|
|
||||||
// This is the plain version
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8, 10000);
|
ConjugateGradient<FermionField> CG(1.0e-8, 10000);
|
||||||
|
|
||||||
TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);
|
TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);
|
||||||
|
@ -71,7 +71,7 @@ int main (int argc, char ** argv)
|
|||||||
WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
|
WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
|
||||||
|
|
||||||
MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
|
MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000,true, true);
|
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
|
||||||
CG(HermOp,src,result);
|
CG(HermOp,src,result);
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
|
Reference in New Issue
Block a user