mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

Merge branch 'develop' into feature/hmc_generalise

This commit is contained in:
Guido Cossu 2016-12-05 05:10:27 +00:00
commit 01480da0a8
30 changed files with 1384 additions and 3038 deletions

.gitignore vendored
View File

@ -47,7 +47,9 @@ Config.h.in
config.log
config.status
.deps
-*.inc
+Make.inc
+eigen.inc
+Eigen.inc
# http://www.gnu.org/software/autoconf #
########################################

View File

@ -1,10 +1,12 @@
# additional include paths necessary to compile the C++ library
SUBDIRS = lib benchmarks tests
-.PHONY: tests
+include $(top_srcdir)/doxygen.inc
tests: all
	$(MAKE) -C tests tests
+.PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
AM_CXXFLAGS += -I$(top_builddir)/include
ACLOCAL_AMFLAGS = -I m4

View File

@ -116,13 +116,15 @@ If you want to build all the tests at once just use `make tests`.
- `--with-fftw=<path>`: look for FFTW in the UNIX prefix `<path>`
- `--enable-lapack[=<path>]`: enable LAPACK support in the Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
- `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK, if enabled) routines. A UNIX prefix containing the library can be specified (optional).
- `--enable-numa`: enable NUMA first-touch optimisation
- `--enable-simd=<code>`: set up Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-comms=<comm>`: use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below.
- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48`).
- `--disable-timers`: disable system-dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests.
- `--enable-doxygen-doc`: enable Doxygen documentation generation (build with `make doxygen-doc`).

### Possible communication interfaces

@ -136,7 +138,7 @@ The following options can be use with the `--enable-comms=` option to target dif
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
| `shmem`        | Cray SHMEM communications |

For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified, `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts; use the standard versions instead.

### Possible SIMD types

@ -165,6 +167,7 @@ Alternatively, some CPU codenames can be directly used:
- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid once the AVX512 support within GCC and clang is more advanced.
- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
- BG/Q performance is currently rather poor. This is being investigated for future versions.
- The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.

### Build setup for Intel Knights Landing platform

View File

@ -0,0 +1,183 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making Vec5d innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Seeded"<<std::endl;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "made random gauge fields"<<std::endl;
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
if (1)
{
const int ncall=1000;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
GridParallelRNG RNG5(FGrid);
LatticeFermion src(FGrid); random(RNG5,src);
LatticeFermion result(FGrid);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
double t0,t1;
LatticeFermion r_eo(FGrid);
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
setCheckerboard(r_eo,src_o);
setCheckerboard(r_eo,src_e);
r_e = zero;
r_o = zero;
#define BENCH_DW(A,in,out) \
Dw.CayleyZeroCounters(); \
Dw. A (in,out); \
FGrid->Barrier(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
Dw. A (in,out); \
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
#define BENCH_DW_MEO(A,in,out) \
Dw.CayleyZeroCounters(); \
Dw. A (in,out,0); \
FGrid->Barrier(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
Dw. A (in,out,0); \
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
BENCH_DW_MEO(Dhop ,src,result);
BENCH_DW_MEO(DhopEO ,src_o,r_e);
BENCH_DW(Meooe ,src_o,r_e);
BENCH_DW(Mooee ,src_o,r_o);
BENCH_DW(MooeeInv,src_o,r_o);
}
if (1)
{
const int ncall=1000;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
GridParallelRNG RNG5(sFGrid);
LatticeFermion src(sFGrid); random(RNG5,src);
LatticeFermion sref(sFGrid);
LatticeFermion result(sFGrid);
std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
FGrid->Barrier();
double t0,t1;
LatticeFermion r_eo(sFGrid);
LatticeFermion src_e (sFrbGrid);
LatticeFermion src_o (sFrbGrid);
LatticeFermion r_e (sFrbGrid);
LatticeFermion r_o (sFrbGrid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
setCheckerboard(r_eo,src_o);
setCheckerboard(r_eo,src_e);
r_e = zero;
r_o = zero;
BENCH_DW_MEO(Dhop ,src,result);
BENCH_DW_MEO(DhopEO ,src_o,r_e);
BENCH_DW(Meooe ,src_o,r_e);
BENCH_DW(Mooee ,src_o,r_o);
BENCH_DW(MooeeInv,src_o,r_o);
}
Grid_finalize();
}

View File

@ -1 +1,11 @@
include Make.inc
simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o
EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a
libsimple_su3_test_a_SOURCES = simple_su3_test.cc
libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc
libsimple_simd_test_a_SOURCES = simple_simd_test.cc

View File

@ -0,0 +1,11 @@
#include <Grid/Grid.h>
Grid::vRealD add(const Grid::vRealD &x, const Grid::vRealD &y)
{
return x+y;
}
Grid::vRealD sub(const Grid::vRealD &x, const Grid::vRealD &y)
{
return x-y;
}

View File

@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
using namespace std;
using namespace Grid;

View File

@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
using namespace std;
using namespace Grid;

View File

@ -161,8 +161,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
############### SIMD instruction selection
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=<code>],
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
              [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
+AC_ARG_ENABLE([gen-simd-width],
+            [AS_HELP_STRING([--enable-gen-simd-width=size],
+            [size (in bytes) of the generic SIMD vectors (default: 32)])],
+            [ac_gen_simd_width=$enable_gen_simd_width],
+            [ac_gen_simd_width=32])
case ${ax_cv_cxx_compiler_vendor} in
  clang|gnu)
@ -192,7 +198,10 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-march=knl';;
      GEN)
-       AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+       AC_DEFINE([GEN],[1],[generic vector code])
+       AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
+                          [generic SIMD vector width (in bytes)])
+       SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
        SIMD_FLAGS='';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
@ -209,8 +218,8 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
        SIMD_FLAGS='-mavx -xavx';;
      AVXFMA)
-       AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
-       SIMD_FLAGS='-mavx -mfma';;
+       AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
+       SIMD_FLAGS='-mavx -fma';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
@ -224,7 +233,10 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
        SIMD_FLAGS='-xmic-avx512';;
      GEN)
-       AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+       AC_DEFINE([GEN],[1],[generic vector code])
+       AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
+                          [generic SIMD vector width (in bytes)])
+       SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
@ -290,7 +302,7 @@ esac
case ${ac_COMMS} in
  *-auto)
    LX_FIND_MPI
-   if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
+   if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi
    AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
    AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
    AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
@ -354,12 +366,17 @@ esac
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
############### Doxygen
-AC_PROG_DOXYGEN
-if test -n "$DOXYGEN"
-then
-    AC_CONFIG_FILES([docs/doxy.cfg])
-fi
+DX_DOXYGEN_FEATURE([OFF])
+DX_DOT_FEATURE([OFF])
+DX_HTML_FEATURE([ON])
+DX_CHM_FEATURE([OFF])
+DX_CHI_FEATURE([OFF])
+DX_MAN_FEATURE([OFF])
+DX_RTF_FEATURE([OFF])
+DX_XML_FEATURE([OFF])
+DX_PDF_FEATURE([OFF])
+DX_PS_FEATURE([OFF])
+DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
############### Ouput
cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
@ -399,7 +416,7 @@ os (target) : $target_os
compiler vendor : ${ax_cv_cxx_compiler_vendor}
compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
-SIMD : ${ac_SIMD}
+SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
Communications type : ${comms_type}
Default precision : ${ac_PRECISION}
@ -408,8 +425,7 @@ GMP : `if test "x$have_gmp" = xtrue; then echo yes; else
LAPACK : ${ac_LAPACK}
FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
LIME (ILDG support) : `if test "x$have_lime" = xtrue; then echo yes; else echo no; fi`
-build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
+build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
-graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
----- BUILD FLAGS -------------------------------------
CXXFLAGS:
`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`

File diff suppressed because it is too large

File diff suppressed because it is too large

doxygen.inc Normal file
View File

@ -0,0 +1,184 @@
# Copyright (C) 2004 Oren Ben-Kiki
# This file is distributed under the same terms as the Automake macro files.
# Generate automatic documentation using Doxygen. Goals and variables values
# are controlled by the various DX_COND_??? conditionals set by autoconf.
#
# The provided goals are:
# doxygen-doc: Generate all doxygen documentation.
# doxygen-run: Run doxygen, which will generate some of the documentation
# (HTML, CHM, CHI, MAN, RTF, XML) but will not do the post
# processing required for the rest of it (PS, PDF, and some MAN).
# doxygen-man: Rename some doxygen generated man pages.
# doxygen-ps: Generate doxygen PostScript documentation.
# doxygen-pdf: Generate doxygen PDF documentation.
#
# Note that by default these are not integrated into the automake goals. If
# doxygen is used to generate man pages, you can achieve this integration by
# setting man3_MANS to the list of man pages generated and then adding the
# dependency:
#
# $(man3_MANS): doxygen-doc
#
# This will cause make to run doxygen and generate all the documentation.
#
# The following variable is intended for use in Makefile.am:
#
# DX_CLEANFILES = everything to clean.
#
# This is usually added to MOSTLYCLEANFILES.
## --------------------------------- ##
## Format-independent Doxygen rules. ##
## --------------------------------- ##
if DX_COND_doc
## ------------------------------- ##
## Rules specific for HTML output. ##
## ------------------------------- ##
if DX_COND_html
DX_CLEAN_HTML = @DX_DOCDIR@/html
endif DX_COND_html
## ------------------------------ ##
## Rules specific for CHM output. ##
## ------------------------------ ##
if DX_COND_chm
DX_CLEAN_CHM = @DX_DOCDIR@/chm
if DX_COND_chi
DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi
endif DX_COND_chi
endif DX_COND_chm
## ------------------------------ ##
## Rules specific for MAN output. ##
## ------------------------------ ##
if DX_COND_man
DX_CLEAN_MAN = @DX_DOCDIR@/man
endif DX_COND_man
## ------------------------------ ##
## Rules specific for RTF output. ##
## ------------------------------ ##
if DX_COND_rtf
DX_CLEAN_RTF = @DX_DOCDIR@/rtf
endif DX_COND_rtf
## ------------------------------ ##
## Rules specific for XML output. ##
## ------------------------------ ##
if DX_COND_xml
DX_CLEAN_XML = @DX_DOCDIR@/xml
endif DX_COND_xml
## ----------------------------- ##
## Rules specific for PS output. ##
## ----------------------------- ##
if DX_COND_ps
DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps
DX_PS_GOAL = doxygen-ps
doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps
@DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag
cd @DX_DOCDIR@/latex; \
rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
$(DX_LATEX) refman.tex; \
$(MAKEINDEX_PATH) refman.idx; \
$(DX_LATEX) refman.tex; \
countdown=5; \
while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
refman.log > /dev/null 2>&1 \
&& test $$countdown -gt 0; do \
$(DX_LATEX) refman.tex; \
countdown=`expr $$countdown - 1`; \
done; \
$(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi
endif DX_COND_ps
## ------------------------------ ##
## Rules specific for PDF output. ##
## ------------------------------ ##
if DX_COND_pdf
DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf
DX_PDF_GOAL = doxygen-pdf
doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf
@DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag
cd @DX_DOCDIR@/latex; \
rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
$(DX_PDFLATEX) refman.tex; \
$(DX_MAKEINDEX) refman.idx; \
$(DX_PDFLATEX) refman.tex; \
countdown=5; \
while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
refman.log > /dev/null 2>&1 \
&& test $$countdown -gt 0; do \
$(DX_PDFLATEX) refman.tex; \
countdown=`expr $$countdown - 1`; \
done; \
mv refman.pdf ../@PACKAGE@.pdf
endif DX_COND_pdf
## ------------------------------------------------- ##
## Rules specific for LaTeX (shared for PS and PDF). ##
## ------------------------------------------------- ##
if DX_COND_latex
DX_CLEAN_LATEX = @DX_DOCDIR@/latex
endif DX_COND_latex
.INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag
doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
rm -rf @DX_DOCDIR@
$(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
DX_CLEANFILES = \
@DX_DOCDIR@/@PACKAGE@.tag \
-r \
$(DX_CLEAN_HTML) \
$(DX_CLEAN_CHM) \
$(DX_CLEAN_CHI) \
$(DX_CLEAN_MAN) \
$(DX_CLEAN_RTF) \
$(DX_CLEAN_XML) \
$(DX_CLEAN_PS) \
$(DX_CLEAN_PDF) \
$(DX_CLEAN_LATEX)
endif DX_COND_doc

View File

@ -244,7 +244,10 @@ namespace Grid {
        pokeLocalSite(s,pgbuf,cbuf);
      }
    }
-   result = Cshift(result,dim,L);
+   if (p != processors[dim] - 1)
+   {
+     result = Cshift(result,dim,L);
+   }
  }
  // Loop over orthog coords
@ -287,10 +290,10 @@ namespace Grid {
    cgbuf = clbuf;
    cgbuf[dim] = clbuf[dim]+L*pc;
    peekLocalSite(s,pgbuf,cgbuf);
-   s = s * div;
    pokeLocalSite(s,result,clbuf);
  }
}
+result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);

View File

@ -1080,10 +1080,10 @@ say con = 2
**/
template<class T>
-static void Lock(DenseMatrix<T> &H, ///Hess mtx
-                 DenseMatrix<T> &Q, ///Lock Transform
-                 T val,             ///value to be locked
-                 int con,           ///number already locked
+static void Lock(DenseMatrix<T> &H, // Hess mtx
+                 DenseMatrix<T> &Q, // Lock Transform
+                 T val,             // value to be locked
+                 int con,           // number already locked
                  RealD small,
                  int dfg,
                  bool herm)

View File

@ -208,6 +208,7 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@ -216,6 +217,20 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
+// Ls vectorised
+typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
+typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
+typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
+typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
+typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
+typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
+typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
+typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
+typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@ -267,6 +282,7 @@ typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
}}
///////////////////////////////////////////////////////////////////////////////
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code

View File

@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s); // chi = (1-c[s] D_W) psi
  }
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
{
this->Report();
std::vector<int> latt = GridDefaultLatt();
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = this->_FourDimGrid->_Nprocessors;
if ( M5Dcalls > 0 ) {
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
// Flops = 6.0*(Nc*Ns) *Ls*vol
RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
}
if ( MooeeInvCalls > 0 ) {
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
// Flops = 9*12*Ls*vol/2
RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
}
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
{
this->ZeroCounters();
M5Dflops=0;
M5Dcalls=0;
M5Dtime=0;
MooeeInvFlops=0;
MooeeInvCalls=0;
MooeeInvTime=0;
}
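A note on the counters defined above: they are meant to be driven from a timing loop like the BENCH_DW macros in the benchmark earlier in this commit. A minimal usage sketch, assuming the DomainWallFermionR Dw, the fields src_o and r_e, and the grid FGrid already constructed as in that benchmark (sketch only, not part of the library):

// Sketch: zero the Cayley counters, time ncall applications of Meooe,
// then print the call counts and flop rates accumulated by CayleyReport().
const int ncall = 1000;
Dw.CayleyZeroCounters();          // reset M5D/MooeeInv call, time and flop counters
FGrid->Barrier();
double t0 = usecond();
for (int i = 0; i < ncall; i++) {
  Dw.Meooe(src_o, r_e);           // each call updates M5Dcalls/M5Dtime internally
}
double t1 = usecond();
FGrid->Barrier();
Dw.CayleyReport();                // prints ComputeTime/Calls and mflops per rank
std::cout << GridLogMessage << "Meooe " << (t1 - t0) / ncall << " us per call" << std::endl;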
template<class Impl>
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
{

View File

@ -121,6 +121,18 @@ namespace Grid {
              RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
void CayleyReport(void);
void CayleyZeroCounters(void);
double M5Dflops;
double M5Dcalls;
double M5Dtime;
double MooeeInvFlops;
double MooeeInvCalls;
double MooeeInvTime;
protected:
  void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
  void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);

View File

@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
GridBase *grid=psi._grid; GridBase *grid=psi._grid;
assert(phi.checkerboard == psi.checkerboard); assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard; chi.checkerboard=psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
} }
} }
} }
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
assert(phi.checkerboard == psi.checkerboard); assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard; chi.checkerboard=psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0]; auto tmp = psi._odata[0];
@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
} }
} }
} }
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
chi.checkerboard=psi.checkerboard; chi.checkerboard=psi.checkerboard;
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0]; auto tmp = psi._odata[0];
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
// Apply (L^{\prime})^{-1} // Apply (L^{\prime})^{-1}
chi[ss]=psi[ss]; // chi[0]=psi[0] chi[ss]=psi[ss]; // chi[0]=psi[0]
for(int s=1;s<Ls;s++){ for(int s=1;s<Ls;s++){
@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
chi[ss+s] = chi[ss+s] - uee[s]*tmp; chi[ss+s] = chi[ss+s] - uee[s]*tmp;
} }
} }
MooeeInvTime+=usecond();
} }
template<class Impl> template<class Impl>
@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
assert(psi.checkerboard == psi.checkerboard); assert(psi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard; chi.checkerboard=psi.checkerboard;
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
chi[ss+s] = chi[ss+s] - lee[s]*tmp; chi[ss+s] = chi[ss+s] - lee[s]*tmp;
} }
} }
MooeeInvTime+=usecond();
} }
#ifdef CAYLEY_DPERP_CACHE #ifdef CAYLEY_DPERP_CACHE

View File

@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
GridBase *grid=psi._grid; GridBase *grid=psi._grid;
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd(); const int nsimd= Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs); Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs); Vector<iSinglet<Simd> > l(LLs);
@ -86,35 +86,138 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
d_p[ss] = diag[s]; d_p[ss] = diag[s];
}} }}
M5Dcalls++;
M5Dtime-=usecond();
assert(Nc==3);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
alignas(64) SiteHalfSpinor hp; for(int v=0;v<LLs;v++){
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0;v<LLs;v++){ int vp=(v+1)%LLs;
int vm=(v+LLs-1)%LLs;
int vp=(v+1)%LLs; spProj5m(hp,psi[ss+vp]);
int vm=(v+LLs-1)%LLs; spProj5p(hm,psi[ss+vm]);
spProj5m(hp,psi[ss+vp]); if ( vp<=v ) rotate(hp,hp,1);
spProj5p(hm,psi[ss+vm]); if ( vm>=v ) rotate(hm,hm,nsimd-1);
if ( vp<=v ) rotate(hp,hp,1); hp=0.5*hp;
if ( vm>=v ) rotate(hm,hm,nsimd-1); hm=0.5*hm;
hp=hp*0.5; spRecon5m(fp,hp);
hm=hm*0.5; spRecon5p(fm,hm);
spRecon5m(fp,hp);
spRecon5p(fm,hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; chi[ss+v] = d[v]*phi[ss+v];
chi[ss+v] = chi[ss+v] +l[v]*fm; chi[ss+v] = chi[ss+v] +u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
} }
#else
for(int v=0;v<LLs;v++){
vprefetch(psi[ss+v+LLs]);
// vprefetch(phi[ss+v+LLs]);
int vp= (v==LLs-1) ? 0 : v+1;
int vm= (v==0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
// if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
// if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
if ( vp<=v ) {
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if ( vm>=v ) {
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
/*
if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
*/
Simd p_00 = d[v]()()() * phi[ss+v]()(0)(0) + l[v]()()()*hm_00;
Simd p_01 = d[v]()()() * phi[ss+v]()(0)(1) + l[v]()()()*hm_01;
Simd p_02 = d[v]()()() * phi[ss+v]()(0)(2) + l[v]()()()*hm_02;
Simd p_10 = d[v]()()() * phi[ss+v]()(1)(0) + l[v]()()()*hm_10;
Simd p_11 = d[v]()()() * phi[ss+v]()(1)(1) + l[v]()()()*hm_11;
Simd p_12 = d[v]()()() * phi[ss+v]()(1)(2) + l[v]()()()*hm_12;
Simd p_20 = d[v]()()() * phi[ss+v]()(2)(0) + u[v]()()()*hp_00;
Simd p_21 = d[v]()()() * phi[ss+v]()(2)(1) + u[v]()()()*hp_01;
Simd p_22 = d[v]()()() * phi[ss+v]()(2)(2) + u[v]()()()*hp_02;
Simd p_30 = d[v]()()() * phi[ss+v]()(3)(0) + u[v]()()()*hp_10;
Simd p_31 = d[v]()()() * phi[ss+v]()(3)(1) + u[v]()()()*hp_11;
Simd p_32 = d[v]()()() * phi[ss+v]()(3)(2) + u[v]()()()*hp_12;
// if ( ss==0){
/*
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
}
*/
vstream(chi[ss+v]()(0)(0),p_00);
vstream(chi[ss+v]()(0)(1),p_01);
vstream(chi[ss+v]()(0)(2),p_02);
vstream(chi[ss+v]()(1)(0),p_10);
vstream(chi[ss+v]()(1)(1),p_11);
vstream(chi[ss+v]()(1)(2),p_12);
vstream(chi[ss+v]()(2)(0),p_20);
vstream(chi[ss+v]()(2)(1),p_21);
vstream(chi[ss+v]()(2)(2),p_22);
vstream(chi[ss+v]()(3)(0),p_30);
vstream(chi[ss+v]()(3)(1),p_31);
vstream(chi[ss+v]()(3)(2),p_32);
}
#endif
} }
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -154,6 +257,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
d_p[ss] = diag[s]; d_p[ss] = diag[s];
}} }}
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
@ -183,8 +288,8 @@ PARALLEL_FOR_LOOP
} }
} }
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
{ {
@ -250,13 +355,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
} }
} }
MooeeInvCalls++;
MooeeInvTime-=usecond();
// Dynamic allocate on stack to get per thread without serialised heap acces // Dynamic allocate on stack to get per thread without serialised heap acces
PARALLEL_FOR_LOOP #pragma omp parallel
for(auto site=0;site<vol;site++){ {
// SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
// SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
// SiteSpinor *SiteChi =(SiteSpinor *) alloca(LLs*sizeof(SiteSpinor));
Vector<SiteHalfSpinor> SitePplus(LLs); Vector<SiteHalfSpinor> SitePplus(LLs);
Vector<SiteHalfSpinor> SitePminus(LLs); Vector<SiteHalfSpinor> SitePminus(LLs);
@ -267,6 +370,9 @@ PARALLEL_FOR_LOOP
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM; SiteHalfSpinor BcastM;
#pragma omp for
for(auto site=0;site<vol;site++){
for(int s=0;s<LLs;s++){ for(int s=0;s<LLs;s++){
int lex = s+LLs*site; int lex = s+LLs*site;
spProj5p(SitePplus[s] ,psi[lex]); spProj5p(SitePplus[s] ,psi[lex]);
@ -294,6 +400,8 @@ PARALLEL_FOR_LOOP
chi[lex] = SiteChi[s]*0.5; chi[lex] = SiteChi[s]*0.5;
} }
} }
}
MooeeInvTime+=usecond();
} }
INSTANTIATE_DPERP(DomainWallVec5dImplD); INSTANTIATE_DPERP(DomainWallVec5dImplD);

View File

@ -194,6 +194,11 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
} }
if ( DerivCalls > 0 ) { if ( DerivCalls > 0 ) {
@ -209,12 +214,15 @@ void WilsonFermion5D<Impl>::Report(void)
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
}
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; }
if (DerivCalls > 0 || DhopCalls > 0){ if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report(); std::cout << GridLogMessage << "WilsonFermion5D Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report(); std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
} }
} }

View File

@ -167,7 +167,7 @@ namespace Optimization {
} }
//Integer //Integer
inline __m256i operator()(__m256i a, __m256i b){ inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA4) #if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1; __m128i a0,a1;
__m128i b0,b1; __m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0); a0 = _mm256_extractf128_si256(a,0);
@ -195,7 +195,7 @@ namespace Optimization {
} }
//Integer //Integer
inline __m256i operator()(__m256i a, __m256i b){ inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) || defined (AVXFMA4) #if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
__m128i a0,a1; __m128i a0,a1;
__m128i b0,b1; __m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0); a0 = _mm256_extractf128_si256(a,0);
@ -233,7 +233,7 @@ namespace Optimization {
a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -279,7 +279,7 @@ namespace Optimization {
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -320,7 +320,7 @@ namespace Optimization {
#if defined (AVXFMA4) #if defined (AVXFMA4)
a= _mm256_macc_ps(b,c,a); a= _mm256_macc_ps(b,c,a);
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_ps( b, c, a); a= _mm256_fmadd_ps( b, c, a);
#endif #endif
} }
@ -332,7 +332,7 @@ namespace Optimization {
#if defined (AVXFMA4) #if defined (AVXFMA4)
a= _mm256_macc_pd(b,c,a); a= _mm256_macc_pd(b,c,a);
#endif #endif
#if defined (AVX2) #if defined (AVX2) || defined (AVXFMA)
a= _mm256_fmadd_pd( b, c, a); a= _mm256_fmadd_pd( b, c, a);
#endif #endif
} }
@ -347,7 +347,7 @@ namespace Optimization {
} }
// Integer // Integer
inline __m256i operator()(__m256i a, __m256i b){ inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1) #if defined (AVX1) || defined (AVXFMA)
__m128i a0,a1; __m128i a0,a1;
__m128i b0,b1; __m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0); a0 = _mm256_extractf128_si256(a,0);

View File

@ -27,15 +27,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_knc.h
-  @brief Optimization libraries for AVX512 instructions set for KNC
-  Using intrinsics
-*/
-// Time-stamp: <2015-06-09 14:27:28 neo>
-//----------------------------------------------------------------------
#include <immintrin.h>
@ -95,13 +86,13 @@ namespace Optimization {
  struct Vstream{
    //Float
    inline void operator()(float * a, __m512 b){
-     //_mm512_stream_ps(a,b);
-     _mm512_store_ps(a,b);
+     _mm512_stream_ps(a,b);
+     // _mm512_store_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m512d b){
-     //_mm512_stream_pd(a,b);
-     _mm512_store_pd(a,b);
+     _mm512_stream_pd(a,b);
+     // _mm512_store_pd(a,b);
    }
  };

View File

@ -6,8 +6,7 @@
Copyright (C) 2015
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
+Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -27,133 +26,352 @@ Author: neo <cossu@post.kek.jp>
*************************************************************************************/
/* END LEGAL */
static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
//#define VECTOR_LOOPS
// playing with compiler pragmas
#ifdef VECTOR_LOOPS
#ifdef __clang__
#define VECTOR_FOR(i, w, inc)\
_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
for (unsigned int i = 0; i < w; i += inc)
#elif defined __INTEL_COMPILER
#define VECTOR_FOR(i, w, inc)\
_Pragma("simd vectorlength(w*8)")\
for (unsigned int i = 0; i < w; i += inc)
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
#else
#define VECTOR_FOR(i, w, inc)\
for (unsigned int i = 0; i < w; i += inc)
#endif
namespace Grid {
namespace Optimization {
template<class vtype> // type traits giving the number of elements for each vector type
union uconv { template <typename T> struct W;
float f; template <> struct W<double> {
vtype v; constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
};
template <> struct W<float> {
constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
}; };
union u128f { // SIMD vector types
float v; template <typename T>
float f[4]; struct vec {
}; alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
union u128d {
double v;
double f[2];
}; };
typedef vec<float> vecf;
typedef vec<double> vecd;
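A note on the width bookkeeping above: W<T>::r counts real elements and W<T>::c counts complex pairs in one generic vector of GEN_SIMD_WIDTH bytes, so with the default 32-byte width a vecd holds 4 doubles (2 complex) and a vecf holds 8 floats (4 complex). A minimal stand-alone sketch of the same arithmetic; the constant MY_WIDTH and the struct Width are illustrative stand-ins, not Grid identifiers:

#include <cstdio>

// Sketch: mirrors the W<T> traits above for an assumed 32-byte generic vector.
constexpr unsigned int MY_WIDTH = 32u;             // stand-in for GEN_SIMD_WIDTH

template <typename T> struct Width;
template <> struct Width<double> {
  static constexpr unsigned int c = MY_WIDTH/16u;  // complex doubles per vector
  static constexpr unsigned int r = MY_WIDTH/8u;   // real doubles per vector
};
template <> struct Width<float> {
  static constexpr unsigned int c = MY_WIDTH/8u;   // complex floats per vector
  static constexpr unsigned int r = MY_WIDTH/4u;   // real floats per vector
};

int main() {
  // For 32 bytes: doubles -> r=4, c=2; floats -> r=8, c=4.
  std::printf("double: r=%u c=%u\n", Width<double>::r, Width<double>::c);
  std::printf("float : r=%u c=%u\n", Width<float>::r,  Width<float>::c);
  return 0;
}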
struct Vsplat{ struct Vsplat{
//Complex float // Complex
inline u128f operator()(float a, float b){ template <typename T>
u128f out; inline vec<T> operator()(T a, T b){
out.f[0] = a; vec<T> out;
out.f[1] = b;
out.f[2] = a; VECTOR_FOR(i, W<T>::r, 2)
out.f[3] = b; {
out.v[i] = a;
out.v[i+1] = b;
}
return out; return out;
} }
// Real float
inline u128f operator()(float a){ // Real
u128f out; template <typename T>
out.f[0] = a; inline vec<T> operator()(T a){
out.f[1] = a; vec<T> out;
out.f[2] = a;
out.f[3] = a; VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a;
}
return out; return out;
} }
//Complex double
inline u128d operator()(double a, double b){ // Integer
u128d out;
out.f[0] = a;
out.f[1] = b;
return out;
}
//Real double
inline u128d operator()(double a){
u128d out;
out.f[0] = a;
out.f[1] = a;
return out;
}
//Integer
inline int operator()(Integer a){ inline int operator()(Integer a){
return a; return a;
} }
}; };
struct Vstore{ struct Vstore{
//Float // Real
inline void operator()(u128f a, float* F){ template <typename T>
memcpy(F,a.f,4*sizeof(float)); inline void operator()(vec<T> a, T *D){
} *((vec<T> *)D) = a;
//Double
inline void operator()(u128d a, double* D){
memcpy(D,a.f,2*sizeof(double));
} }
//Integer //Integer
inline void operator()(int a, Integer* I){ inline void operator()(int a, Integer *I){
I[0] = a; *I = a;
} }
}; };
struct Vstream{ struct Vstream{
//Float // Real
inline void operator()(float * a, u128f b){ template <typename T>
memcpy(a,b.f,4*sizeof(float)); inline void operator()(T * a, vec<T> b){
*((vec<T> *)a) = b;
} }
//Double
inline void operator()(double * a, u128d b){
memcpy(a,b.f,2*sizeof(double));
}
}; };
struct Vset{ struct Vset{
// Complex float // Complex
inline u128f operator()(Grid::ComplexF *a){ template <typename T>
u128f out; inline vec<T> operator()(std::complex<T> *a){
out.f[0] = a[0].real(); vec<T> out;
out.f[1] = a[0].imag();
out.f[2] = a[1].real(); VECTOR_FOR(i, W<T>::c, 1)
out.f[3] = a[1].imag(); {
out.v[2*i] = a[i].real();
out.v[2*i+1] = a[i].imag();
}
return out; return out;
} }
// Complex double
inline u128d operator()(Grid::ComplexD *a){ // Real
u128d out; template <typename T>
out.f[0] = a[0].real(); inline vec<T> operator()(T *a){
out.f[1] = a[0].imag(); vec<T> out;
return out;
} out = *((vec<T> *)a);
// Real float
inline u128f operator()(float *a){
u128f out;
out.f[0] = a[0];
out.f[1] = a[1];
out.f[2] = a[2];
out.f[3] = a[3];
return out;
}
// Real double
inline u128d operator()(double *a){
u128d out;
out.f[0] = a[0];
out.f[1] = a[1];
return out; return out;
} }
// Integer // Integer
inline int operator()(Integer *a){ inline int operator()(Integer *a){
return a[0]; return *a;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
// Complex/Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i] + b.v[i];
}
return out;
} }
//I nteger
inline int operator()(int a, int b){
return a + b;
}
}; };
struct Sub{
// Complex/Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i] - b.v[i];
}
return out;
}
//Integer
inline int operator()(int a, int b){
return a-b;
}
};
struct Mult{
// Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i]*b.v[i];
}
return out;
}
// Integer
inline int operator()(int a, int b){
return a*b;
}
};
#define cmul(a, b, c, i)\
c[i] = a[i]*b[i] - a[i+1]*b[i+1];\
c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
struct MultComplex{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
cmul(a.v, b.v, out.v, 2*i);
}
return out;
}
};
#undef cmul
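The cmul macro above is the schoolbook complex product on (re,im)-interleaved storage. A tiny stand-alone check of the same index pattern on one complex pair; cmul_pair is an illustrative helper, not part of Grid:

#include <cstdio>

// Sketch: the same interleaved complex product as the cmul macro,
// applied to a single pair stored as { re, im }.
static void cmul_pair(const double *a, const double *b, double *c, int i) {
  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];  // real part
  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];    // imaginary part
}

int main() {
  double a[2] = {1.0, 2.0};   // 1 + 2i
  double b[2] = {3.0, 4.0};   // 3 + 4i
  double c[2];
  cmul_pair(a, b, c, 0);
  std::printf("(1+2i)(3+4i) = %g + %gi\n", c[0], c[1]);  // expect -5 + 10i
  return 0;
}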
struct Div{
// Real
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::r, 1)
{
out.v[i] = a.v[i]/b.v[i];
}
return out;
}
};
#define conj(a, b, i)\
b[i] = a[i];\
b[i+1] = -a[i+1];
struct Conj{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
conj(a.v, out.v, 2*i);
}
return out;
}
};
#undef conj
#define timesmi(a, b, i)\
b[i] = a[i+1];\
b[i+1] = -a[i];
struct TimesMinusI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
timesmi(a.v, out.v, 2*i);
}
return out;
}
};
#undef timesmi
#define timesi(a, b, i)\
b[i] = -a[i+1];\
b[i+1] = a[i];
struct TimesI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
{
timesi(a.v, out.v, 2*i);
}
return out;
}
};
#undef timesi
//////////////////////////////////////////////
// Some Template specialization
#define perm(a, b, n, w)\
unsigned int _mask = w >> (n + 1);\
VECTOR_FOR(i, w, 1)\
{\
b[i] = a[i^_mask];\
}
#define DECL_PERMUTE_N(n)\
template <typename T>\
static inline vec<T> Permute##n(vec<T> in) {\
vec<T> out;\
perm(in.v, out.v, n, W<T>::r);\
return out;\
}
struct Permute{
DECL_PERMUTE_N(0);
DECL_PERMUTE_N(1);
DECL_PERMUTE_N(2);
DECL_PERMUTE_N(3);
};
#undef perm
#undef DECL_PERMUTE_N
#define rot(a, b, n, w)\
VECTOR_FOR(i, w, 1)\
{\
b[i] = a[(i + n)%w];\
}
struct Rotate{
template <typename T>
static inline vec<T> rotate(vec<T> in, int n){
vec<T> out;
rot(in.v, out.v, n, W<T>::r);
return out;
}
};
#undef rot
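The permute masks above mirror Grid's lane-exchange pattern: Permute<n> swaps lanes with stride w >> (n + 1), and Rotate::rotate(in, n) is a cyclic left shift by n lanes. A small stand-alone sketch of just the index arithmetic, assuming an 8-element single-precision vector (the 32-byte default width):

#include <cstdio>

// Sketch: reproduces the index arithmetic of the perm/rot macros above
// for an assumed 8-element generic vector.
int main() {
  const unsigned int w = 8;                 // W<float>::r for a 32-byte vector
  // Permute0 exchanges the two half-vectors (mask = 4), Permute1 the quarters
  // (mask = 2), Permute2 neighbouring lanes (mask = 1).
  for (unsigned int n = 0; n < 3; n++) {
    unsigned int mask = w >> (n + 1);
    std::printf("Permute%u:", n);
    for (unsigned int i = 0; i < w; i++) std::printf(" %u<-%u", i, i ^ mask);
    std::printf("\n");
  }
  // rotate(in, n) is a cyclic left shift: lane i receives lane (i + n) % w.
  unsigned int n = 3;
  std::printf("Rotate%u :", n);
  for (unsigned int i = 0; i < w; i++) std::printf(" %u<-%u", i, (i + n) % w);
  std::printf("\n");
  return 0;
}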
#define acc(v, a, off, step, n)\
for (unsigned int i = off; i < n; i += step)\
{\
a += v[i];\
}
template <typename Out_type, typename In_type>
struct Reduce{
  //Need templated class to overload output type
@ -165,315 +383,66 @@ namespace Optimization {
  }
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0] + b.f[0];
out.f[1] = a.f[1] + b.f[1];
out.f[2] = a.f[2] + b.f[2];
out.f[3] = a.f[3] + b.f[3];
return out;
}
//Complex/Real double
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0] + b.f[0];
out.f[1] = a.f[1] + b.f[1];
return out;
}
//Integer
inline int operator()(int a, int b){
return a + b;
}
};
struct Sub{
//Complex/Real float
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0] - b.f[0];
out.f[1] = a.f[1] - b.f[1];
out.f[2] = a.f[2] - b.f[2];
out.f[3] = a.f[3] - b.f[3];
return out;
}
//Complex/Real double
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0] - b.f[0];
out.f[1] = a.f[1] - b.f[1];
return out;
}
//Integer
inline int operator()(int a, int b){
return a-b;
}
};
struct MultComplex{
// Complex float
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
return out;
}
// Complex double
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
return out;
}
};
struct Mult{
//CK: Appear unneeded
// inline float mac(float a, float b,double c){
// return 0;
// }
// inline double mac(double a, double b,double c){
// return 0;
// }
// Real float
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0]*b.f[0];
out.f[1] = a.f[1]*b.f[1];
out.f[2] = a.f[2]*b.f[2];
out.f[3] = a.f[3]*b.f[3];
return out;
}
// Real double
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0]*b.f[0];
out.f[1] = a.f[1]*b.f[1];
return out;
}
// Integer
inline int operator()(int a, int b){
return a*b;
}
};
struct Conj{
// Complex single
inline u128f operator()(u128f in){
u128f out;
out.f[0] = in.f[0];
out.f[1] = -in.f[1];
out.f[2] = in.f[2];
out.f[3] = -in.f[3];
return out;
}
// Complex double
inline u128d operator()(u128d in){
u128d out;
out.f[0] = in.f[0];
out.f[1] = -in.f[1];
return out;
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
u128f out;
out.f[0] = in.f[1];
out.f[1] = -in.f[0];
out.f[2] = in.f[3];
out.f[3] = -in.f[2];
return out;
}
//Complex double
inline u128d operator()(u128d in, u128d ret){
u128d out;
out.f[0] = in.f[1];
out.f[1] = -in.f[0];
return out;
}
};
struct TimesI{
//Complex single
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
u128f out;
out.f[0] = -in.f[1];
out.f[1] = in.f[0];
out.f[2] = -in.f[3];
out.f[3] = in.f[2];
return out;
}
//Complex double
inline u128d operator()(u128d in, u128d ret){
u128d out;
out.f[0] = -in.f[1];
out.f[1] = in.f[0];
return out;
}
};
//////////////////////////////////////////////
// Some Template specialization
struct Permute{
//We just have to mirror the permutes of Grid_sse4.h
static inline u128f Permute0(u128f in){ //AB CD -> CD AB
u128f out;
out.f[0] = in.f[2];
out.f[1] = in.f[3];
out.f[2] = in.f[0];
out.f[3] = in.f[1];
return out;
};
static inline u128f Permute1(u128f in){ //AB CD -> BA DC
u128f out;
out.f[0] = in.f[1];
out.f[1] = in.f[0];
out.f[2] = in.f[3];
out.f[3] = in.f[2];
return out;
};
static inline u128f Permute2(u128f in){
return in;
};
static inline u128f Permute3(u128f in){
return in;
};
static inline u128d Permute0(u128d in){ //AB -> BA
u128d out;
out.f[0] = in.f[1];
out.f[1] = in.f[0];
return out;
};
static inline u128d Permute1(u128d in){
return in;
};
static inline u128d Permute2(u128d in){
return in;
};
static inline u128d Permute3(u128d in){
return in;
};
};
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
};
struct Rotate{
static inline u128f rotate(u128f in,int n){
u128f out;
switch(n){
case 0:
out.f[0] = in.f[0];
out.f[1] = in.f[1];
out.f[2] = in.f[2];
out.f[3] = in.f[3];
break;
case 1:
out.f[0] = in.f[1];
out.f[1] = in.f[2];
out.f[2] = in.f[3];
out.f[3] = in.f[0];
break;
case 2:
out.f[0] = in.f[2];
out.f[1] = in.f[3];
out.f[2] = in.f[0];
out.f[3] = in.f[1];
break;
case 3:
out.f[0] = in.f[3];
out.f[1] = in.f[0];
out.f[2] = in.f[1];
out.f[3] = in.f[2];
break;
default: assert(0);
}
return out;
}
static inline u128d rotate(u128d in,int n){
u128d out;
switch(n){
case 0:
out.f[0] = in.f[0];
out.f[1] = in.f[1];
break;
case 1:
out.f[0] = in.f[1];
out.f[1] = in.f[0];
break;
default: assert(0);
}
return out;
}
};
//Complex float Reduce //Complex float Reduce
template<> template <>
inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]); float a = 0.f, b = 0.f;
acc(in.v, a, 0, 2, W<float>::r);
acc(in.v, b, 1, 2, W<float>::r);
return Grid::ComplexF(a, b);
} }
//Real float Reduce //Real float Reduce
template<> template<>
inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
return in.f[0] + in.f[1] + in.f[2] + in.f[3]; float a = 0.;
}
acc(in.v, a, 0, 1, W<float>::r);
return a;
}
//Complex double Reduce //Complex double Reduce
template<> template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
return Grid::ComplexD(in.f[0],in.f[1]); double a = 0., b = 0.;
acc(in.v, a, 0, 2, W<double>::r);
acc(in.v, b, 1, 2, W<double>::r);
return Grid::ComplexD(a, b);
} }
//Real double Reduce //Real double Reduce
template<> template<>
inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
return in.f[0] + in.f[1]; double a = 0.f;
acc(in.v, a, 0, 1, W<double>::r);
return a;
} }
//Integer Reduce //Integer Reduce
template<> template<>
inline Integer Reduce<Integer, int>::operator()(int in){ inline Integer Reduce<Integer, int>::operator()(int in){
// FIXME unimplemented return in;
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
} }
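The rewritten Reduce specialisations use the acc macro to sum every stride-th lane, so the complex reductions accumulate the even lanes into the real part and the odd lanes into the imaginary part. A scalar sketch of that pattern (standalone, with an illustrative width of 8 interleaved floats):

```cpp
// Scalar illustration of the strided accumulation behind the complex Reduce.
// Standalone sketch, not Grid code; the 8-float width is an arbitrary example.
#include <array>
#include <complex>
#include <iostream>

int main() {
  const std::array<float, 8> v{1, 2, 3, 4, 5, 6, 7, 8};  // (re, im) interleaved

  float re = 0.f, im = 0.f;
  for (unsigned i = 0; i < v.size(); i += 2) re += v[i];  // acc(v, re, 0, 2, W)
  for (unsigned i = 1; i < v.size(); i += 2) im += v[i];  // acc(v, im, 1, 2, W)

  const std::complex<float> sum(re, im);
  std::cout << "reduced value = (" << sum.real() << ", " << sum.imag() << ")\n";
  // Expected: (1+3+5+7, 2+4+6+8) = (16, 20)
  return 0;
}
```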
} }
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
// Here assign types // Here assign types
typedef Optimization::u128f SIMD_Ftype; // Single precision type typedef Optimization::vecf SIMD_Ftype; // Single precision type
typedef Optimization::u128d SIMD_Dtype; // Double precision type typedef Optimization::vecd SIMD_Dtype; // Double precision type
typedef int SIMD_Itype; // Integer type typedef int SIMD_Itype; // Integer type
// prefetch utilities // prefetch utilities
inline void v_prefetch0(int size, const char *ptr){}; inline void v_prefetch0(int size, const char *ptr){};
inline void prefetch_HINT_T0(const char *ptr){}; inline void prefetch_HINT_T0(const char *ptr){};
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
// Function name aliases // Function name aliases
typedef Optimization::Vsplat VsplatSIMD; typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD; typedef Optimization::Vstore VstoreSIMD;
@ -481,16 +450,13 @@ namespace Optimization {
typedef Optimization::Vstream VstreamSIMD; typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>; template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations // Arithmetic operations
typedef Optimization::Sum SumSIMD; typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD; typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD; typedef Optimization::TimesI TimesISIMD;
} }

View File

@ -26,14 +26,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_knc.h
@brief Optimization libraries for AVX512 instructions set for KNC
Using intrinsics
*/
// Time-stamp: <2015-06-09 14:27:28 neo>
//----------------------------------------------------------------------
#include <immintrin.h> #include <immintrin.h>
#include <zmmintrin.h> #include <zmmintrin.h>

View File

@ -245,6 +245,21 @@ namespace Optimization {
} }
}; };
struct Div{
// Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_swdiv(a, b);
}
// Real float
FLOAT_WRAP_2(operator(), inline)
// Integer
inline int operator()(int a, int b){
return a/b;
}
};
struct Conj{ struct Conj{
// Complex double // Complex double
inline vector4double operator()(vector4double v){ inline vector4double operator()(vector4double v){
@ -413,6 +428,7 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
typedef Optimization::Sum SumSIMD; typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD; typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD; typedef Optimization::Mult MultSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::MultComplex MultComplexSIMD; typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD; typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesMinusI TimesMinusISIMD;

View File

@ -38,13 +38,13 @@ directory
#ifndef GRID_VECTOR_TYPES #ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES
#ifdef GENERIC_VEC #ifdef GEN
#include "Grid_generic.h" #include "Grid_generic.h"
#endif #endif
#ifdef SSE4 #ifdef SSE4
#include "Grid_sse4.h" #include "Grid_sse4.h"
#endif #endif
#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4) #if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
#include "Grid_avx.h" #include "Grid_avx.h"
#endif #endif
#if defined AVX512 #if defined AVX512
@ -130,7 +130,7 @@ class Grid_simd {
Vector_type v; Vector_type v;
static inline int Nsimd(void) { static inline constexpr int Nsimd(void) {
return sizeof(Vector_type) / sizeof(Scalar_type); return sizeof(Vector_type) / sizeof(Scalar_type);
} }
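Declaring Nsimd() constexpr, as in the change above, lets the lane count be used where a compile-time constant is required, for example as an array bound or a template argument. A minimal sketch of the idea with a stand-in vector type (the names here are illustrative, not Grid's actual classes):

```cpp
// Why a constexpr Nsimd() is useful: it can size arrays and feed templates
// at compile time. Toy stand-in types, not Grid's actual classes.
#include <array>
#include <iostream>

template <typename Scalar_type, typename Vector_type>
struct toy_simd {
  Vector_type v;
  static constexpr int Nsimd() {
    return sizeof(Vector_type) / sizeof(Scalar_type);
  }
};

int main() {
  using vec4d = toy_simd<double, double[4]>;          // pretend 256-bit vector
  std::array<double, vec4d::Nsimd()> lanes{};         // compile-time bound
  static_assert(vec4d::Nsimd() == 4, "four double lanes expected");
  std::cout << "Nsimd = " << vec4d::Nsimd()
            << ", buffer size = " << lanes.size() << '\n';
  return 0;
}
```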

View File

@ -1,54 +0,0 @@
dnl Check for doxygen to create API docs
dnl
AC_DEFUN([AC_PROG_DOXYGEN],
[
AC_ARG_ENABLE(doxygen,
AS_HELP_STRING([--enable-doxygen],[enable documentation generation with doxygen (auto)]))
AC_ARG_ENABLE(dot,
AS_HELP_STRING([--enable-dot],[use 'dot' to generate graphs in doxygen (auto)]))
AC_ARG_ENABLE(html-docs,
AS_HELP_STRING([--enable-html-docs],[enable HTML generation with doxygen (yes)]),
[],[ enable_html_docs=yes])
AC_ARG_ENABLE(latex-docs,
AS_HELP_STRING([--enable-latex-docs],
[enable LaTeX documentation generation with doxygen (no)]),[],[enable_latex_docs=no])
if test "x$enable_doxygen" = xno; then
enable_doc=no
else
AC_CHECK_PROG(DOXYGEN, doxygen, doxygen)
if test x$DOXYGEN = x; then
if test "x$enable_doxygen" = xyes; then
AC_MSG_ERROR([could not find doxygen])
fi
enable_doc=no
else
doxy_ver=`doxygen --version`
doxy_major=`expr "$doxy_ver" : '\(@<:@0-9@:>@\)\..*'`
doxy_minor=`expr "$doxy_ver" : '@<:@0-9@:>@\.\(@<:@0-9@:>@\).*'`
if test $doxy_major -eq "1" -a $doxy_minor -ge "3" ; then
enable_doc=yes
AC_CHECK_PROG(DOT, dot, dot)
else
AC_MSG_WARN([doxygen version $doxy_ver too old, doxygen will not be used.])
enable_doc=no
fi
fi
fi
AM_CONDITIONAL(DOXYGEN_DOC, test x$enable_doc = xyes)
if test x$DOT = x; then
if test "x$enable_dot" = xyes; then
AC_MSG_ERROR([could not find dot])
fi
enable_dot=no
else
enable_dot=yes
fi
AC_SUBST(enable_dot)
AC_SUBST(enable_html_docs)
AC_SUBST(enable_latex_docs)
])

View File

@ -50,6 +50,12 @@ public:
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;} template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
std::string name(void) const { return std::string("Times"); } std::string name(void) const { return std::string("Times"); }
}; };
class funcDivide {
public:
funcDivide() {};
template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;}
std::string name(void) const { return std::string("Divide"); }
};
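funcDivide above follows the same pattern as the other func* functors: operator() writes i1/i2 into rr for whatever scalar or vector type is passed in. A standalone sketch of how such a functor can drive an element-wise consistency check between a scalar and a vectorised evaluation (illustrative toy types, not Grid's actual Tester):

```cpp
// Minimal functor-driven check in the style of the func* classes above.
// Not Grid's Tester; a small struct of four doubles stands in for a SIMD type.
#include <array>
#include <cassert>
#include <iostream>
#include <string>

class funcDivideToy {
public:
  template <class vec> void operator()(vec& rr, vec& i1, vec& i2) const { rr = i1 / i2; }
  std::string name() const { return "Divide"; }
};

// "Vector" stand-in with element-wise division.
struct quad {
  std::array<double, 4> v;
  friend quad operator/(quad a, quad b) {
    quad r{};
    for (int i = 0; i < 4; ++i) r.v[i] = a.v[i] / b.v[i];
    return r;
  }
};

template <class Functor>
void toy_test(Functor f) {
  quad a{{1, 2, 3, 4}}, b{{4, 4, 2, 2}}, r{};
  f(r, a, b);                                  // vectorised path
  for (int i = 0; i < 4; ++i) {
    double x = a.v[i], y = b.v[i], s;          // scalar reference path
    f(s, x, y);                                // reuse the same functor on scalars
    assert(r.v[i] == s);
  }
  std::cout << f.name() << " check passed" << std::endl;
}

int main() {
  toy_test(funcDivideToy());
  return 0;
}
```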
class funcConj { class funcConj {
public: public:
funcConj() {}; funcConj() {};
@ -341,6 +347,7 @@ int main (int argc, char ** argv)
Tester<RealF,vRealF>(funcPlus()); Tester<RealF,vRealF>(funcPlus());
Tester<RealF,vRealF>(funcMinus()); Tester<RealF,vRealF>(funcMinus());
Tester<RealF,vRealF>(funcTimes()); Tester<RealF,vRealF>(funcTimes());
Tester<RealF,vRealF>(funcDivide());
Tester<RealF,vRealF>(funcAdj()); Tester<RealF,vRealF>(funcAdj());
Tester<RealF,vRealF>(funcConj()); Tester<RealF,vRealF>(funcConj());
Tester<RealF,vRealF>(funcInnerProduct()); Tester<RealF,vRealF>(funcInnerProduct());
@ -371,6 +378,7 @@ int main (int argc, char ** argv)
Tester<RealD,vRealD>(funcPlus()); Tester<RealD,vRealD>(funcPlus());
Tester<RealD,vRealD>(funcMinus()); Tester<RealD,vRealD>(funcMinus());
Tester<RealD,vRealD>(funcTimes()); Tester<RealD,vRealD>(funcTimes());
Tester<RealD,vRealD>(funcDivide());
Tester<RealD,vRealD>(funcAdj()); Tester<RealD,vRealD>(funcAdj());
Tester<RealD,vRealD>(funcConj()); Tester<RealD,vRealD>(funcConj());
Tester<RealD,vRealD>(funcInnerProduct()); Tester<RealD,vRealD>(funcInnerProduct());

View File

@ -68,7 +68,7 @@ int main (int argc, char ** argv)
for(int mu=0;mu<4;mu++){ for(int mu=0;mu<4;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu); LatticeCoordinate(coor,mu);
C = C - (TwoPiL * p[mu]) * coor; C = C + (TwoPiL * p[mu]) * coor;
} }
C = exp(C*ci); C = exp(C*ci);
@ -78,10 +78,11 @@ int main (int argc, char ** argv)
FFT theFFT(&Fine); FFT theFFT(&Fine);
theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl; Ctilde = C;
theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Ctilde,C,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl; theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
// C=zero; // C=zero;
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde); // Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@ -93,10 +94,11 @@ int main (int argc, char ** argv)
C=C-Ctilde; C=C-Ctilde;
std::cout << "diff scalar "<<norm2(C) << std::endl; std::cout << "diff scalar "<<norm2(C) << std::endl;
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl; Stilde = S;
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl; theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl; theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl; theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
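With the transforms now applied in place, the four FFT_dim calls compose into the full four-dimensional FFT without copying back through C and S. The reason successive one-dimensional transforms over every direction give the multidimensional result is the separability of the DFT; a standalone check with a naive DFT on a small two-dimensional grid (plain C++, independent of Grid, arbitrary sizes):

```cpp
// Standalone check that a multidimensional DFT factorises into successive
// one-dimensional transforms, one direction at a time. Naive O(N^4) DFT on a
// small 2-d grid; sizes and data are arbitrary. Not Grid code.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

using cplx = std::complex<double>;
static const double pi = std::acos(-1.0);

int main() {
  const int Nx = 4, Ny = 6;
  std::vector<cplx> f(Nx * Ny);
  for (int x = 0; x < Nx; ++x)
    for (int y = 0; y < Ny; ++y)
      f[x * Ny + y] = cplx(std::sin(0.3 * x + 0.1 * y), std::cos(0.7 * y - 0.2 * x));

  auto w = [](int k, int n, int N) { return std::exp(cplx(0, -2.0 * pi * k * n / N)); };

  // Direct 2-d DFT.
  std::vector<cplx> F2(Nx * Ny, 0);
  for (int kx = 0; kx < Nx; ++kx)
    for (int ky = 0; ky < Ny; ++ky)
      for (int x = 0; x < Nx; ++x)
        for (int y = 0; y < Ny; ++y)
          F2[kx * Ny + ky] += f[x * Ny + y] * w(kx, x, Nx) * w(ky, y, Ny);

  // Chained 1-d DFTs: transform direction x first, then direction y.
  std::vector<cplx> g = f, tmp(Nx * Ny, 0);
  for (int kx = 0; kx < Nx; ++kx)
    for (int y = 0; y < Ny; ++y)
      for (int x = 0; x < Nx; ++x) tmp[kx * Ny + y] += g[x * Ny + y] * w(kx, x, Nx);
  g = tmp;
  std::fill(tmp.begin(), tmp.end(), cplx(0));
  for (int x = 0; x < Nx; ++x)
    for (int ky = 0; ky < Ny; ++ky)
      for (int y = 0; y < Ny; ++y) tmp[x * Ny + ky] += g[x * Ny + y] * w(ky, y, Ny);
  g = tmp;

  double diff = 0;
  for (int i = 0; i < Nx * Ny; ++i) diff += std::abs(g[i] - F2[i]);
  std::cout << "total deviation between direct and chained DFT: " << diff << std::endl;
  assert(diff < 1e-9);
  return 0;
}
```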
SpinMatrixF Sp; SpinMatrixF Sp;
Sp = zero; Sp = Sp+cVol; Sp = zero; Sp = Sp+cVol;