From a6e1ea216d5d92e75a6f3c3b2ac4b291ae073a5d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 19 May 2015 13:57:35 +0100 Subject: [PATCH] Got unpreconditioned conjugate gradient to run and converge on a random (uniform random, not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop. --- Makefile.in | 44 +++-- aclocal.m4 | 61 +++---- benchmarks/Grid_wilson.cc | 35 +++- benchmarks/Grid_wilson_cg_unprec.cc | 56 +++++++ benchmarks/Makefile.am | 6 +- config.guess | 2 +- config.sub | 2 +- configure | 13 +- lib/Grid_simd.h | 20 ++- lib/algorithms/LinearOperator.h | 13 +- lib/algorithms/iterative/ConjugateGradient.h | 16 +- lib/lattice/Grid_lattice_ET.h | 4 +- lib/lattice/Grid_lattice_arith.h | 40 ++++- lib/lattice/Grid_lattice_base.h | 2 +- lib/lattice/Grid_lattice_reality.h | 4 +- lib/math/Grid_math_reality.h | 12 +- lib/qcd/Grid_qcd_wilson_dop.cc | 160 ++++++++++--------- lib/qcd/Grid_qcd_wilson_dop.h | 5 +- lib/simd/Grid_vComplexD.h | 26 +-- lib/simd/Grid_vComplexF.h | 66 +++----- lib/simd/Grid_vInteger.h | 2 +- lib/simd/Grid_vRealD.h | 55 ++----- lib/simd/Grid_vRealF.h | 49 +++--- lib/simd/Grid_vector_types.h | 6 +- scripts/build-all | 4 +- scripts/configure-all | 2 +- scripts/configure-commands | 11 +- tests/Grid_main.cc | 8 +- tests/Grid_rng_fixed.cc | 52 ++++++ tests/Grid_simd.cc | 93 ++++++++++- tests/Grid_stencil.cc | 6 +- tests/Makefile.am | 5 +- tests/test_Grid_jacobi.cc | 2 +- 33 files changed, 566 insertions(+), 316 deletions(-) create mode 100644 benchmarks/Grid_wilson_cg_unprec.cc create mode 100644 tests/Grid_rng_fixed.cc diff --git a/Makefile.in b/Makefile.in index b6894ef6..b9fba168 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.14.1 from Makefile.am. +# Makefile.in generated by automake 1.15 from Makefile.am. # @configure_input@ -# Copyright (C) 1994-2013 Free Software Foundation, Inc. +# Copyright (C) 1994-2014 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -14,7 +14,17 @@ @SET_MAKE@ VPATH = @srcdir@ -am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} am__make_running_with_option = \ case $${target_option-} in \ ?) ;; \ @@ -79,14 +89,12 @@ build_triplet = @build@ host_triplet = @host@ target_triplet = @target@ subdir = . -DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \ - $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ - $(top_srcdir)/configure $(am__configure_deps) COPYING TODO \ - compile config.guess config.sub depcomp install-sh missing ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \ + $(am__configure_deps) $(am__DIST_COMMON) am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ configure.lineno config.status.lineno mkinstalldirs = $(install_sh) -d @@ -149,6 +157,9 @@ ETAGS = etags CTAGS = ctags CSCOPE = cscope DIST_SUBDIRS = $(SUBDIRS) +am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \ + INSTALL NEWS README TODO compile config.guess config.sub \ + depcomp install-sh missing DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) @@ -314,7 +325,6 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu Makefile -.PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ *config.status*) \ @@ -521,15 +531,15 @@ dist-xz: distdir $(am__post_remove_distdir) dist-tarZ: distdir - @echo WARNING: "Support for shar distribution archives is" \ - "deprecated." >&2 + @echo WARNING: "Support for distribution archives compressed with" \ + "legacy program 'compress' is deprecated." >&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z $(am__post_remove_distdir) dist-shar: distdir - @echo WARNING: "Support for distribution archives compressed with" \ - "legacy program 'compress' is deprecated." >&2 + @echo WARNING: "Support for shar distribution archives is" \ + "deprecated." >&2 @echo WARNING: "It will be removed altogether in Automake 2.0" >&2 shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz $(am__post_remove_distdir) @@ -565,17 +575,17 @@ distcheck: dist esac chmod -R a-w $(distdir) chmod u+w $(distdir) - mkdir $(distdir)/_build $(distdir)/_inst + mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst chmod a-w $(distdir) test -d $(distdir)/_build || exit 0; \ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ && am__cwd=`pwd` \ - && $(am__cd) $(distdir)/_build \ - && ../configure \ + && $(am__cd) $(distdir)/_build/sub \ + && ../../configure \ $(AM_DISTCHECK_CONFIGURE_FLAGS) \ $(DISTCHECK_CONFIGURE_FLAGS) \ - --srcdir=.. --prefix="$$dc_install_base" \ + --srcdir=../.. --prefix="$$dc_install_base" \ && $(MAKE) $(AM_MAKEFLAGS) \ && $(MAKE) $(AM_MAKEFLAGS) dvi \ && $(MAKE) $(AM_MAKEFLAGS) check \ @@ -749,6 +759,8 @@ uninstall-am: maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ pdf-am ps ps-am tags tags-am uninstall uninstall-am +.PRECIOUS: Makefile + # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/aclocal.m4 b/aclocal.m4 index 389763bf..bf79d078 100644 --- a/aclocal.m4 +++ b/aclocal.m4 @@ -1,6 +1,6 @@ -# generated automatically by aclocal 1.14.1 -*- Autoconf -*- +# generated automatically by aclocal 1.15 -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to. If you have problems, you may need to regenerate the build system entirely. To do so, use the procedure documented by the package, typically 'autoreconf'.])]) -# Copyright (C) 2002-2013 Free Software Foundation, Inc. +# Copyright (C) 2002-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.]) # generated from the m4 files accompanying Automake X.Y. # (This private macro should not be called outside this file.) AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.14' +[am__api_version='1.15' dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.14.1], [], +m4_if([$1], [1.15], [], [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl ]) @@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], []) # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. # This function is AC_REQUIREd by AM_INIT_AUTOMAKE. AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.14.1])dnl +[AM_AUTOMAKE_VERSION([1.15])dnl m4_ifndef([AC_AUTOCONF_VERSION], [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # AM_AUX_DIR_EXPAND -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) # configured tree to be moved without reconfiguration. AC_DEFUN([AM_AUX_DIR_EXPAND], -[dnl Rely on autoconf to set up CDPATH properly. -AC_PREREQ([2.50])dnl -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` +[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl +# Expand $ac_aux_dir to an absolute path. +am_aux_dir=`cd "$ac_aux_dir" && pwd` ]) # AM_CONDITIONAL -*- Autoconf -*- -# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# Copyright (C) 1997-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE( Usually this means the macro was only invoked conditionally.]]) fi])]) -# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Copyright (C) 1999-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl # Generate code to set up dependency tracking. -*- Autoconf -*- -# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Copyright (C) 1999-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], # Do all the work for Automake. -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl # # AC_SUBST([mkdir_p], ['$(MKDIR_P)']) -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. +# We need awk for the "check" target (and possibly the TAP driver). The +# system "awk" is bad on some platforms. AC_REQUIRE([AC_PROG_AWK])dnl AC_REQUIRE([AC_PROG_MAKE_SET])dnl AC_REQUIRE([AM_SET_LEADING_DOT])dnl @@ -573,7 +572,11 @@ to "yes", and re-run configure. END AC_MSG_ERROR([Your 'rm' program is bad, sorry.]) fi -fi]) +fi +dnl The trailing newline in this macro's definition is deliberate, for +dnl backward compatibility and to allow trailing 'dnl'-style comments +dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841. +]) dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further @@ -602,7 +605,7 @@ for _am_header in $config_headers :; do done echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co # Define $install_sh. AC_DEFUN([AM_PROG_INSTALL_SH], [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -if test x"${install_sh}" != xset; then +if test x"${install_sh+set}" != xset; then case $am_aux_dir in *\ * | *\ *) install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; @@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then fi AC_SUBST([install_sh])]) -# Copyright (C) 2003-2013 Free Software Foundation, Inc. +# Copyright (C) 2003-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])]) # Check to see how 'make' treats includes. -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -694,7 +697,7 @@ rm -f confinc confmf # Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- -# Copyright (C) 1997-2013 Free Software Foundation, Inc. +# Copyright (C) 1997-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -733,7 +736,7 @@ fi # Helper functions for option handling. -*- Autoconf -*- -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -764,7 +767,7 @@ AC_DEFUN([_AM_IF_OPTION], # Check to make sure that the build environment is sane. -*- Autoconf -*- -# Copyright (C) 1996-2013 Free Software Foundation, Inc. +# Copyright (C) 1996-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -845,7 +848,7 @@ AC_CONFIG_COMMANDS_PRE( rm -f conftest.file ]) -# Copyright (C) 2009-2013 Free Software Foundation, Inc. +# Copyright (C) 2009-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -905,7 +908,7 @@ AC_SUBST([AM_BACKSLASH])dnl _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl ]) -# Copyright (C) 2001-2013 Free Software Foundation, Inc. +# Copyright (C) 2001-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -933,7 +936,7 @@ fi INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) -# Copyright (C) 2006-2013 Free Software Foundation, Inc. +# Copyright (C) 2006-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -952,7 +955,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) # Check how to create a tarball. -*- Autoconf -*- -# Copyright (C) 2004-2013 Free Software Foundation, Inc. +# Copyright (C) 2004-2014 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, diff --git a/benchmarks/Grid_wilson.cc b/benchmarks/Grid_wilson.cc index e49d4515..69f5b48f 100644 --- a/benchmarks/Grid_wilson.cc +++ b/benchmarks/Grid_wilson.cc @@ -46,6 +46,13 @@ int main (int argc, char ** argv) volume=volume*latt_size[mu]; } + // Only one non-zero (y) + Umu=zero; + for(int nn=0;nn(Umu,U[nn],nn); + } + for(int mu=0;mu(Umu,mu); } @@ -53,7 +60,6 @@ int main (int argc, char ** argv) { // Naive wilson implementation ref = zero; for(int mu=0;mu + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + LatticeFermion src(&Grid); random(pRNG,src); + std::cout << "src norm" << norm2(src)< U(4,&Grid); + + double volume=1; + for(int mu=0;mu(Umu,mu); + } + + RealD mass=0.5; + WilsonMatrix Dw(Umu,mass); + HermitianOperator HermOp(Dw); + ConjugateGradient CG(1.0e-8,10000); + CG(HermOp,src,result); + + + Grid_finalize(); +} diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index 2fe40feb..77ce7663 100644 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -5,12 +5,14 @@ AM_LDFLAGS = -L$(top_builddir)/lib # # Test code # -bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 - +bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec Grid_wilson_SOURCES = Grid_wilson.cc Grid_wilson_LDADD = -lGrid +Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc +Grid_wilson_cg_unprec_LDADD = -lGrid + Grid_comms_SOURCES = Grid_comms.cc Grid_comms_LDADD = -lGrid diff --git a/config.guess b/config.guess index 5f6aa02d..a12faba2 120000 --- a/config.guess +++ b/config.guess @@ -1 +1 @@ -/usr/share/automake-1.14/config.guess \ No newline at end of file +/opt/local/share/automake-1.15/config.guess \ No newline at end of file diff --git a/config.sub b/config.sub index 0abfe18c..e3c9b5ca 120000 --- a/config.sub +++ b/config.sub @@ -1 +1 @@ -/usr/share/automake-1.14/config.sub \ No newline at end of file +/opt/local/share/automake-1.15/config.sub \ No newline at end of file diff --git a/configure b/configure index e7d9f32f..b42143e2 100755 --- a/configure +++ b/configure @@ -2466,7 +2466,7 @@ test -n "$target_alias" && NONENONEs,x,x, && program_prefix=${target_alias}- -am__api_version='1.14' +am__api_version='1.15' # Find a good install program. We prefer a C program (faster), # so one script is as good as another. But avoid the broken or @@ -2638,8 +2638,8 @@ test "$program_suffix" != NONE && ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` +# Expand $ac_aux_dir to an absolute path. +am_aux_dir=`cd "$ac_aux_dir" && pwd` if test x"${MISSING+set}" != xset; then case $am_aux_dir in @@ -2658,7 +2658,7 @@ else $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;} fi -if test x"${install_sh}" != xset; then +if test x"${install_sh+set}" != xset; then case $am_aux_dir in *\ * | *\ *) install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; @@ -2986,8 +2986,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} # mkdir_p='$(MKDIR_P)' -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. +# We need awk for the "check" target (and possibly the TAP driver). The +# system "awk" is bad on some platforms. # Always define AMTAR for backward compatibility. Yes, it's still used # in the wild :-( We should find a proper way to deprecate it ... AMTAR='$${TAR-tar}' @@ -3046,6 +3046,7 @@ END fi + ac_config_headers="$ac_config_headers lib/Grid_config.h" # Check whether --enable-silent-rules was given. diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index af77591c..6d7411d3 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -50,15 +50,20 @@ namespace Grid { typedef std::complex Complex; inline RealF adj(const RealF & r){ return r; } - inline RealF conj(const RealF & r){ return r; } + inline RealF conjugate(const RealF & r){ return r; } inline RealF real(const RealF & r){ return r; } inline RealD adj(const RealD & r){ return r; } - inline RealD conj(const RealD & r){ return r; } + inline RealD conjugate(const RealD & r){ return r; } inline RealD real(const RealD & r){ return r; } - inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } - inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } + inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); } + inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); } + inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); } + inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); } + + inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; } + inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; } inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; } @@ -70,15 +75,14 @@ namespace Grid { inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);} inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);} inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);} - inline ComplexD adj(const ComplexD& r){ return(conj(r)); } - // conj already supported for complex + // conjugate already supported for complex inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); } inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } - inline ComplexF adj(const ComplexF& r ){ return(conj(r)); } - //conj already supported for complex + + //conjugate already supported for complex inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index bb948b95..bdb6c387 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -7,8 +7,8 @@ namespace Grid { // LinearOperators Take a something and return a something. ///////////////////////////////////////////////////////////////////////////////////////////// // - // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real): - // + // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real): + //SBase // i) F(a x + b y) = aF(x) + b F(y). // ii) = ^\ast // @@ -25,12 +25,13 @@ namespace Grid { ///////////////////////////////////////////////////////////////////////////////////////////// template class HermitianOperatorBase : public LinearOperatorBase { public: - virtual RealD OpAndNorm(const Field &in, Field &out); + virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2); void AdjOp(const Field &in, Field &out) { Op(in,out); }; void Op(const Field &in, Field &out) { - OpAndNorm(in,out); + double n1,n2; + OpAndNorm(in,out,n1,n2); }; }; @@ -80,8 +81,8 @@ namespace Grid { Matrix &_Mat; public: HermitianOperator(Matrix &Mat): _Mat(Mat) {}; - RealD OpAndNorm(const Field &in, Field &out){ - return _Mat.MdagM(in,out); + void OpAndNorm(const Field &in, Field &out,double &n1,double &n2){ + return _Mat.MdagM(in,out,n1,n2); } }; diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index bcb6ab60..f7f8559e 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -18,6 +18,7 @@ public: std::cout << Tolerance< &Linop,const Field &src, Field &psi) {assert(0);}; void operator() (HermitianOperatorBase &Linop,const Field &src, Field &psi){ RealD cp,c,a,d,b,ssq,qq,b_pred; @@ -61,21 +62,27 @@ public: Linop.OpAndNorm(p,mmp,d,qq); + // std::cout < struct name\ GridUnopClass(UnarySub,-a); GridUnopClass(UnaryAdj,adj(a)); -GridUnopClass(UnaryConj,conj(a)); +GridUnopClass(UnaryConj,conjugate(a)); GridUnopClass(UnaryTrace,trace(a)); GridUnopClass(UnaryTranspose,transpose(a)); @@ -178,7 +178,7 @@ template inline auto op(const T1 &pred,con GRID_DEF_UNOP(operator -,UnarySub); GRID_DEF_UNOP(adj,UnaryAdj); -GRID_DEF_UNOP(conj,UnaryConj); +GRID_DEF_UNOP(conjugate,UnaryConj); GRID_DEF_UNOP(trace,UnaryTrace); GRID_DEF_UNOP(transpose,UnaryTranspose); diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h index c0bbb2b6..f1e566a2 100644 --- a/lib/lattice/Grid_lattice_arith.h +++ b/lib/lattice/Grid_lattice_arith.h @@ -144,14 +144,44 @@ PARALLEL_FOR_LOOP } template strong_inline - void axpy(Lattice &ret,sobj a,const Lattice &lhs,const Lattice &rhs){ - conformable(lhs,rhs); + void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ + conformable(x,y); #pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - vobj tmp = a*lhs._odata[ss]; - vstream(ret._odata[ss],tmp+rhs._odata[ss]); + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+y._odata[ss]; + vstream(ret._odata[ss],tmp); } } + template strong_inline + void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ + conformable(x,y); +#pragma omp parallel for + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+b*y._odata[ss]; + vstream(ret._odata[ss],tmp); + } + } + + template strong_inline + RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ + conformable(x,y); +#pragma omp parallel for + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+y._odata[ss]; + vstream(ret._odata[ss],tmp); + } + return norm2(ret); + } + template strong_inline + RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ + conformable(x,y); +#pragma omp parallel for + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+b*y._odata[ss]; + vstream(ret._odata[ss],tmp); + } + return norm2(ret); // FIXME implement parallel norm in ss loop + } } #endif diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h index ae606ae7..0016bafa 100644 --- a/lib/lattice/Grid_lattice_base.h +++ b/lib/lattice/Grid_lattice_base.h @@ -9,7 +9,7 @@ namespace Grid { // Functionality: // -=,+=,*=,() // add,+,sub,-,mult,mac,* -// adj,conj +// adj,conjugate // real,imag // transpose,transposeIndex // trace,traceIndex diff --git a/lib/lattice/Grid_lattice_reality.h b/lib/lattice/Grid_lattice_reality.h index cf86d700..2f52ed22 100644 --- a/lib/lattice/Grid_lattice_reality.h +++ b/lib/lattice/Grid_lattice_reality.h @@ -18,11 +18,11 @@ PARALLEL_FOR_LOOP return ret; }; - template inline Lattice conj(const Lattice &lhs){ + template inline Lattice conjugate(const Lattice &lhs){ Lattice ret(lhs._grid); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ - ret._odata[ss] = conj(lhs._odata[ss]); + ret._odata[ss] = conjugate(lhs._odata[ss]); } return ret; }; diff --git a/lib/math/Grid_math_reality.h b/lib/math/Grid_math_reality.h index 18a2f89b..40d4c624 100644 --- a/lib/math/Grid_math_reality.h +++ b/lib/math/Grid_math_reality.h @@ -98,26 +98,26 @@ template inline void timesMinusI(iMatrix &ret,const /////////////////////////////////////////////// // Conj function for scalar, vector, matrix /////////////////////////////////////////////// -template inline iScalar conj(const iScalar&r) +template inline iScalar conjugate(const iScalar&r) { iScalar ret; - ret._internal = conj(r._internal); + ret._internal = conjugate(r._internal); return ret; } -template inline iVector conj(const iVector&r) +template inline iVector conjugate(const iVector&r) { iVector ret; for(int i=0;i inline iMatrix conj(const iMatrix&r) +template inline iMatrix conjugate(const iMatrix&r) { iMatrix ret; for(int i=0;i(in,comm_buf,compressor); PARALLEL_FOR_LOOP @@ -140,13 +159,13 @@ PARALLEL_FOR_LOOP int ssu= ss; // Xp - offset = Stencil._offsets [Xp][ss]; - local = Stencil._is_local[Xp][ss]; - perm = Stencil._permute[Xp][ss]; - ptype = Stencil._permute_type[Xp]; + int idx = (Xp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; - if ( local && perm ) - { + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjXp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -154,18 +173,17 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); - //prefetch(Umu._odata[ssu](Yp)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); spReconXp(result,Uchi); // Yp - offset = Stencil._offsets [Yp][ss]; - local = Stencil._is_local[Yp][ss]; - perm = Stencil._permute[Yp][ss]; - ptype = Stencil._permute_type[Yp]; + idx = (Yp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; - if ( local && perm ) - { + if ( local && perm ) { spProjYp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -173,18 +191,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); - // prefetch(Umu._odata[ssu](Zp)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconYp(result,Uchi); // Zp - offset = Stencil._offsets [Zp][ss]; - local = Stencil._is_local[Zp][ss]; - perm = Stencil._permute[Zp][ss]; - ptype = Stencil._permute_type[Zp]; - - if ( local && perm ) - { + idx = (Zp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjZp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -192,18 +208,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); - // prefetch(Umu._odata[ssu](Tp)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconZp(result,Uchi); // Tp - offset = Stencil._offsets [Tp][ss]; - local = Stencil._is_local[Tp][ss]; - perm = Stencil._permute[Tp][ss]; - ptype = Stencil._permute_type[Tp]; - - if ( local && perm ) - { + idx = (Tp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjTp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -211,15 +225,15 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); - // prefetch(Umu._odata[ssu](Xm)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconTp(result,Uchi); // Xm - offset = Stencil._offsets [Xm][ss]; - local = Stencil._is_local[Xm][ss]; - perm = Stencil._permute[Xm][ss]; - ptype = Stencil._permute_type[Xm]; + idx = (Xm+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; if ( local && perm ) { @@ -230,18 +244,18 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconXm(result,Uchi); // Ym - offset = Stencil._offsets [Ym][ss]; - local = Stencil._is_local[Ym][ss]; - perm = Stencil._permute[Ym][ss]; - ptype = Stencil._permute_type[Ym]; + idx = (Ym+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; - if ( local && perm ) - { + if ( local && perm ) { spProjYm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -249,17 +263,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconYm(result,Uchi); // Zm - offset = Stencil._offsets [Zm][ss]; - local = Stencil._is_local[Zm][ss]; - perm = Stencil._permute[Zm][ss]; - ptype = Stencil._permute_type[Zm]; - - if ( local && perm ) - { + idx = (Zm+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjZm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -267,17 +280,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconZm(result,Uchi); // Tm - offset = Stencil._offsets [Tm][ss]; - local = Stencil._is_local[Tm][ss]; - perm = Stencil._permute[Tm][ss]; - ptype = Stencil._permute_type[Tm]; - - if ( local && perm ) - { + idx = (Tm+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjTm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -285,7 +297,7 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconTm(result,Uchi); vstream(out._odata[ss],result); @@ -294,10 +306,6 @@ PARALLEL_FOR_LOOP } -void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out) -{ - return; -} diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h index 1098d330..1db95b33 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.h +++ b/lib/qcd/Grid_qcd_wilson_dop.h @@ -45,10 +45,7 @@ namespace Grid { virtual void MooeeInvDag (const LatticeFermion &in, LatticeFermion &out); // non-hermitian hopping term; half cb or both - void Dhop(const LatticeFermion &in, LatticeFermion &out); - - // m+4r -1/2 Dhop; both cb's - void Dw(const LatticeFermion &in, LatticeFermion &out); + void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag); typedef iScalar > matrix; diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h index 41c8509f..2b03c825 100644 --- a/lib/simd/Grid_vComplexD.h +++ b/lib/simd/Grid_vComplexD.h @@ -32,7 +32,7 @@ namespace Grid { friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);} friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);} - friend inline vComplexD adj(const vComplexD &in){ return conj(in); } + friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); } ////////////////////////////////// // Initialise to 1,0,i @@ -166,11 +166,11 @@ namespace Grid { // all subtypes; may not be a good assumption, but could // add the vector width as a template param for BG/Q for example //////////////////////////////////////////////////////////////////// - /* friend inline void permute(vComplexD &y,vComplexD b,int perm) { Gpermute(y,b,perm); } + /* friend inline void merge(vComplexD &y,std::vector &extracted) { Gmerge(y,extracted); @@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ //////////////////////// // Conjugate //////////////////////// - friend inline vComplexD conj(const vComplexD &in){ + friend inline vComplexD conjugate(const vComplexD &in){ vComplexD ret ; vzero(ret); #if defined (AVX1)|| defined (AVX2) // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ... @@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ // REDUCE FIXME must be a cleaner implementation friend inline ComplexD Reduce(const vComplexD & in) { -#if defined SSE4 - return ComplexD(in.v[0], in.v[1]); // inefficient +#ifdef SSE4 + return ComplexD(in.v[0],in.v[1]); +#endif +#if defined(AVX1) || defined (AVX2) + vComplexD v1; + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + return ComplexD(v1.v[0],v1.v[1]); #endif -#if defined (AVX1) || defined(AVX2) - // return std::complex(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v)); - __attribute__ ((aligned(32))) double c_[4]; - _mm256_store_pd(c_,in.v); - return ComplexD(c_[0]+c_[2],c_[1]+c_[3]); -#endif #ifdef AVX512 - return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v)); + return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v)); #endif #ifdef QPX #endif @@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ }; - inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } + inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; } typedef vComplexD vDComplex; diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h index 8a892d24..713bbdd8 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Grid_vComplexF.h @@ -47,7 +47,7 @@ namespace Grid { friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); } friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); } - friend inline vComplexF adj(const vComplexF &in){ return conj(in); } + friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); } ////////////////////////////////// // Initialise to 1,0,i @@ -228,42 +228,25 @@ namespace Grid { ret.v = {a,b,a,b}; #endif } - friend inline ComplexF Reduce(const vComplexF & in) - { + friend inline void permute(vComplexF &y,vComplexF b,int perm) + { + Gpermute(y,b,perm); + } + friend inline ComplexF Reduce(const vComplexF & in) + { #ifdef SSE4 - union { - cvec v1; // SSE 4 x float vector - float f[4]; // scalar array of 4 floats - } u128; - u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros - return ComplexF(u128.f[0], u128.f[1]); + vComplexF v1; + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + return ComplexF(v1.v[0],v1.v[1]); #endif -#ifdef AVX1 - //it would be better passing 2 arguments to saturate the vector lanes - union { - __m256 v1; - float f[8]; - } u256; - //SWAP lanes - // FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large - __m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1); - __m256 t1 = _mm256_permute_ps(in.v , 0b11011000);//real (0,2,1,3) - __m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2) - t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1) - t1 = _mm256_hadd_ps(t0,t0); - u256.v1 = _mm256_hadd_ps(t1, t1); - return ComplexF(u256.f[0], u256.f[4]); -#endif -#ifdef AVX2 - union { - __m256 v1; - float f[8]; - } u256; - const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0); - __m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask); - __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1); - u256.v1 = _mm256_hadd_ps(tmp2, tmp2); - return ComplexF(u256.f[0], u256.f[4]); +#if defined(AVX1) || defined (AVX2) + vComplexF v1,v2; + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + permute(v2,v1,1); // avx 256; quad complex single + v1=v1+v2; + return ComplexF(v1.v[0],v1.v[1]); #endif #ifdef AVX512 return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v)); @@ -345,13 +328,10 @@ namespace Grid { // Conjugate /////////////////////// - friend inline vComplexF conj(const vComplexF &in){ + friend inline vComplexF conjugate(const vComplexF &in){ vComplexF ret ; vzero(ret); #if defined (AVX1)|| defined (AVX2) - cvec tmp; - tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi - ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); - + ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f)); #endif #ifdef SSE4 ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f)); @@ -433,10 +413,6 @@ namespace Grid { return *this; } - friend inline void permute(vComplexF &y,vComplexF b,int perm) - { - Gpermute(y,b,perm); - } /* friend inline void merge(vComplexF &y,std::vector &extracted) { @@ -460,7 +436,7 @@ namespace Grid { inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) { - return conj(l)*r; + return conjugate(l)*r; } inline void zeroit(vComplexF &z){ vzero(z);} diff --git a/lib/simd/Grid_vInteger.h b/lib/simd/Grid_vInteger.h index 63e8c720..ee4a3633 100644 --- a/lib/simd/Grid_vInteger.h +++ b/lib/simd/Grid_vInteger.h @@ -117,7 +117,7 @@ namespace Grid { }; /////////////////////////////////////////////// - // mult, sub, add, adj,conj, mac functions + // mult, sub, add, adj,conjugate, mac functions /////////////////////////////////////////////// friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);} diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h index dbfacb0c..83979d14 100644 --- a/lib/simd/Grid_vRealD.h +++ b/lib/simd/Grid_vRealD.h @@ -26,7 +26,7 @@ namespace Grid { friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);} friend inline vRealD adj(const vRealD &in) { return in; } - friend inline vRealD conj(const vRealD &in){ return in; } + friend inline vRealD conjugate(const vRealD &in){ return in; } friend inline void mac (vRealD &y,const vRealD a,const vRealD x){ #if defined (AVX1) || defined (SSE4) @@ -112,11 +112,12 @@ namespace Grid { // all subtypes; may not be a good assumption, but could // add the vector width as a template param for BG/Q for example //////////////////////////////////////////////////////////////////// - /* + friend inline void permute(vRealD &y,vRealD b,int perm) { Gpermute(y,b,perm); } + /* friend inline void merge(vRealD &y,std::vector &extracted) { Gmerge(y,extracted); @@ -209,48 +210,26 @@ namespace Grid { friend inline RealD Reduce(const vRealD & in) { -#if defined (SSE4) - // FIXME Hack - const RealD * ptr =(const RealD *) ∈ - RealD ret = 0; - for(int i=0;i(y,b,perm); } + /* friend inline void merge(vRealF &y,std::vector &extracted) { Gmerge(y,extracted); @@ -155,7 +156,6 @@ namespace Grid { Gextract(y,extracted); } */ - ///////////////////////////////////////////////////// // Broadcast a value across Nsimd copies. @@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){ } friend inline RealF Reduce(const vRealF & in) { -#if defined (SSE4) - // FIXME Hack - const RealF * ptr = (const RealF *) ∈ - RealF ret = 0; - for(int i=0;i inline Grid_simd< scalar_type, vector_type> innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r) { - return conj(l)*r; + return conjugate(l)*r; } template diff --git a/scripts/build-all b/scripts/build-all index d3a978b7..b97dca19 100755 --- a/scripts/build-all +++ b/scripts/build-all @@ -1,6 +1,6 @@ -#!/bin/bash +#!/bin/bash -e -DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi" +DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse" EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512" BLACK="\033[30m" RED="\033[31m" diff --git a/scripts/configure-all b/scripts/configure-all index 2f80b60a..f42b1960 100755 --- a/scripts/configure-all +++ b/scripts/configure-all @@ -1,6 +1,6 @@ #!/bin/bash -DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx" +DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse" for D in $DIRS do diff --git a/scripts/configure-commands b/scripts/configure-commands index 89b72294..c7c35e1c 100755 --- a/scripts/configure-commands +++ b/scripts/configure-commands @@ -34,6 +34,9 @@ icpc-avx512) icpc-mic) CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none ;; +clang-sse) +CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none + ;; clang-avx) CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none ;; @@ -47,16 +50,16 @@ clang-avx2-openmp) CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none ;; clang-avx-openmp-mpi) -CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi ;; clang-avx2-openmp-mpi) -CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi ;; clang-avx-mpi) -CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx " LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi ;; clang-avx2-mpi) -CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx " LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi ;; esac echo -e $NORMAL diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index 45778922..1aa4fc43 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -140,7 +140,7 @@ int main (int argc, char ** argv) // -=,+=,*=,() // add,+,sub,-,mult,mac,* -// adj,conj +// adj,conjugate // real,imag // transpose,transposeIndex // trace,traceIndex @@ -437,7 +437,7 @@ int main (int argc, char ** argv) for(int r=0;r<3;r++){ for(int c=0;c<3;c++){ diff =shifted1()()(r,c)-shifted2()()(r,c); - nn=real(conj(diff)*diff); + nn=real(conjugate(diff)*diff); if ( nn > 0 ) cout<<"Shift fail (shifted1/shifted2-ref) "< 0 ) cout<<"Shift rb fail (shifted3/shifted2-ref) "< +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + + GridSerialRNG fsRNG; fsRNG.SeedFixedIntegers(seeds); + GridParallelRNG fpRNG(&Grid); fpRNG.SeedFixedIntegers(seeds); + + vComplexF tmp; random(fsRNG,tmp); + std::cout<<"Random vComplexF (fixed seed)\n"<< tmp< void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);} + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = conjugate(i1);} std::string name(void) const { return std::string("Conj"); } }; class funcAdj { @@ -49,6 +49,34 @@ public: template void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);} std::string name(void) const { return std::string("timesMinusI"); } }; +class funcInnerProduct { +public: + funcInnerProduct() {}; + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = innerProduct(i1,i2);} + std::string name(void) const { return std::string("innerProduct"); } +}; + +// FIXME still to test: +// +// innerProduct, +// norm2, +// Reduce, +// +// mac,mult,sub,add, vone,vzero,vcomplex_i, =zero, +// vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar +// unary -, +// *= , -=, += +// outerproduct, +// zeroit +// permute + +class funcReduce { +public: + funcReduce() {}; +template void vfunc(reduce &rr,vec &i1,vec &i2) const { rr = Reduce(i1);} +template void sfunc(reduce &rr,scal &i1,scal &i2) const { rr = i1;} + std::string name(void) const { return std::string("Reduce"); } +}; template void Tester(const functor &func) @@ -96,8 +124,59 @@ void Tester(const functor &func) ok++; } } - if ( ok==0 ) std::cout << " OK!" < +void ReductionTester(const functor &func) +{ + GridSerialRNG sRNG; + sRNG.SeedRandomDevice(); + + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + reduced result(0); + reduced reference(0); + reduced tmp; + + std::vector > buf(3); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + + func.template vfunc(result,v_input1,v_input2); + + for(int i=0;i(tmp,input1[i],input2[i]); + reference+=tmp; + } + + std::cout << " " << func.name()< 1.0e-6 ){ // rounding is possible for reduce order + std::cout<< "*****" << std::endl; + std::cout<< abs(reference-result) << " " <(funcTimes()); Tester(funcConj()); Tester(funcAdj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); std::cout << "==================================="<< std::endl; std::cout << "Testing vComplexD "<(funcTimes()); Tester(funcConj()); Tester(funcAdj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); std::cout << "==================================="<< std::endl; std::cout << "Testing vRealF "<(funcMinus()); Tester(funcTimes()); Tester(funcAdj()); + Tester(funcConj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); std::cout << "==================================="<< std::endl; std::cout << "Testing vRealD "<(funcMinus()); Tester(funcTimes()); Tester(funcAdj()); + Tester(funcConj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); Grid_finalize(); } diff --git a/tests/Grid_stencil.cc b/tests/Grid_stencil.cc index 1fdb7265..9770ed7c 100644 --- a/tests/Grid_stencil.cc +++ b/tests/Grid_stencil.cc @@ -93,7 +93,7 @@ int main (int argc, char ** argv) for(int r=0;r<3;r++){ for(int c=0;c<3;c++){ diff =check()()(r,c)-bar()()(r,c); - double nn=real(conj(diff)*diff); + double nn=real(conjugate(diff)*diff); if ( nn > 0){ printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n", coor[0],coor[1],coor[2],coor[3],r,c, @@ -103,8 +103,8 @@ int main (int argc, char ** argv) real(bar()()(r,c)) ); } - snrmC=snrmC+real(conj(check()()(r,c))*check()()(r,c)); - snrmB=snrmB+real(conj(bar()()(r,c))*bar()()(r,c)); + snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c)); + snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c)); snrm=snrm+nn; }} diff --git a/tests/Makefile.am b/tests/Makefile.am index 82d1b3be..80f3a34c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib # # Test code # -bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez +bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed Grid_main_SOURCES = Grid_main.cc Grid_main_LDADD = -lGrid @@ -13,6 +13,9 @@ Grid_main_LDADD = -lGrid Grid_rng_SOURCES = Grid_rng.cc Grid_rng_LDADD = -lGrid +Grid_rng_fixed_SOURCES = Grid_rng_fixed.cc +Grid_rng_fixed_LDADD = -lGrid + Grid_remez_SOURCES = Grid_remez.cc Grid_remez_LDADD = -lGrid diff --git a/tests/test_Grid_jacobi.cc b/tests/test_Grid_jacobi.cc index 95e808f1..0cf15a6f 100644 --- a/tests/test_Grid_jacobi.cc +++ b/tests/test_Grid_jacobi.cc @@ -186,7 +186,7 @@ int main (int argc, char ** argv) for(int r=0;r<3;r++){ for(int c=0;c<3;c++){ diff =check._internal._internal[r][c]-bar._internal._internal[r][c]; - double nn=real(conj(diff)*diff); + double nn=real(conjugate(diff)*diff); if ( nn > 0 ){ printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n", coor[0],coor[1],coor[2],coor[3],r,c,