mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'master' of https://github.com/paboyle/Grid
Conflicts: lib/Grid_simd.h
This commit is contained in:
commit
48bb3ab4e7
48
Makefile.in
48
Makefile.in
@ -1,7 +1,7 @@
|
||||
# Makefile.in generated by automake 1.15 from Makefile.am.
|
||||
# Makefile.in generated by automake 1.14.1 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1994-2013 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -14,17 +14,7 @@
|
||||
|
||||
@SET_MAKE@
|
||||
VPATH = @srcdir@
|
||||
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
@ -89,12 +79,15 @@ build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
target_triplet = @target@
|
||||
subdir = .
|
||||
DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
|
||||
$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
|
||||
$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
|
||||
compile config.guess config.sub depcomp install-sh missing
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
|
||||
$(am__configure_deps) $(am__DIST_COMMON)
|
||||
am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
|
||||
configure.lineno config.status.lineno
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
@ -157,9 +150,6 @@ ETAGS = etags
|
||||
CTAGS = ctags
|
||||
CSCOPE = cscope
|
||||
DIST_SUBDIRS = $(SUBDIRS)
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
|
||||
INSTALL NEWS README TODO compile config.guess config.sub \
|
||||
depcomp install-sh missing
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
distdir = $(PACKAGE)-$(VERSION)
|
||||
top_distdir = $(distdir)
|
||||
@ -223,6 +213,7 @@ ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
GREP = @GREP@
|
||||
HAVE_CXX11 = @HAVE_CXX11@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
@ -325,6 +316,7 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu Makefile
|
||||
.PRECIOUS: Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
@ -531,15 +523,15 @@ dist-xz: distdir
|
||||
$(am__post_remove_distdir)
|
||||
|
||||
dist-tarZ: distdir
|
||||
@echo WARNING: "Support for distribution archives compressed with" \
|
||||
"legacy program 'compress' is deprecated." >&2
|
||||
@echo WARNING: "Support for shar distribution archives is" \
|
||||
"deprecated." >&2
|
||||
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
|
||||
tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
|
||||
$(am__post_remove_distdir)
|
||||
|
||||
dist-shar: distdir
|
||||
@echo WARNING: "Support for shar distribution archives is" \
|
||||
"deprecated." >&2
|
||||
@echo WARNING: "Support for distribution archives compressed with" \
|
||||
"legacy program 'compress' is deprecated." >&2
|
||||
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
|
||||
shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
|
||||
$(am__post_remove_distdir)
|
||||
@ -575,17 +567,17 @@ distcheck: dist
|
||||
esac
|
||||
chmod -R a-w $(distdir)
|
||||
chmod u+w $(distdir)
|
||||
mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
|
||||
mkdir $(distdir)/_build $(distdir)/_inst
|
||||
chmod a-w $(distdir)
|
||||
test -d $(distdir)/_build || exit 0; \
|
||||
dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
|
||||
&& dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
|
||||
&& am__cwd=`pwd` \
|
||||
&& $(am__cd) $(distdir)/_build/sub \
|
||||
&& ../../configure \
|
||||
&& $(am__cd) $(distdir)/_build \
|
||||
&& ../configure \
|
||||
$(AM_DISTCHECK_CONFIGURE_FLAGS) \
|
||||
$(DISTCHECK_CONFIGURE_FLAGS) \
|
||||
--srcdir=../.. --prefix="$$dc_install_base" \
|
||||
--srcdir=.. --prefix="$$dc_install_base" \
|
||||
&& $(MAKE) $(AM_MAKEFLAGS) \
|
||||
&& $(MAKE) $(AM_MAKEFLAGS) dvi \
|
||||
&& $(MAKE) $(AM_MAKEFLAGS) check \
|
||||
@ -759,8 +751,6 @@ uninstall-am:
|
||||
maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
|
||||
pdf-am ps ps-am tags tags-am uninstall uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
|
4
TODO
4
TODO
@ -1,8 +1,8 @@
|
||||
================================================================
|
||||
*** Hacks and bug fixes to clean up and Audits
|
||||
================================================================
|
||||
* Base class to share common code between vRealF, VComplexF etc...
|
||||
- Performance check on Guido's reimplementation strategy
|
||||
* Base class to share common code between vRealF, VComplexF etc... done
|
||||
- Performance check on Guido's reimplementation strategy - (GUIDO) tested and no difference was found, merged
|
||||
|
||||
* FIXME audit
|
||||
|
||||
|
62
aclocal.m4
vendored
62
aclocal.m4
vendored
@ -1,6 +1,6 @@
|
||||
# generated automatically by aclocal 1.15 -*- Autoconf -*-
|
||||
# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
|
||||
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to.
|
||||
If you have problems, you may need to regenerate the build system entirely.
|
||||
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
|
||||
|
||||
# Copyright (C) 2002-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2002-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
|
||||
# generated from the m4 files accompanying Automake X.Y.
|
||||
# (This private macro should not be called outside this file.)
|
||||
AC_DEFUN([AM_AUTOMAKE_VERSION],
|
||||
[am__api_version='1.15'
|
||||
[am__api_version='1.14'
|
||||
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
|
||||
dnl require some minimum version. Point them to the right macro.
|
||||
m4_if([$1], [1.15], [],
|
||||
m4_if([$1], [1.14.1], [],
|
||||
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
|
||||
])
|
||||
|
||||
@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
|
||||
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
|
||||
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
|
||||
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
||||
[AM_AUTOMAKE_VERSION([1.15])dnl
|
||||
[AM_AUTOMAKE_VERSION([1.14.1])dnl
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
||||
|
||||
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -103,14 +103,15 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
||||
# configured tree to be moved without reconfiguration.
|
||||
|
||||
AC_DEFUN([AM_AUX_DIR_EXPAND],
|
||||
[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
|
||||
# Expand $ac_aux_dir to an absolute path.
|
||||
am_aux_dir=`cd "$ac_aux_dir" && pwd`
|
||||
[dnl Rely on autoconf to set up CDPATH properly.
|
||||
AC_PREREQ([2.50])dnl
|
||||
# expand $ac_aux_dir to an absolute path
|
||||
am_aux_dir=`cd $ac_aux_dir && pwd`
|
||||
])
|
||||
|
||||
# AM_CONDITIONAL -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -141,7 +142,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
Usually this means the macro was only invoked conditionally.]])
|
||||
fi])])
|
||||
|
||||
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -332,7 +333,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
|
||||
|
||||
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -408,7 +409,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
|
||||
# Do all the work for Automake. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -498,8 +499,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
|
||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
|
||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
|
||||
AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
|
||||
# We need awk for the "check" target (and possibly the TAP driver). The
|
||||
# system "awk" is bad on some platforms.
|
||||
# We need awk for the "check" target. The system "awk" is bad on
|
||||
# some platforms.
|
||||
AC_REQUIRE([AC_PROG_AWK])dnl
|
||||
AC_REQUIRE([AC_PROG_MAKE_SET])dnl
|
||||
AC_REQUIRE([AM_SET_LEADING_DOT])dnl
|
||||
@ -572,11 +573,7 @@ to "yes", and re-run configure.
|
||||
END
|
||||
AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
|
||||
fi
|
||||
fi
|
||||
dnl The trailing newline in this macro's definition is deliberate, for
|
||||
dnl backward compatibility and to allow trailing 'dnl'-style comments
|
||||
dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
|
||||
])
|
||||
fi])
|
||||
|
||||
dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not
|
||||
dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
|
||||
@ -605,7 +602,7 @@ for _am_header in $config_headers :; do
|
||||
done
|
||||
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
|
||||
|
||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -616,7 +613,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
|
||||
# Define $install_sh.
|
||||
AC_DEFUN([AM_PROG_INSTALL_SH],
|
||||
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
|
||||
if test x"${install_sh+set}" != xset; then
|
||||
if test x"${install_sh}" != xset; then
|
||||
case $am_aux_dir in
|
||||
*\ * | *\ *)
|
||||
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
|
||||
@ -626,7 +623,7 @@ if test x"${install_sh+set}" != xset; then
|
||||
fi
|
||||
AC_SUBST([install_sh])])
|
||||
|
||||
# Copyright (C) 2003-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2003-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -647,7 +644,7 @@ AC_SUBST([am__leading_dot])])
|
||||
|
||||
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -697,7 +694,7 @@ rm -f confinc confmf
|
||||
|
||||
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -736,7 +733,7 @@ fi
|
||||
|
||||
# Helper functions for option handling. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -767,7 +764,7 @@ AC_DEFUN([_AM_IF_OPTION],
|
||||
|
||||
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -848,7 +845,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
rm -f conftest.file
|
||||
])
|
||||
|
||||
# Copyright (C) 2009-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2009-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -908,7 +905,7 @@ AC_SUBST([AM_BACKSLASH])dnl
|
||||
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
|
||||
])
|
||||
|
||||
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -936,7 +933,7 @@ fi
|
||||
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
|
||||
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
||||
|
||||
# Copyright (C) 2006-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2006-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -955,7 +952,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
|
||||
|
||||
# Check how to create a tarball. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2004-2014 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2004-2013 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@ -1086,3 +1083,4 @@ AC_SUBST([am__tar])
|
||||
AC_SUBST([am__untar])
|
||||
]) # _AM_PROG_TAR
|
||||
|
||||
m4_include([m4/ax_cxx_compile_stdcxx_11.m4])
|
||||
|
2
config.guess
vendored
2
config.guess
vendored
@ -1 +1 @@
|
||||
/opt/local/share/automake-1.15/config.guess
|
||||
/usr/share/automake-1.14/config.guess
|
2
config.sub
vendored
2
config.sub
vendored
@ -1 +1 @@
|
||||
/opt/local/share/automake-1.15/config.sub
|
||||
/usr/share/automake-1.14/config.sub
|
199
configure
vendored
199
configure
vendored
@ -633,6 +633,7 @@ BUILD_COMMS_MPI_TRUE
|
||||
EGREP
|
||||
GREP
|
||||
CXXCPP
|
||||
HAVE_CXX11
|
||||
RANLIB
|
||||
OPENMP_CXXFLAGS
|
||||
am__fastdepCXX_FALSE
|
||||
@ -2466,7 +2467,7 @@ test -n "$target_alias" &&
|
||||
NONENONEs,x,x, &&
|
||||
program_prefix=${target_alias}-
|
||||
|
||||
am__api_version='1.15'
|
||||
am__api_version='1.14'
|
||||
|
||||
# Find a good install program. We prefer a C program (faster),
|
||||
# so one script is as good as another. But avoid the broken or
|
||||
@ -2638,8 +2639,8 @@ test "$program_suffix" != NONE &&
|
||||
ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
|
||||
program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
|
||||
|
||||
# Expand $ac_aux_dir to an absolute path.
|
||||
am_aux_dir=`cd "$ac_aux_dir" && pwd`
|
||||
# expand $ac_aux_dir to an absolute path
|
||||
am_aux_dir=`cd $ac_aux_dir && pwd`
|
||||
|
||||
if test x"${MISSING+set}" != xset; then
|
||||
case $am_aux_dir in
|
||||
@ -2658,7 +2659,7 @@ else
|
||||
$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
|
||||
fi
|
||||
|
||||
if test x"${install_sh+set}" != xset; then
|
||||
if test x"${install_sh}" != xset; then
|
||||
case $am_aux_dir in
|
||||
*\ * | *\ *)
|
||||
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
|
||||
@ -2986,8 +2987,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
|
||||
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
|
||||
mkdir_p='$(MKDIR_P)'
|
||||
|
||||
# We need awk for the "check" target (and possibly the TAP driver). The
|
||||
# system "awk" is bad on some platforms.
|
||||
# We need awk for the "check" target. The system "awk" is bad on
|
||||
# some platforms.
|
||||
# Always define AMTAR for backward compatibility. Yes, it's still used
|
||||
# in the wild :-( We should find a proper way to deprecate it ...
|
||||
AMTAR='$${TAR-tar}'
|
||||
@ -3046,7 +3047,6 @@ END
|
||||
fi
|
||||
|
||||
|
||||
|
||||
ac_config_headers="$ac_config_headers lib/Grid_config.h"
|
||||
|
||||
# Check whether --enable-silent-rules was given.
|
||||
@ -3966,6 +3966,191 @@ else
|
||||
RANLIB="$ac_cv_prog_RANLIB"
|
||||
fi
|
||||
|
||||
ax_cxx_compile_cxx11_required=true
|
||||
ac_ext=cpp
|
||||
ac_cpp='$CXXCPP $CPPFLAGS'
|
||||
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
|
||||
ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
|
||||
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
|
||||
ac_success=no
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features by default" >&5
|
||||
$as_echo_n "checking whether $CXX supports C++11 features by default... " >&6; }
|
||||
if ${ax_cv_cxx_compile_cxx11+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
template <typename T>
|
||||
struct check
|
||||
{
|
||||
static_assert(sizeof(int) <= sizeof(T), "not big enough");
|
||||
};
|
||||
|
||||
struct Base {
|
||||
virtual void f() {}
|
||||
};
|
||||
struct Child : public Base {
|
||||
virtual void f() override {}
|
||||
};
|
||||
|
||||
typedef check<check<bool>> right_angle_brackets;
|
||||
|
||||
int a;
|
||||
decltype(a) b;
|
||||
|
||||
typedef check<int> check_type;
|
||||
check_type c;
|
||||
check_type&& cr = static_cast<check_type&&>(c);
|
||||
|
||||
auto d = a;
|
||||
auto l = [](){};
|
||||
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
|
||||
struct use_l { use_l() { l(); } };
|
||||
|
||||
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
|
||||
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
|
||||
namespace test_template_alias_sfinae {
|
||||
struct foo {};
|
||||
|
||||
template<typename T>
|
||||
using member = typename T::member_type;
|
||||
|
||||
template<typename T>
|
||||
void func(...) {}
|
||||
|
||||
template<typename T>
|
||||
void func(member<T>*) {}
|
||||
|
||||
void test();
|
||||
|
||||
void test() {
|
||||
func<foo>(0);
|
||||
}
|
||||
}
|
||||
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"; then :
|
||||
ax_cv_cxx_compile_cxx11=yes
|
||||
else
|
||||
ax_cv_cxx_compile_cxx11=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_cxx_compile_cxx11" >&5
|
||||
$as_echo "$ax_cv_cxx_compile_cxx11" >&6; }
|
||||
if test x$ax_cv_cxx_compile_cxx11 = xyes; then
|
||||
ac_success=yes
|
||||
fi
|
||||
|
||||
|
||||
|
||||
if test x$ac_success = xno; then
|
||||
for switch in -std=c++11 -std=c++0x +std=c++11; do
|
||||
cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
|
||||
$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
|
||||
if eval \${$cachevar+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
ac_save_CXXFLAGS="$CXXFLAGS"
|
||||
CXXFLAGS="$CXXFLAGS $switch"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
template <typename T>
|
||||
struct check
|
||||
{
|
||||
static_assert(sizeof(int) <= sizeof(T), "not big enough");
|
||||
};
|
||||
|
||||
struct Base {
|
||||
virtual void f() {}
|
||||
};
|
||||
struct Child : public Base {
|
||||
virtual void f() override {}
|
||||
};
|
||||
|
||||
typedef check<check<bool>> right_angle_brackets;
|
||||
|
||||
int a;
|
||||
decltype(a) b;
|
||||
|
||||
typedef check<int> check_type;
|
||||
check_type c;
|
||||
check_type&& cr = static_cast<check_type&&>(c);
|
||||
|
||||
auto d = a;
|
||||
auto l = [](){};
|
||||
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
|
||||
struct use_l { use_l() { l(); } };
|
||||
|
||||
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
|
||||
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
|
||||
namespace test_template_alias_sfinae {
|
||||
struct foo {};
|
||||
|
||||
template<typename T>
|
||||
using member = typename T::member_type;
|
||||
|
||||
template<typename T>
|
||||
void func(...) {}
|
||||
|
||||
template<typename T>
|
||||
void func(member<T>*) {}
|
||||
|
||||
void test();
|
||||
|
||||
void test() {
|
||||
func<foo>(0);
|
||||
}
|
||||
}
|
||||
|
||||
_ACEOF
|
||||
if ac_fn_cxx_try_compile "$LINENO"; then :
|
||||
eval $cachevar=yes
|
||||
else
|
||||
eval $cachevar=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CXXFLAGS="$ac_save_CXXFLAGS"
|
||||
fi
|
||||
eval ac_res=\$$cachevar
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
|
||||
$as_echo "$ac_res" >&6; }
|
||||
if eval test x\$$cachevar = xyes; then
|
||||
CXXFLAGS="$CXXFLAGS $switch"
|
||||
ac_success=yes
|
||||
break
|
||||
fi
|
||||
done
|
||||
fi
|
||||
ac_ext=cpp
|
||||
ac_cpp='$CXXCPP $CPPFLAGS'
|
||||
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
|
||||
ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
|
||||
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
|
||||
|
||||
if test x$ax_cxx_compile_cxx11_required = xtrue; then
|
||||
if test x$ac_success = xno; then
|
||||
as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5
|
||||
fi
|
||||
else
|
||||
if test x$ac_success = xno; then
|
||||
HAVE_CXX11=0
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
|
||||
$as_echo "$as_me: No compiler with C++11 support was found" >&6;}
|
||||
else
|
||||
HAVE_CXX11=1
|
||||
|
||||
$as_echo "#define HAVE_CXX11 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
# Checks for libraries.
|
||||
#AX_GCC_VAR_ATTRIBUTE(aligned)
|
||||
|
@ -3,9 +3,9 @@
|
||||
#
|
||||
# Project Grid package
|
||||
#
|
||||
# Time-stamp: <2015-05-19 13:51:08 neo>
|
||||
# Time-stamp: <2015-05-25 14:54:34 neo>
|
||||
|
||||
AC_PREREQ([2.69])
|
||||
AC_PREREQ([2.63])
|
||||
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
|
||||
AC_CANONICAL_SYSTEM
|
||||
AM_INIT_AUTOMAKE(subdir-objects)
|
||||
@ -26,6 +26,8 @@ AC_LANG(C++)
|
||||
AC_PROG_CXX
|
||||
AC_OPENMP
|
||||
AC_PROG_RANLIB
|
||||
AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
|
||||
|
||||
|
||||
# Checks for libraries.
|
||||
#AX_GCC_VAR_ATTRIBUTE(aligned)
|
||||
|
@ -16,6 +16,9 @@
|
||||
/* GRID_COMMS_NONE */
|
||||
#define GRID_COMMS_NONE 1
|
||||
|
||||
/* define if the compiler supports basic C++11 syntax */
|
||||
/* #undef HAVE_CXX11 */
|
||||
|
||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
|
||||
don't. */
|
||||
#define HAVE_DECL_BE64TOH 1
|
||||
|
@ -15,6 +15,9 @@
|
||||
/* GRID_COMMS_NONE */
|
||||
#undef GRID_COMMS_NONE
|
||||
|
||||
/* define if the compiler supports basic C++11 syntax */
|
||||
#undef HAVE_CXX11
|
||||
|
||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
|
||||
don't. */
|
||||
#undef HAVE_DECL_BE64TOH
|
||||
|
@ -13,28 +13,6 @@
|
||||
|
||||
typedef uint32_t Integer;
|
||||
|
||||
#ifdef SSE4
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
#if defined(AVX1) || defined (AVX2)
|
||||
#include <immintrin.h>
|
||||
|
||||
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
|
||||
#ifndef _mm256_set_m128i
|
||||
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef AVX512
|
||||
#include <immintrin.h>
|
||||
#ifndef KNC_ONLY_STORES
|
||||
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
|
||||
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
namespace Grid {
|
||||
|
||||
typedef float RealF;
|
||||
@ -103,6 +81,10 @@ namespace Grid {
|
||||
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
|
||||
inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
|
||||
inline void vstream(RealF &l, const RealF &r){ l=r;}
|
||||
inline void vstream(RealD &l, const RealD &r){ l=r;}
|
||||
|
||||
|
||||
class Zero{};
|
||||
@ -113,7 +95,6 @@ namespace Grid {
|
||||
template<> inline void zeroit(RealF &arg){ arg=0; };
|
||||
template<> inline void zeroit(RealD &arg){ arg=0; };
|
||||
|
||||
|
||||
#if defined (SSE4)
|
||||
typedef __m128 fvec;
|
||||
typedef __m128d dvec;
|
||||
@ -245,56 +226,12 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
|
||||
default: assert(0); break;
|
||||
}
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
#include <simd/Grid_vInteger.h>
|
||||
#include <simd/Grid_vRealF.h>
|
||||
#include <simd/Grid_vRealD.h>
|
||||
#include <simd/Grid_vComplexF.h>
|
||||
#include <simd/Grid_vComplexD.h>
|
||||
|
||||
#include <simd/Grid_vector_types.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
// NB: Template the following on "type Complex" and then implement *,+,- for
|
||||
// ComplexF, ComplexD, RealF, RealD above to
|
||||
// get full generality of binops with scalars.
|
||||
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
|
||||
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
|
||||
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
// Default precision
|
||||
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
|
||||
typedef vRealD vReal;
|
||||
|
@ -93,12 +93,10 @@ nobase_include_HEADERS = algorithms/approx/bigfloat.h \
|
||||
qcd/Grid_qcd_2spinor.h \
|
||||
qcd/Grid_qcd_dirac.h \
|
||||
qcd/Grid_qcd_wilson_dop.h \
|
||||
simd/Grid_vComplexD.h \
|
||||
simd/Grid_vComplexF.h \
|
||||
simd/Grid_vInteger.h \
|
||||
simd/Grid_vRealD.h \
|
||||
simd/Grid_vRealF.h \
|
||||
simd/Grid_vector_types.h \
|
||||
simd/Grid_sse4.h
|
||||
simd/Grid_sse4.h \
|
||||
simd/Grid_avx.h \
|
||||
simd/Grid_knc.h
|
||||
|
||||
|
||||
|
||||
|
@ -154,26 +154,35 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
|
||||
cbmask=0x3;
|
||||
}
|
||||
|
||||
|
||||
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
|
||||
|
||||
/*
|
||||
int o =n*rhs._grid->_slice_stride[dimension];
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||
if ( ocb&cbmask ) {
|
||||
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
|
||||
}
|
||||
*/
|
||||
|
||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
||||
if ( ocb&cbmask ) {
|
||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
||||
{
|
||||
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
|
@ -212,6 +212,16 @@ public:
|
||||
|
||||
iMatrix(const Zero &z){ *this = zero; };
|
||||
iMatrix() =default;
|
||||
|
||||
iMatrix& operator=(const iMatrix& rhs){
|
||||
for(int i=0;i<N;i++)
|
||||
for(int j=0;j<N;j++)
|
||||
vstream(_internal[i][j],rhs._internal[i][j]);
|
||||
return *this;
|
||||
};
|
||||
|
||||
|
||||
|
||||
iMatrix(scalar_type s) { (*this) = s ;};// recurse down and hit the constructor for vector_type
|
||||
|
||||
/*
|
||||
@ -220,6 +230,9 @@ public:
|
||||
iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> ©me) = default;
|
||||
iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &©me) = default;
|
||||
*/
|
||||
|
||||
|
||||
|
||||
iMatrix<vtype,N> & operator= (const Zero &hero){
|
||||
zeroit(*this);
|
||||
return *this;
|
||||
|
399
lib/simd/Grid_avx.h
Normal file
399
lib/simd/Grid_avx.h
Normal file
@ -0,0 +1,399 @@
|
||||
//----------------------------------------------------------------------
|
||||
/*! @file Grid_avx.h
|
||||
@brief Optimization libraries for AVX1/2 instructions set
|
||||
|
||||
Using intrinsics
|
||||
*/
|
||||
// Time-stamp: <2015-05-22 18:58:27 neo>
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#include <immintrin.h>
|
||||
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
|
||||
#ifndef _mm256_set_m128i
|
||||
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
|
||||
#endif
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
inline __m256 operator()(float a, float b){
|
||||
return _mm256_set_ps(b,a,b,a,b,a,b,a);
|
||||
}
|
||||
// Real float
|
||||
inline __m256 operator()(float a){
|
||||
return _mm256_set_ps(a,a,a,a,a,a,a,a);
|
||||
}
|
||||
//Complex double
|
||||
inline __m256d operator()(double a, double b){
|
||||
return _mm256_set_pd(b,a,b,a);
|
||||
}
|
||||
//Real double
|
||||
inline __m256d operator()(double a){
|
||||
return _mm256_set_pd(a,a,a,a);
|
||||
}
|
||||
//Integer
|
||||
inline __m256i operator()(Integer a){
|
||||
return _mm256_set1_epi32(a);
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
//Float
|
||||
inline void operator()(__m256 a, float* F){
|
||||
_mm256_store_ps(F,a);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(__m256d a, double* D){
|
||||
_mm256_store_pd(D,a);
|
||||
}
|
||||
//Integer
|
||||
inline void operator()(__m256i a, Integer* I){
|
||||
_mm256_store_si256((__m256i*)I,a);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct Vstream{
|
||||
//Float
|
||||
inline void operator()(float * a, __m256 b){
|
||||
_mm256_stream_ps(a,b);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(double * a, __m256d b){
|
||||
_mm256_stream_pd(a,b);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct Vset{
|
||||
// Complex float
|
||||
inline __m256 operator()(Grid::ComplexF *a){
|
||||
return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
}
|
||||
// Complex double
|
||||
inline __m256d operator()(Grid::ComplexD *a){
|
||||
return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
}
|
||||
// Real float
|
||||
inline __m256 operator()(float *a){
|
||||
return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
}
|
||||
// Real double
|
||||
inline __m256d operator()(double *a){
|
||||
return _mm256_set_pd(a[3],a[2],a[1],a[0]);
|
||||
}
|
||||
// Integer
|
||||
inline __m256i operator()(Integer *a){
|
||||
return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
//General form must generate error if compiled
|
||||
inline Out_type operator()(In_type in){
|
||||
printf("Error, using wrong Reduce function\n");
|
||||
exit(1);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
//Complex/Real float
|
||||
inline __m256 operator()(__m256 a, __m256 b){
|
||||
return _mm256_add_ps(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
inline __m256d operator()(__m256d a, __m256d b){
|
||||
return _mm256_add_pd(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline __m256i operator()(__m256i a, __m256i b){
|
||||
#if defined (AVX1)
|
||||
__m128i a0,a1;
|
||||
__m128i b0,b1;
|
||||
a0 = _mm256_extractf128_si256(a,0);
|
||||
b0 = _mm256_extractf128_si256(b,0);
|
||||
a1 = _mm256_extractf128_si256(a,1);
|
||||
b1 = _mm256_extractf128_si256(b,1);
|
||||
a0 = _mm_add_epi32(a0,b0);
|
||||
a1 = _mm_add_epi32(a1,b1);
|
||||
return _mm256_set_m128i(a1,a0);
|
||||
#endif
|
||||
#if defined (AVX2)
|
||||
return _mm256_add_epi32(a,b);
|
||||
#endif
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
//Complex/Real float
|
||||
inline __m256 operator()(__m256 a, __m256 b){
|
||||
return _mm256_sub_ps(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
inline __m256d operator()(__m256d a, __m256d b){
|
||||
return _mm256_sub_pd(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline __m256i operator()(__m256i a, __m256i b){
|
||||
#if defined (AVX1)
|
||||
__m128i a0,a1;
|
||||
__m128i b0,b1;
|
||||
a0 = _mm256_extractf128_si256(a,0);
|
||||
b0 = _mm256_extractf128_si256(b,0);
|
||||
a1 = _mm256_extractf128_si256(a,1);
|
||||
b1 = _mm256_extractf128_si256(b,1);
|
||||
a0 = _mm_sub_epi32(a0,b0);
|
||||
a1 = _mm_sub_epi32(a1,b1);
|
||||
return _mm256_set_m128i(a1,a0);
|
||||
#endif
|
||||
#if defined (AVX2)
|
||||
return _mm256_sub_epi32(a,b);
|
||||
#endif
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct MultComplex{
|
||||
// Complex float
|
||||
inline __m256 operator()(__m256 a, __m256 b){
|
||||
__m256 ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
||||
// FIXME AVX2 could MAC
|
||||
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
|
||||
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
|
||||
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
return _mm256_addsub_ps(ymm0,ymm1);
|
||||
}
|
||||
// Complex double
|
||||
inline __m256d operator()(__m256d a, __m256d b){
|
||||
//Multiplication of (ak+ibk)*(ck+idk)
|
||||
// a + i b can be stored as a data structure
|
||||
//From intel optimisation reference guide
|
||||
/*
|
||||
movsldup xmm0, Src1; load real parts into the destination,
|
||||
; a1, a1, a0, a0
|
||||
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
|
||||
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
|
||||
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
|
||||
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
|
||||
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
|
||||
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
|
||||
VSHUFPD (VEX.256 encoded version)
|
||||
IF IMM0[0] = 0
|
||||
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
|
||||
IF IMM0[1] = 0
|
||||
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
|
||||
IF IMM0[2] = 0
|
||||
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
|
||||
IF IMM0[3] = 0
|
||||
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged
|
||||
*/
|
||||
|
||||
__m256d ymm0,ymm1,ymm2;
|
||||
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
|
||||
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
|
||||
ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01
|
||||
ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
|
||||
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
|
||||
return _mm256_addsub_pd(ymm0,ymm1);
|
||||
}
|
||||
};
|
||||
|
||||
struct Mult{
|
||||
// Real float
|
||||
inline __m256 operator()(__m256 a, __m256 b){
|
||||
return _mm256_mul_ps(a,b);
|
||||
}
|
||||
// Real double
|
||||
inline __m256d operator()(__m256d a, __m256d b){
|
||||
return _mm256_mul_pd(a,b);
|
||||
}
|
||||
// Integer
|
||||
inline __m256i operator()(__m256i a, __m256i b){
|
||||
#if defined (AVX1)
|
||||
__m128i a0,a1;
|
||||
__m128i b0,b1;
|
||||
a0 = _mm256_extractf128_si256(a,0);
|
||||
b0 = _mm256_extractf128_si256(b,0);
|
||||
a1 = _mm256_extractf128_si256(a,1);
|
||||
b1 = _mm256_extractf128_si256(b,1);
|
||||
a0 = _mm_mul_epi32(a0,b0);
|
||||
a1 = _mm_mul_epi32(a1,b1);
|
||||
return _mm256_set_m128i(a1,a0);
|
||||
#endif
|
||||
#if defined (AVX2)
|
||||
return _mm256_mul_epi32(a,b);
|
||||
#endif
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct Conj{
|
||||
// Complex single
|
||||
inline __m256 operator()(__m256 in){
|
||||
return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
|
||||
}
|
||||
// Complex double
|
||||
inline __m256d operator()(__m256d in){
|
||||
return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested
|
||||
/*
|
||||
// original
|
||||
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
|
||||
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5));
|
||||
return _mm256_shuffle_pd(tmp,tmp,0x5);
|
||||
*/
|
||||
}
|
||||
// do not define for integer input
|
||||
};
|
||||
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
inline __m256 operator()(__m256 in, __m256 ret){
|
||||
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
|
||||
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
|
||||
}
|
||||
//Complex double
|
||||
inline __m256d operator()(__m256d in, __m256d ret){
|
||||
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
|
||||
return _mm256_shuffle_pd(tmp,tmp,0x5);
|
||||
}
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline __m256 operator()(__m256 in, __m256 ret){
|
||||
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
|
||||
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
|
||||
}
|
||||
//Complex double
|
||||
inline __m256d operator()(__m256d in, __m256d ret){
|
||||
__m256d tmp = _mm256_shuffle_pd(in,in,0x5);
|
||||
return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
template < typename vtype >
|
||||
void permute(vtype a, vtype b, int perm) {
|
||||
union {
|
||||
__m256 f;
|
||||
vtype v;
|
||||
} conv;
|
||||
conv.v = b;
|
||||
switch (perm){
|
||||
// 8x32 bits=>3 permutes
|
||||
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
|
||||
default: assert(0); break;
|
||||
}
|
||||
a = conv.v;
|
||||
|
||||
}
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
|
||||
__m256 v1,v2;
|
||||
Optimization::permute(v1,in,0); // sse 128; paired complex single
|
||||
v1 = _mm256_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1); // avx 256; quad complex single
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
return Grid::ComplexF(v1[0],v1[1]);
|
||||
}
|
||||
//Real float Reduce
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
|
||||
__m256 v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; octo-double
|
||||
v1 = _mm256_add_ps(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
Optimization::permute(v2,v1,2);
|
||||
v1 = _mm256_add_ps(v1,v2);
|
||||
return v1[0];
|
||||
}
|
||||
|
||||
|
||||
//Complex double Reduce
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
|
||||
__m256d v1;
|
||||
Optimization::permute(v1,in,0); // sse 128; paired complex single
|
||||
v1 = _mm256_add_pd(v1,in);
|
||||
return Grid::ComplexD(v1[0],v1[1]);
|
||||
}
|
||||
|
||||
//Real double Reduce
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
|
||||
__m256d v1,v2;
|
||||
Optimization::permute(v1,in,0); // avx 256; quad double
|
||||
v1 = _mm256_add_pd(v1,in);
|
||||
Optimization::permute(v2,v1,1);
|
||||
v1 = _mm256_add_pd(v1,v2);
|
||||
return v1[0];
|
||||
}
|
||||
|
||||
//Integer Reduce
|
||||
template<>
|
||||
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
|
||||
// FIXME unimplemented
|
||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here assign types
|
||||
namespace Grid {
|
||||
typedef __m256 SIMD_Ftype; // Single precision type
|
||||
typedef __m256d SIMD_Dtype; // Double precision type
|
||||
typedef __m256i SIMD_Itype; // Integer type
|
||||
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
typedef Optimization::Vset VsetSIMD;
|
||||
typedef Optimization::Vstream VstreamSIMD;
|
||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||
|
||||
|
||||
// Arithmetic operations
|
||||
typedef Optimization::Sum SumSIMD;
|
||||
typedef Optimization::Sub SubSIMD;
|
||||
typedef Optimization::Mult MultSIMD;
|
||||
typedef Optimization::MultComplex MultComplexSIMD;
|
||||
typedef Optimization::Conj ConjSIMD;
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
}
|
323
lib/simd/Grid_knc.h
Normal file
323
lib/simd/Grid_knc.h
Normal file
@ -0,0 +1,323 @@
|
||||
//----------------------------------------------------------------------
|
||||
/*! @file Grid_knc.h
|
||||
@brief Optimization libraries for AVX512 instructions set for KNC
|
||||
|
||||
Using intrinsics
|
||||
*/
|
||||
// Time-stamp: <2015-05-22 17:12:44 neo>
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#include <immintrin.h>
|
||||
#ifndef KNC_ONLY_STORES
|
||||
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
|
||||
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
|
||||
#endif
|
||||
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
inline __m512 operator()(float a, float b){
|
||||
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
|
||||
}
|
||||
// Real float
|
||||
inline __m512 operator()(float a){
|
||||
return _mm512_set1_ps(a);
|
||||
}
|
||||
//Complex double
|
||||
inline __m512d operator()(double a, double b){
|
||||
return _mm512_set_pd(b,a,b,a,b,a,b,a);
|
||||
}
|
||||
//Real double
|
||||
inline __m512d operator()(double a){
|
||||
return _mm512_set1_pd(a);
|
||||
}
|
||||
//Integer
|
||||
inline __m512i operator()(Integer a){
|
||||
return _mm512_set1_epi32(a);
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
//Float
|
||||
inline void operator()(__m512 a, float* F){
|
||||
_mm512_store_ps(F,a);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(__m512d a, double* D){
|
||||
_mm512_store_pd(D,a);
|
||||
}
|
||||
//Integer
|
||||
inline void operator()(__m512i a, Integer* I){
|
||||
_mm512_store_si512((__m512i *)I,a);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct Vstream{
|
||||
//Float
|
||||
inline void operator()(float * a, __m512 b){
|
||||
_mm512_storenrngo_ps(a,b);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(double * a, __m512d b){
|
||||
_mm512_storenrngo_pd(a,b);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct Vset{
|
||||
// Complex float
|
||||
inline __m512 operator()(Grid::ComplexF *a){
|
||||
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
|
||||
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
|
||||
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
|
||||
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
}
|
||||
// Complex double
|
||||
inline __m512d operator()(Grid::ComplexD *a){
|
||||
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
|
||||
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
|
||||
}
|
||||
// Real float
|
||||
inline __m512 operator()(float *a){
|
||||
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
|
||||
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
}
|
||||
// Real double
|
||||
inline __m512d operator()(double *a){
|
||||
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
}
|
||||
// Integer
|
||||
inline __m512i operator()(Integer *a){
|
||||
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
|
||||
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
//General form must generate error if compiled
|
||||
inline Out_type operator()(In_type in){
|
||||
printf("Error, using wrong Reduce function\n");
|
||||
exit(1);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
//Complex/Real float
|
||||
inline __m512 operator()(__m512 a, __m512 b){
|
||||
return _mm512_add_ps(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
inline __m512d operator()(__m512d a, __m512d b){
|
||||
return _mm512_add_pd(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline __m512i operator()(__m512i a, __m512i b){
|
||||
return _mm512_add_epi32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
//Complex/Real float
|
||||
inline __m512 operator()(__m512 a, __m512 b){
|
||||
return _mm512_sub_ps(a,b);
|
||||
}
|
||||
//Complex/Real double
|
||||
inline __m512d operator()(__m512d a, __m512d b){
|
||||
return _mm512_sub_pd(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline __m512i operator()(__m512i a, __m512i b){
|
||||
return _mm512_sub_epi32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct MultComplex{
|
||||
// Complex float
|
||||
inline __m512 operator()(__m512 a, __m512 b){
|
||||
__m512 vzero,ymm0,ymm1,real, imag;
|
||||
vzero = _mm512_setzero_ps();
|
||||
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
|
||||
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
|
||||
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
|
||||
ymm1 = _mm512_mul_ps(real, b);
|
||||
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
|
||||
return _mm512_fmadd_ps(ymm0,imag,ymm1);
|
||||
}
|
||||
// Complex double
|
||||
inline __m512d operator()(__m512d a, __m512d b){
|
||||
/* This is from
|
||||
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
|
||||
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
|
||||
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
|
||||
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
|
||||
* booktitle = {Proceedings of the International Conference on Supercomputing},
|
||||
* series = {ICS '11},
|
||||
* year = {2011},
|
||||
* isbn = {978-1-4503-0102-2},
|
||||
* location = {Tucson, Arizona, USA},
|
||||
* pages = {265--274},
|
||||
* numpages = {10},
|
||||
* url = {http://doi.acm.org/10.1145/1995896.1995938},
|
||||
* doi = {10.1145/1995896.1995938},
|
||||
* acmid = {1995938},
|
||||
* publisher = {ACM},
|
||||
* address = {New York, NY, USA},
|
||||
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
|
||||
* }
|
||||
*/
|
||||
__m512d vzero,ymm0,ymm1,real,imag;
|
||||
vzero =_mm512_setzero_pd();
|
||||
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
|
||||
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
|
||||
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
|
||||
ymm1 = _mm512_mul_pd(real, b);
|
||||
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
|
||||
return _mm512_fmadd_pd(ymm0,imag,ymm1);
|
||||
}
|
||||
};
|
||||
|
||||
struct Mult{
|
||||
// Real float
|
||||
inline __m512 operator()(__m512 a, __m512 b){
|
||||
return _mm512_mul_ps(a,b);
|
||||
}
|
||||
// Real double
|
||||
inline __m512d operator()(__m512d a, __m512d b){
|
||||
return _mm512_mul_pd(a,b);
|
||||
}
|
||||
// Integer
|
||||
inline __m512i operator()(__m512i a, __m512i b){
|
||||
return _mm512_mullo_epi32(a,b);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct Conj{
|
||||
// Complex single
|
||||
inline __m512 operator()(__m512 in){
|
||||
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
|
||||
}
|
||||
// Complex double
|
||||
inline __m512d operator()(__m512d in){
|
||||
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
|
||||
}
|
||||
// do not define for integer input
|
||||
};
|
||||
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
inline __m512 operator()(__m512 in, __m512 ret){
|
||||
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
|
||||
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
|
||||
}
|
||||
//Complex double
|
||||
inline __m512d operator()(__m512d in, __m512d ret){
|
||||
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
|
||||
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline __m512 operator()(__m512 in, __m512 ret){
|
||||
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
|
||||
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
|
||||
}
|
||||
//Complex double
|
||||
inline __m512d operator()(__m512d in, __m512d ret){
|
||||
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
|
||||
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
|
||||
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
|
||||
}
|
||||
//Real float Reduce
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
|
||||
return _mm512_reduce_add_ps(in);
|
||||
}
|
||||
|
||||
|
||||
//Complex double Reduce
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
|
||||
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
|
||||
}
|
||||
|
||||
//Real double Reduce
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
|
||||
return _mm512_reduce_add_pd(in);
|
||||
}
|
||||
|
||||
//Integer Reduce
|
||||
template<>
|
||||
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
|
||||
// FIXME unimplemented
|
||||
printf("Reduce : Missing integer implementation -> FIX\n");
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here assign types
|
||||
namespace Grid {
|
||||
typedef __m512 SIMD_Ftype; // Single precision type
|
||||
typedef __m512d SIMD_Dtype; // Double precision type
|
||||
typedef __m512i SIMD_Itype; // Integer type
|
||||
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
typedef Optimization::Vset VsetSIMD;
|
||||
typedef Optimization::Vstream VstreamSIMD;
|
||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||
|
||||
|
||||
// Arithmetic operations
|
||||
typedef Optimization::Sum SumSIMD;
|
||||
typedef Optimization::Sub SubSIMD;
|
||||
typedef Optimization::Mult MultSIMD;
|
||||
typedef Optimization::MultComplex MultComplexSIMD;
|
||||
typedef Optimization::Conj ConjSIMD;
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
}
|
272
lib/simd/Grid_qpx.h
Normal file
272
lib/simd/Grid_qpx.h
Normal file
@ -0,0 +1,272 @@
|
||||
//----------------------------------------------------------------------
|
||||
/*! @file Grid_qpx.h
|
||||
@brief Optimization libraries for QPX instructions set for BG/Q
|
||||
|
||||
Using intrinsics
|
||||
*/
|
||||
// Time-stamp: <2015-05-22 17:29:26 neo>
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
// lot of undefined functions
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
inline float operator()(float a, float b){
|
||||
return {a,b,a,b};
|
||||
}
|
||||
// Real float
|
||||
inline float operator()(float a){
|
||||
return {a,a,a,a};
|
||||
}
|
||||
//Complex double
|
||||
inline vector4double operator()(double a, double b){
|
||||
return {a,b,a,b};
|
||||
}
|
||||
//Real double
|
||||
inline vector4double operator()(double a){
|
||||
return {a,a,a,a};
|
||||
}
|
||||
//Integer
|
||||
inline int operator()(Integer a){
|
||||
#error
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
//Float
|
||||
inline void operator()(float a, float* F){
|
||||
assert(0);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(vector4double a, double* D){
|
||||
assert(0);
|
||||
}
|
||||
//Integer
|
||||
inline void operator()(int a, Integer* I){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
struct Vstream{
|
||||
//Float
|
||||
inline void operator()(float * a, float b){
|
||||
assert(0);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(double * a, vector4double b){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct Vset{
|
||||
// Complex float
|
||||
inline float operator()(Grid::ComplexF *a){
|
||||
return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
|
||||
}
|
||||
// Complex double
|
||||
inline vector4double operator()(Grid::ComplexD *a){
|
||||
return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
|
||||
}
|
||||
// Real float
|
||||
inline float operator()(float *a){
|
||||
return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
|
||||
}
|
||||
// Real double
|
||||
inline vector4double operator()(double *a){
|
||||
return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
|
||||
}
|
||||
// Integer
|
||||
inline int operator()(Integer *a){
|
||||
#error
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
//General form must generate error if compiled
|
||||
inline Out_type operator()(In_type in){
|
||||
printf("Error, using wrong Reduce function\n");
|
||||
exit(1);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
//Complex/Real float
|
||||
inline float operator()(float a, float b){
|
||||
#error
|
||||
}
|
||||
//Complex/Real double
|
||||
inline vector4double operator()(vector4double a, vector4double b){
|
||||
return vec_add(a,b);
|
||||
}
|
||||
//Integer
|
||||
inline int operator()(int a, int b){
|
||||
#error
|
||||
}
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
//Complex/Real float
|
||||
inline float operator()(float a, float b){
|
||||
#error
|
||||
}
|
||||
//Complex/Real double
|
||||
inline vector4double operator()(vector4double a, vector4double b){
|
||||
#error
|
||||
}
|
||||
//Integer
|
||||
inline floati operator()(int a, int b){
|
||||
#error
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct MultComplex{
|
||||
// Complex float
|
||||
inline float operator()(float a, float b){
|
||||
#error
|
||||
}
|
||||
// Complex double
|
||||
inline vector4double operator()(vector4double a, vector4double b){
|
||||
#error
|
||||
}
|
||||
};
|
||||
|
||||
struct Mult{
|
||||
// Real float
|
||||
inline float operator()(float a, float b){
|
||||
#error
|
||||
}
|
||||
// Real double
|
||||
inline vector4double operator()(vector4double a, vector4double b){
|
||||
#error
|
||||
}
|
||||
// Integer
|
||||
inline int operator()(int a, int b){
|
||||
#error
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct Conj{
|
||||
// Complex single
|
||||
inline float operator()(float in){
|
||||
assert(0);
|
||||
}
|
||||
// Complex double
|
||||
inline vector4double operator()(vector4double in){
|
||||
assert(0);
|
||||
}
|
||||
// do not define for integer input
|
||||
};
|
||||
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
inline float operator()(float in, float ret){
|
||||
assert(0);
|
||||
}
|
||||
//Complex double
|
||||
inline vector4double operator()(vector4double in, vector4double ret){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline float operator()(float in, float ret){
|
||||
|
||||
}
|
||||
//Complex double
|
||||
inline vector4double operator()(vector4double in, vector4double ret){
|
||||
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
|
||||
assert(0);
|
||||
}
|
||||
//Real float Reduce
|
||||
template<>
|
||||
inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
//Complex double Reduce
|
||||
template<>
|
||||
inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
//Real double Reduce
|
||||
template<>
|
||||
inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
//Integer Reduce
|
||||
template<>
|
||||
inline Integer Reduce<Integer, floati>::operator()(float in){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here assign types
|
||||
namespace Grid {
|
||||
typedef float SIMD_Ftype __attribute__ ((vector_size (16))); // Single precision type
|
||||
typedef vector4double SIMD_Dtype; // Double precision type
|
||||
typedef int SIMD_Itype; // Integer type
|
||||
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
typedef Optimization::Vset VsetSIMD;
|
||||
typedef Optimization::Vstream VstreamSIMD;
|
||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||
|
||||
|
||||
// Arithmetic operations
|
||||
typedef Optimization::Sum SumSIMD;
|
||||
typedef Optimization::Sub SubSIMD;
|
||||
typedef Optimization::Mult MultSIMD;
|
||||
typedef Optimization::MultComplex MultComplexSIMD;
|
||||
typedef Optimization::Conj ConjSIMD;
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
}
|
@ -4,7 +4,7 @@
|
||||
|
||||
Using intrinsics
|
||||
*/
|
||||
// Time-stamp: <2015-05-20 16:45:39 neo>
|
||||
// Time-stamp: <2015-05-21 18:06:30 neo>
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#include <pmmintrin.h>
|
||||
@ -53,12 +53,12 @@ namespace Optimization {
|
||||
|
||||
struct Vstream{
|
||||
//Float
|
||||
inline void operator()(__m128 a, __m128 b){
|
||||
_mm_stream_ps((float *)&a,b);
|
||||
inline void operator()(float * a, __m128 b){
|
||||
_mm_stream_ps(a,b);
|
||||
}
|
||||
//Double
|
||||
inline void operator()(__m128d a, __m128d b){
|
||||
_mm_stream_pd((double *)&a,b);
|
||||
inline void operator()(double * a, __m128d b){
|
||||
_mm_stream_pd(a,b);
|
||||
}
|
||||
|
||||
|
||||
|
@ -2,13 +2,23 @@
|
||||
/*! @file Grid_vector_types.h
|
||||
@brief Defines templated class Grid_simd to deal with inner vector types
|
||||
*/
|
||||
// Time-stamp: <2015-05-20 17:31:55 neo>
|
||||
// Time-stamp: <2015-05-26 13:44:54 neo>
|
||||
//---------------------------------------------------------------------------
|
||||
#ifndef GRID_VECTOR_TYPES
|
||||
#define GRID_VECTOR_TYPES
|
||||
|
||||
#ifdef SSE4
|
||||
#include "Grid_sse4.h"
|
||||
|
||||
#endif
|
||||
#if defined (AVX1)|| defined (AVX2)
|
||||
#include "Grid_avx.h"
|
||||
#endif
|
||||
#if defined AVX512
|
||||
#include "Grid_knc.h"
|
||||
#endif
|
||||
#if defined QPX
|
||||
#include "Grid_qpx.h"
|
||||
#endif
|
||||
|
||||
namespace Grid {
|
||||
|
||||
@ -25,8 +35,6 @@ namespace Grid {
|
||||
template <typename Condition, typename ReturnType> using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType>>;
|
||||
template <typename Condition, typename ReturnType> using NotEnableIf= Invoke<std::enable_if<!Condition::value, ReturnType>>;
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Check for complexity with type traits
|
||||
template <typename T> struct is_complex : std::false_type {};
|
||||
@ -36,18 +44,71 @@ namespace Grid {
|
||||
// general forms to allow for vsplat syntax
|
||||
// need explicit declaration of types when used since
|
||||
// clang cannot automatically determine the output type sometimes
|
||||
// use decltype?
|
||||
template < class Out, class Input1, class Input2, class Operation >
|
||||
Out binary(Input1 src_1, Input2 src_2, Operation op){
|
||||
return op(src_1, src_2);
|
||||
}
|
||||
|
||||
template < class SIMDout, class Input, class Operation >
|
||||
SIMDout unary(Input src, Operation op){
|
||||
template < class Out, class Input, class Operation >
|
||||
Out unary(Input src, Operation op){
|
||||
return op(src);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Permute
|
||||
// Permute 0 every ABCDEFGH -> BA DC FE HG
|
||||
// Permute 1 every ABCDEFGH -> CD AB GH EF
|
||||
// Permute 2 every ABCDEFGH -> EFGH ABCD
|
||||
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
|
||||
// Permute 4 possible on half precision @512bit vectors.
|
||||
//////////////////////////////////////////////////////////
|
||||
template<class vsimd>
|
||||
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
|
||||
union {
|
||||
SIMD_Ftype f;
|
||||
decltype(vsimd::v) v;
|
||||
} conv;
|
||||
conv.v = b.v;
|
||||
switch (perm){
|
||||
#if defined(AVX1)||defined(AVX2)
|
||||
// 8x32 bits=>3 permutes
|
||||
case 2:
|
||||
conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1));
|
||||
break;
|
||||
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
|
||||
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
|
||||
#endif
|
||||
#ifdef SSE4
|
||||
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break;
|
||||
#endif
|
||||
#ifdef AVX512
|
||||
// 16 floats=> permutes
|
||||
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
|
||||
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
|
||||
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
|
||||
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
|
||||
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
|
||||
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
|
||||
case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
|
||||
case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
|
||||
#endif
|
||||
#ifdef QPX
|
||||
#error not implemented
|
||||
#endif
|
||||
default: assert(0); break;
|
||||
}
|
||||
y.v=conv.v;
|
||||
|
||||
};
|
||||
|
||||
///////////////////////////////////////
|
||||
|
||||
|
||||
|
||||
/*
|
||||
@brief Grid_simd class for the SIMD vector type operations
|
||||
*/
|
||||
@ -56,6 +117,9 @@ namespace Grid {
|
||||
|
||||
public:
|
||||
typedef typename RealPart < Scalar_type >::type Real;
|
||||
typedef Vector_type vector_type;
|
||||
typedef Scalar_type scalar_type;
|
||||
|
||||
Vector_type v;
|
||||
|
||||
|
||||
@ -66,17 +130,21 @@ namespace Grid {
|
||||
vzero(*this);
|
||||
return (*this);
|
||||
}
|
||||
Grid_simd(){};
|
||||
|
||||
Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;};
|
||||
Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler
|
||||
Grid_simd()=default;
|
||||
Grid_simd(const Grid_simd& rhs):v(rhs.v){}; //compiles in movaps
|
||||
Grid_simd(const Grid_simd&& rhs):v(rhs.v){};
|
||||
|
||||
//Enable if complex type
|
||||
template < class S = Scalar_type >
|
||||
Grid_simd(typename std::enable_if< is_complex < S >::value, S>::type a){
|
||||
Grid_simd(const typename std::enable_if< is_complex < S >::value, S>::type a){
|
||||
vsplat(*this,a);
|
||||
};
|
||||
|
||||
|
||||
Grid_simd(Real a){
|
||||
Grid_simd(const Real a){
|
||||
vsplat(*this,Scalar_type(a));
|
||||
};
|
||||
|
||||
@ -88,18 +156,25 @@ namespace Grid {
|
||||
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
//not for integer types... FIXME
|
||||
|
||||
friend inline void mac (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
friend inline void mult(Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
|
||||
friend inline void sub (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
|
||||
friend inline void mac (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ a,const Scalar_type *__restrict__ x){ *y = (*a)*(*x)+(*y); };
|
||||
friend inline void mult(Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) * (*r); }
|
||||
friend inline void sub (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) - (*r); }
|
||||
friend inline void add (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) + (*r); }
|
||||
|
||||
|
||||
|
||||
//not for integer types...
|
||||
template < class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 >
|
||||
friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Initialise to 1,0,i for the correct types
|
||||
///////////////////////////////////////////////
|
||||
// if not complex overload here
|
||||
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
|
||||
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
|
||||
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
|
||||
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
|
||||
|
||||
// For complex types
|
||||
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
|
||||
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); }
|
||||
@ -108,6 +183,14 @@ namespace Grid {
|
||||
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
|
||||
friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
|
||||
|
||||
// if not complex overload here
|
||||
template < class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 >
|
||||
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
|
||||
template < class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 >
|
||||
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
|
||||
|
||||
|
||||
|
||||
// For integral types
|
||||
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
|
||||
friend inline void vone(Grid_simd &ret) { vsplat(ret,1); }
|
||||
@ -116,7 +199,7 @@ namespace Grid {
|
||||
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
|
||||
friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
|
||||
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
|
||||
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
|
||||
friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);}
|
||||
|
||||
////////////////////////////////////
|
||||
// Arithmetic operator overloads +,-,*
|
||||
@ -192,8 +275,9 @@ namespace Grid {
|
||||
///////////////////////
|
||||
// Vstream
|
||||
///////////////////////
|
||||
template < class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 >
|
||||
friend inline void vstream(Grid_simd &out,const Grid_simd &in){
|
||||
binary<void>(out.v, in.v, VstreamSIMD());
|
||||
binary<void>((Real*)&out.v, in.v, VstreamSIMD());
|
||||
}
|
||||
|
||||
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
|
||||
@ -291,7 +375,7 @@ namespace Grid {
|
||||
// Unary negation
|
||||
///////////////////////
|
||||
friend inline Grid_simd operator -(const Grid_simd &r) {
|
||||
vComplexF ret;
|
||||
Grid_simd ret;
|
||||
vzero(ret);
|
||||
ret = ret - r;
|
||||
return ret;
|
||||
@ -354,33 +438,15 @@ namespace Grid {
|
||||
|
||||
// Define available types (now change names to avoid clashing with the rest of the code)
|
||||
|
||||
typedef Grid_simd< float , SIMD_Ftype > MyRealF;
|
||||
typedef Grid_simd< double , SIMD_Dtype > MyRealD;
|
||||
typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
|
||||
typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;
|
||||
typedef Grid_simd< float , SIMD_Ftype > vRealF;
|
||||
typedef Grid_simd< double , SIMD_Dtype > vRealD;
|
||||
typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
|
||||
typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
|
||||
typedef Grid_simd< Integer , SIMD_Itype > vInteger;
|
||||
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Temporary hack to keep independent from the rest of the code
|
||||
template<> struct isGridTensor<MyRealD > {
|
||||
static const bool value = false;
|
||||
static const bool notvalue = true;
|
||||
};
|
||||
template<> struct isGridTensor<MyRealF > {
|
||||
static const bool value = false;
|
||||
static const bool notvalue = true;
|
||||
};
|
||||
template<> struct isGridTensor<MyComplexD > {
|
||||
static const bool value = false;
|
||||
static const bool notvalue = true;
|
||||
};
|
||||
template<> struct isGridTensor<MyComplexF > {
|
||||
static const bool value = false;
|
||||
static const bool notvalue = true;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
167
m4/ax_cxx_compile_stdcxx_11.m4
Normal file
167
m4/ax_cxx_compile_stdcxx_11.m4
Normal file
@ -0,0 +1,167 @@
|
||||
# ============================================================================
|
||||
# http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
|
||||
# ============================================================================
|
||||
#
|
||||
# SYNOPSIS
|
||||
#
|
||||
# AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional])
|
||||
#
|
||||
# DESCRIPTION
|
||||
#
|
||||
# Check for baseline language coverage in the compiler for the C++11
|
||||
# standard; if necessary, add switches to CXXFLAGS to enable support.
|
||||
#
|
||||
# The first argument, if specified, indicates whether you insist on an
|
||||
# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
|
||||
# -std=c++11). If neither is specified, you get whatever works, with
|
||||
# preference for an extended mode.
|
||||
#
|
||||
# The second argument, if specified 'mandatory' or if left unspecified,
|
||||
# indicates that baseline C++11 support is required and that the macro
|
||||
# should error out if no mode with that support is found. If specified
|
||||
# 'optional', then configuration proceeds regardless, after defining
|
||||
# HAVE_CXX11 if and only if a supporting mode is found.
|
||||
#
|
||||
# LICENSE
|
||||
#
|
||||
# Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
|
||||
# Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
|
||||
# Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
|
||||
# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov@google.com>
|
||||
#
|
||||
# Copying and distribution of this file, with or without modification, are
|
||||
# permitted in any medium without royalty provided the copyright notice
|
||||
# and this notice are preserved. This file is offered as-is, without any
|
||||
# warranty.
|
||||
|
||||
#serial 11
|
||||
|
||||
m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[
|
||||
template <typename T>
|
||||
struct check
|
||||
{
|
||||
static_assert(sizeof(int) <= sizeof(T), "not big enough");
|
||||
};
|
||||
|
||||
struct Base {
|
||||
virtual void f() {}
|
||||
};
|
||||
struct Child : public Base {
|
||||
virtual void f() override {}
|
||||
};
|
||||
|
||||
typedef check<check<bool>> right_angle_brackets;
|
||||
|
||||
int a;
|
||||
decltype(a) b;
|
||||
|
||||
typedef check<int> check_type;
|
||||
check_type c;
|
||||
check_type&& cr = static_cast<check_type&&>(c);
|
||||
|
||||
auto d = a;
|
||||
auto l = [](){};
|
||||
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
|
||||
struct use_l { use_l() { l(); } };
|
||||
|
||||
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
|
||||
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
|
||||
namespace test_template_alias_sfinae {
|
||||
struct foo {};
|
||||
|
||||
template<typename T>
|
||||
using member = typename T::member_type;
|
||||
|
||||
template<typename T>
|
||||
void func(...) {}
|
||||
|
||||
template<typename T>
|
||||
void func(member<T>*) {}
|
||||
|
||||
void test();
|
||||
|
||||
void test() {
|
||||
func<foo>(0);
|
||||
}
|
||||
}
|
||||
]])
|
||||
|
||||
AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
|
||||
m4_if([$1], [], [],
|
||||
[$1], [ext], [],
|
||||
[$1], [noext], [],
|
||||
[m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
|
||||
m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
|
||||
[$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
|
||||
[$2], [optional], [ax_cxx_compile_cxx11_required=false],
|
||||
[m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])
|
||||
AC_LANG_PUSH([C++])dnl
|
||||
ac_success=no
|
||||
AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
|
||||
ax_cv_cxx_compile_cxx11,
|
||||
[AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
|
||||
[ax_cv_cxx_compile_cxx11=yes],
|
||||
[ax_cv_cxx_compile_cxx11=no])])
|
||||
if test x$ax_cv_cxx_compile_cxx11 = xyes; then
|
||||
ac_success=yes
|
||||
fi
|
||||
|
||||
m4_if([$1], [noext], [], [dnl
|
||||
if test x$ac_success = xno; then
|
||||
for switch in -std=gnu++11 -std=gnu++0x; do
|
||||
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
|
||||
AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
|
||||
$cachevar,
|
||||
[ac_save_CXXFLAGS="$CXXFLAGS"
|
||||
CXXFLAGS="$CXXFLAGS $switch"
|
||||
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
|
||||
[eval $cachevar=yes],
|
||||
[eval $cachevar=no])
|
||||
CXXFLAGS="$ac_save_CXXFLAGS"])
|
||||
if eval test x\$$cachevar = xyes; then
|
||||
CXXFLAGS="$CXXFLAGS $switch"
|
||||
ac_success=yes
|
||||
break
|
||||
fi
|
||||
done
|
||||
fi])
|
||||
|
||||
m4_if([$1], [ext], [], [dnl
|
||||
if test x$ac_success = xno; then
|
||||
dnl HP's aCC needs +std=c++11 according to:
|
||||
dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
|
||||
for switch in -std=c++11 -std=c++0x +std=c++11; do
|
||||
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
|
||||
AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
|
||||
$cachevar,
|
||||
[ac_save_CXXFLAGS="$CXXFLAGS"
|
||||
CXXFLAGS="$CXXFLAGS $switch"
|
||||
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
|
||||
[eval $cachevar=yes],
|
||||
[eval $cachevar=no])
|
||||
CXXFLAGS="$ac_save_CXXFLAGS"])
|
||||
if eval test x\$$cachevar = xyes; then
|
||||
CXXFLAGS="$CXXFLAGS $switch"
|
||||
ac_success=yes
|
||||
break
|
||||
fi
|
||||
done
|
||||
fi])
|
||||
AC_LANG_POP([C++])
|
||||
if test x$ax_cxx_compile_cxx11_required = xtrue; then
|
||||
if test x$ac_success = xno; then
|
||||
AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.])
|
||||
fi
|
||||
else
|
||||
if test x$ac_success = xno; then
|
||||
HAVE_CXX11=0
|
||||
AC_MSG_NOTICE([No compiler with C++11 support was found])
|
||||
else
|
||||
HAVE_CXX11=1
|
||||
AC_DEFINE(HAVE_CXX11,1,
|
||||
[define if the compiler supports basic C++11 syntax])
|
||||
fi
|
||||
|
||||
AC_SUBST(HAVE_CXX11)
|
||||
fi
|
||||
])
|
@ -104,6 +104,9 @@ int main (int argc, char ** argv)
|
||||
random(FineRNG,scVec);
|
||||
|
||||
fflush(stdout);
|
||||
|
||||
|
||||
|
||||
cVec = cMat * cVec; // LatticeColourVector = LatticeColourMatrix * LatticeColourVector
|
||||
sVec = sMat * sVec; // LatticeSpinVector = LatticeSpinMatrix * LatticeSpinVector
|
||||
scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector
|
||||
@ -113,12 +116,14 @@ int main (int argc, char ** argv)
|
||||
cMat = outerProduct(cVec,cVec);
|
||||
scalar = localInnerProduct(cVec,cVec);
|
||||
|
||||
|
||||
scalar += scalar;
|
||||
scalar -= scalar;
|
||||
scalar *= scalar;
|
||||
add(scalar,scalar,scalar);
|
||||
sub(scalar,scalar,scalar);
|
||||
mult(scalar,scalar,scalar);
|
||||
|
||||
mac(scalar,scalar,scalar);
|
||||
scalar = scalar+scalar;
|
||||
scalar = scalar-scalar;
|
||||
@ -154,51 +159,11 @@ int main (int argc, char ** argv)
|
||||
// localNorm2
|
||||
// localInnerProduct
|
||||
|
||||
|
||||
scMat = sMat*scMat; // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix
|
||||
|
||||
|
||||
|
||||
#ifdef SSE4
|
||||
///////// Tests the new class Grid_simd
|
||||
std::complex<double> ctest(3.0,2.0);
|
||||
std::complex<float> ctestf(3.0,2.0);
|
||||
MyComplexF TestMe1(1.0); // fills only real part
|
||||
MyComplexD TestMe2(ctest);
|
||||
MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
|
||||
//MyRealF TestMe5(ctest);// Must generate compiler error
|
||||
MyRealD TestRe1(2.0);
|
||||
MyRealF TestRe2(3.0);
|
||||
|
||||
vone(TestRe2);
|
||||
|
||||
MyComplexF TestMe6(ctestf);
|
||||
MyComplexF TestMe7(ctestf);
|
||||
|
||||
MyComplexD TheSum= TestMe2*TestMe3;
|
||||
MyComplexF TheSumF= TestMe6*TestMe7;
|
||||
|
||||
|
||||
|
||||
double dsum[2];
|
||||
_mm_store_pd(dsum, TheSum.v);
|
||||
for (int i =0; i< 2; i++)
|
||||
printf("%f\n", dsum[i]);
|
||||
MyComplexD TheSumI = timesMinusI(TheSum);
|
||||
MyComplexF TheSumIF = timesMinusI(TheSumF);
|
||||
|
||||
float fsum[4];
|
||||
_mm_store_ps(fsum, TheSumF.v);
|
||||
for (int i =0; i< 4; i++)
|
||||
printf("%f\n", fsum[i]);
|
||||
|
||||
vstore(TheSumI, &ctest);
|
||||
std::complex<float> sum = Reduce(TheSumF);
|
||||
std::cout << ctest<< std::endl;
|
||||
std::cout << sum<< std::endl;
|
||||
|
||||
#endif
|
||||
///////////////////////
|
||||
|
||||
// Non-lattice (const objects) * Lattice
|
||||
ColourMatrix cm;
|
||||
SpinColourMatrix scm;
|
||||
@ -241,6 +206,9 @@ int main (int argc, char ** argv)
|
||||
scm=transpose(scm);
|
||||
scm=transposeIndex<1>(scm);
|
||||
|
||||
|
||||
|
||||
|
||||
// Foo = Foo+scalar; // LatticeColourMatrix+Scalar
|
||||
// Foo = Foo*scalar; // LatticeColourMatrix*Scalar
|
||||
// Foo = Foo-scalar; // LatticeColourMatrix-Scalar
|
||||
@ -280,7 +248,6 @@ int main (int argc, char ** argv)
|
||||
pokeIndex<1> (c_m,c,0,0);
|
||||
}
|
||||
|
||||
|
||||
FooBar = Bar;
|
||||
|
||||
/*
|
||||
@ -340,7 +307,7 @@ int main (int argc, char ** argv)
|
||||
// Benchmark some simple operations LatticeSU3 * Lattice SU3.
|
||||
double t0,t1,flops;
|
||||
double bytes;
|
||||
int ncall=100;
|
||||
int ncall=5000;
|
||||
int Nc = Grid::QCD::Nc;
|
||||
|
||||
LatticeGaugeField U(&Fine);
|
||||
@ -359,12 +326,14 @@ int main (int argc, char ** argv)
|
||||
Fine.Barrier();
|
||||
mult(FooBar,Foo,Bar); // this is better
|
||||
}
|
||||
|
||||
t1=usecond();
|
||||
Fine.Barrier();
|
||||
if ( Fine.IsBoss() ) {
|
||||
#ifdef OMP
|
||||
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
|
||||
#endif
|
||||
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
|
||||
printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
|
||||
printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
|
||||
}
|
||||
@ -526,5 +495,9 @@ int main (int argc, char ** argv)
|
||||
|
||||
} // loop for lat
|
||||
} // loop for omp
|
||||
|
||||
|
||||
std::cout << sizeof(vComplexF) << std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
0
tests/InvSqrt.gnu
Normal file
0
tests/InvSqrt.gnu
Normal file
@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
|
||||
#
|
||||
# Test code
|
||||
#
|
||||
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed Grid_simd_new
|
||||
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed
|
||||
|
||||
Grid_main_SOURCES = Grid_main.cc
|
||||
Grid_main_LDADD = -lGrid
|
||||
@ -34,5 +34,5 @@ Grid_stencil_LDADD = -lGrid
|
||||
Grid_simd_SOURCES = Grid_simd.cc
|
||||
Grid_simd_LDADD = -lGrid
|
||||
|
||||
Grid_simd_new_SOURCES = Grid_simd_new.cc
|
||||
Grid_simd_new_LDADD = -lGrid
|
||||
#Grid_simd_new_SOURCES = Grid_simd_new.cc
|
||||
#Grid_simd_new_LDADD = -lGrid
|
||||
|
2
tests/Sqrt.gnu
Normal file
2
tests/Sqrt.gnu
Normal file
@ -0,0 +1,2 @@
|
||||
f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));
|
||||
f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));
|
Loading…
Reference in New Issue
Block a user