1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-25 02:04:48 +01:00
Conflicts:
	lib/Grid_simd.h
This commit is contained in:
Peter Boyle
2015-05-26 20:04:08 +01:00
28 changed files with 1634 additions and 294 deletions

View File

@@ -1,7 +1,7 @@
# Makefile.in generated by automake 1.15 from Makefile.am.
# Makefile.in generated by automake 1.14.1 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2014 Free Software Foundation, Inc.
# Copyright (C) 1994-2013 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -14,17 +14,7 @@
@SET_MAKE@
VPATH = @srcdir@
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
@@ -89,12 +79,15 @@ build_triplet = @build@
host_triplet = @host@
target_triplet = @target@
subdir = .
DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
compile config.guess config.sub depcomp install-sh missing
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
$(am__configure_deps) $(am__DIST_COMMON)
am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
configure.lineno config.status.lineno
mkinstalldirs = $(install_sh) -d
@@ -157,9 +150,6 @@ ETAGS = etags
CTAGS = ctags
CSCOPE = cscope
DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
INSTALL NEWS README TODO compile config.guess config.sub \
depcomp install-sh missing
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
distdir = $(PACKAGE)-$(VERSION)
top_distdir = $(distdir)
@@ -223,6 +213,7 @@ ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -325,6 +316,7 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
@@ -531,15 +523,15 @@ dist-xz: distdir
$(am__post_remove_distdir)
dist-tarZ: distdir
@echo WARNING: "Support for distribution archives compressed with" \
"legacy program 'compress' is deprecated." >&2
@echo WARNING: "Support for shar distribution archives is" \
"deprecated." >&2
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
$(am__post_remove_distdir)
dist-shar: distdir
@echo WARNING: "Support for shar distribution archives is" \
"deprecated." >&2
@echo WARNING: "Support for distribution archives compressed with" \
"legacy program 'compress' is deprecated." >&2
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
$(am__post_remove_distdir)
@@ -575,17 +567,17 @@ distcheck: dist
esac
chmod -R a-w $(distdir)
chmod u+w $(distdir)
mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
mkdir $(distdir)/_build $(distdir)/_inst
chmod a-w $(distdir)
test -d $(distdir)/_build || exit 0; \
dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
&& dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
&& am__cwd=`pwd` \
&& $(am__cd) $(distdir)/_build/sub \
&& ../../configure \
&& $(am__cd) $(distdir)/_build \
&& ../configure \
$(AM_DISTCHECK_CONFIGURE_FLAGS) \
$(DISTCHECK_CONFIGURE_FLAGS) \
--srcdir=../.. --prefix="$$dc_install_base" \
--srcdir=.. --prefix="$$dc_install_base" \
&& $(MAKE) $(AM_MAKEFLAGS) \
&& $(MAKE) $(AM_MAKEFLAGS) dvi \
&& $(MAKE) $(AM_MAKEFLAGS) check \
@@ -759,8 +751,6 @@ uninstall-am:
maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
pdf-am ps ps-am tags tags-am uninstall uninstall-am
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.

4
TODO
View File

@@ -1,8 +1,8 @@
================================================================
*** Hacks and bug fixes to clean up and Audits
================================================================
* Base class to share common code between vRealF, VComplexF etc...
- Performance check on Guido's reimplementation strategy
* Base class to share common code between vRealF, VComplexF etc... done
- Performance check on Guido's reimplementation strategy - (GUIDO) tested and no difference was found, merged
* FIXME audit

62
aclocal.m4 vendored
View File

@@ -1,6 +1,6 @@
# generated automatically by aclocal 1.15 -*- Autoconf -*-
# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
# Copyright (C) 2002-2014 Free Software Foundation, Inc.
# Copyright (C) 2002-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
# generated from the m4 files accompanying Automake X.Y.
# (This private macro should not be called outside this file.)
AC_DEFUN([AM_AUTOMAKE_VERSION],
[am__api_version='1.15'
[am__api_version='1.14'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
m4_if([$1], [1.15], [],
m4_if([$1], [1.14.1], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
[AM_AUTOMAKE_VERSION([1.15])dnl
[AM_AUTOMAKE_VERSION([1.14.1])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -103,14 +103,15 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# configured tree to be moved without reconfiguration.
AC_DEFUN([AM_AUX_DIR_EXPAND],
[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
# Expand $ac_aux_dir to an absolute path.
am_aux_dir=`cd "$ac_aux_dir" && pwd`
[dnl Rely on autoconf to set up CDPATH properly.
AC_PREREQ([2.50])dnl
# expand $ac_aux_dir to an absolute path
am_aux_dir=`cd $ac_aux_dir && pwd`
])
# AM_CONDITIONAL -*- Autoconf -*-
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -141,7 +142,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -332,7 +333,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -408,7 +409,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -498,8 +499,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
# We need awk for the "check" target (and possibly the TAP driver). The
# system "awk" is bad on some platforms.
# We need awk for the "check" target. The system "awk" is bad on
# some platforms.
AC_REQUIRE([AC_PROG_AWK])dnl
AC_REQUIRE([AC_PROG_MAKE_SET])dnl
AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -572,11 +573,7 @@ to "yes", and re-run configure.
END
AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
fi
fi
dnl The trailing newline in this macro's definition is deliberate, for
dnl backward compatibility and to allow trailing 'dnl'-style comments
dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
])
fi])
dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not
dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@@ -605,7 +602,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -616,7 +613,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
# Define $install_sh.
AC_DEFUN([AM_PROG_INSTALL_SH],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
if test x"${install_sh+set}" != xset; then
if test x"${install_sh}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -626,7 +623,7 @@ if test x"${install_sh+set}" != xset; then
fi
AC_SUBST([install_sh])])
# Copyright (C) 2003-2014 Free Software Foundation, Inc.
# Copyright (C) 2003-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -647,7 +644,7 @@ AC_SUBST([am__leading_dot])])
# Check to see how 'make' treats includes. -*- Autoconf -*-
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -697,7 +694,7 @@ rm -f confinc confmf
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -736,7 +733,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -767,7 +764,7 @@ AC_DEFUN([_AM_IF_OPTION],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -848,7 +845,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
# Copyright (C) 2009-2014 Free Software Foundation, Inc.
# Copyright (C) 2009-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -908,7 +905,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -936,7 +933,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
# Copyright (C) 2006-2014 Free Software Foundation, Inc.
# Copyright (C) 2006-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -955,7 +952,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
# Copyright (C) 2004-2014 Free Software Foundation, Inc.
# Copyright (C) 2004-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1086,3 +1083,4 @@ AC_SUBST([am__tar])
AC_SUBST([am__untar])
]) # _AM_PROG_TAR
m4_include([m4/ax_cxx_compile_stdcxx_11.m4])

2
config.guess vendored
View File

@@ -1 +1 @@
/opt/local/share/automake-1.15/config.guess
/usr/share/automake-1.14/config.guess

2
config.sub vendored
View File

@@ -1 +1 @@
/opt/local/share/automake-1.15/config.sub
/usr/share/automake-1.14/config.sub

199
configure vendored
View File

@@ -633,6 +633,7 @@ BUILD_COMMS_MPI_TRUE
EGREP
GREP
CXXCPP
HAVE_CXX11
RANLIB
OPENMP_CXXFLAGS
am__fastdepCXX_FALSE
@@ -2466,7 +2467,7 @@ test -n "$target_alias" &&
NONENONEs,x,x, &&
program_prefix=${target_alias}-
am__api_version='1.15'
am__api_version='1.14'
# Find a good install program. We prefer a C program (faster),
# so one script is as good as another. But avoid the broken or
@@ -2638,8 +2639,8 @@ test "$program_suffix" != NONE &&
ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
# Expand $ac_aux_dir to an absolute path.
am_aux_dir=`cd "$ac_aux_dir" && pwd`
# expand $ac_aux_dir to an absolute path
am_aux_dir=`cd $ac_aux_dir && pwd`
if test x"${MISSING+set}" != xset; then
case $am_aux_dir in
@@ -2658,7 +2659,7 @@ else
$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
fi
if test x"${install_sh+set}" != xset; then
if test x"${install_sh}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -2986,8 +2987,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
mkdir_p='$(MKDIR_P)'
# We need awk for the "check" target (and possibly the TAP driver). The
# system "awk" is bad on some platforms.
# We need awk for the "check" target. The system "awk" is bad on
# some platforms.
# Always define AMTAR for backward compatibility. Yes, it's still used
# in the wild :-( We should find a proper way to deprecate it ...
AMTAR='$${TAR-tar}'
@@ -3046,7 +3047,6 @@ END
fi
ac_config_headers="$ac_config_headers lib/Grid_config.h"
# Check whether --enable-silent-rules was given.
@@ -3966,6 +3966,191 @@ else
RANLIB="$ac_cv_prog_RANLIB"
fi
ax_cxx_compile_cxx11_required=true
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
ac_success=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features by default" >&5
$as_echo_n "checking whether $CXX supports C++11 features by default... " >&6; }
if ${ax_cv_cxx_compile_cxx11+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
struct Base {
virtual void f() {}
};
struct Child : public Base {
virtual void f() override {}
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;
typedef check<int> check_type;
check_type c;
check_type&& cr = static_cast<check_type&&>(c);
auto d = a;
auto l = [](){};
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
struct use_l { use_l() { l(); } };
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
namespace test_template_alias_sfinae {
struct foo {};
template<typename T>
using member = typename T::member_type;
template<typename T>
void func(...) {}
template<typename T>
void func(member<T>*) {}
void test();
void test() {
func<foo>(0);
}
}
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
ax_cv_cxx_compile_cxx11=yes
else
ax_cv_cxx_compile_cxx11=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_cxx_compile_cxx11" >&5
$as_echo "$ax_cv_cxx_compile_cxx11" >&6; }
if test x$ax_cv_cxx_compile_cxx11 = xyes; then
ac_success=yes
fi
if test x$ac_success = xno; then
for switch in -std=c++11 -std=c++0x +std=c++11; do
cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
if eval \${$cachevar+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_save_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $switch"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
struct Base {
virtual void f() {}
};
struct Child : public Base {
virtual void f() override {}
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;
typedef check<int> check_type;
check_type c;
check_type&& cr = static_cast<check_type&&>(c);
auto d = a;
auto l = [](){};
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
struct use_l { use_l() { l(); } };
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
namespace test_template_alias_sfinae {
struct foo {};
template<typename T>
using member = typename T::member_type;
template<typename T>
void func(...) {}
template<typename T>
void func(member<T>*) {}
void test();
void test() {
func<foo>(0);
}
}
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
eval $cachevar=yes
else
eval $cachevar=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CXXFLAGS="$ac_save_CXXFLAGS"
fi
eval ac_res=\$$cachevar
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
$as_echo "$ac_res" >&6; }
if eval test x\$$cachevar = xyes; then
CXXFLAGS="$CXXFLAGS $switch"
ac_success=yes
break
fi
done
fi
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
if test x$ax_cxx_compile_cxx11_required = xtrue; then
if test x$ac_success = xno; then
as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5
fi
else
if test x$ac_success = xno; then
HAVE_CXX11=0
{ $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
$as_echo "$as_me: No compiler with C++11 support was found" >&6;}
else
HAVE_CXX11=1
$as_echo "#define HAVE_CXX11 1" >>confdefs.h
fi
fi
# Checks for libraries.
#AX_GCC_VAR_ATTRIBUTE(aligned)

View File

@@ -3,9 +3,9 @@
#
# Project Grid package
#
# Time-stamp: <2015-05-19 13:51:08 neo>
# Time-stamp: <2015-05-25 14:54:34 neo>
AC_PREREQ([2.69])
AC_PREREQ([2.63])
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
AC_CANONICAL_SYSTEM
AM_INIT_AUTOMAKE(subdir-objects)
@@ -26,6 +26,8 @@ AC_LANG(C++)
AC_PROG_CXX
AC_OPENMP
AC_PROG_RANLIB
AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
# Checks for libraries.
#AX_GCC_VAR_ATTRIBUTE(aligned)

View File

@@ -16,6 +16,9 @@
/* GRID_COMMS_NONE */
#define GRID_COMMS_NONE 1
/* define if the compiler supports basic C++11 syntax */
/* #undef HAVE_CXX11 */
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
don't. */
#define HAVE_DECL_BE64TOH 1

View File

@@ -15,6 +15,9 @@
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
/* define if the compiler supports basic C++11 syntax */
#undef HAVE_CXX11
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
don't. */
#undef HAVE_DECL_BE64TOH

View File

@@ -13,28 +13,6 @@
typedef uint32_t Integer;
#ifdef SSE4
#include <pmmintrin.h>
#endif
#if defined(AVX1) || defined (AVX2)
#include <immintrin.h>
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
#endif
#endif
#ifdef AVX512
#include <immintrin.h>
#ifndef KNC_ONLY_STORES
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
#endif
#endif
namespace Grid {
typedef float RealF;
@@ -66,45 +44,49 @@ namespace Grid {
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
// conjugate already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
//conjugate already supported for complex
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
// conjugate already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
//conjugate already supported for complex
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
inline void vstream(RealF &l, const RealF &r){ l=r;}
inline void vstream(RealD &l, const RealD &r){ l=r;}
class Zero{};
static Zero zero;
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
@@ -113,7 +95,6 @@ namespace Grid {
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
#if defined (SSE4)
typedef __m128 fvec;
typedef __m128d dvec;
@@ -245,56 +226,12 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
default: assert(0); break;
}
};
};
#include <simd/Grid_vInteger.h>
#include <simd/Grid_vRealF.h>
#include <simd/Grid_vRealD.h>
#include <simd/Grid_vComplexF.h>
#include <simd/Grid_vComplexD.h>
#include <simd/Grid_vector_types.h>
namespace Grid {
// NB: Template the following on "type Complex" and then implement *,+,- for
// ComplexF, ComplexD, RealF, RealD above to
// get full generality of binops with scalars.
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef vRealD vReal;

View File

@@ -93,12 +93,10 @@ nobase_include_HEADERS = algorithms/approx/bigfloat.h \
qcd/Grid_qcd_2spinor.h \
qcd/Grid_qcd_dirac.h \
qcd/Grid_qcd_wilson_dop.h \
simd/Grid_vComplexD.h \
simd/Grid_vComplexF.h \
simd/Grid_vInteger.h \
simd/Grid_vRealD.h \
simd/Grid_vRealF.h \
simd/Grid_vector_types.h \
simd/Grid_sse4.h
simd/Grid_sse4.h \
simd/Grid_avx.h \
simd/Grid_knc.h

View File

@@ -154,26 +154,35 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
/*
int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
*/
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
}
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {

View File

@@ -212,6 +212,16 @@ public:
iMatrix(const Zero &z){ *this = zero; };
iMatrix() =default;
iMatrix& operator=(const iMatrix& rhs){
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
vstream(_internal[i][j],rhs._internal[i][j]);
return *this;
};
iMatrix(scalar_type s) { (*this) = s ;};// recurse down and hit the constructor for vector_type
/*
@@ -220,6 +230,9 @@ public:
iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> &copyme) = default;
iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
*/
iMatrix<vtype,N> & operator= (const Zero &hero){
zeroit(*this);
return *this;

399
lib/simd/Grid_avx.h Normal file
View File

@@ -0,0 +1,399 @@
//----------------------------------------------------------------------
/*! @file Grid_avx.h
@brief Optimization libraries for AVX1/2 instructions set
Using intrinsics
*/
// Time-stamp: <2015-05-22 18:58:27 neo>
//----------------------------------------------------------------------
#include <immintrin.h>
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
#endif
namespace Optimization {
struct Vsplat{
//Complex float
inline __m256 operator()(float a, float b){
return _mm256_set_ps(b,a,b,a,b,a,b,a);
}
// Real float
inline __m256 operator()(float a){
return _mm256_set_ps(a,a,a,a,a,a,a,a);
}
//Complex double
inline __m256d operator()(double a, double b){
return _mm256_set_pd(b,a,b,a);
}
//Real double
inline __m256d operator()(double a){
return _mm256_set_pd(a,a,a,a);
}
//Integer
inline __m256i operator()(Integer a){
return _mm256_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m256 a, float* F){
_mm256_store_ps(F,a);
}
//Double
inline void operator()(__m256d a, double* D){
_mm256_store_pd(D,a);
}
//Integer
inline void operator()(__m256i a, Integer* I){
_mm256_store_si256((__m256i*)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m256 b){
_mm256_stream_ps(a,b);
}
//Double
inline void operator()(double * a, __m256d b){
_mm256_stream_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m256 operator()(Grid::ComplexF *a){
return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m256d operator()(Grid::ComplexD *a){
return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m256 operator()(float *a){
return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m256d operator()(double *a){
return _mm256_set_pd(a[3],a[2],a[1],a[0]);
}
// Integer
inline __m256i operator()(Integer *a){
return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_add_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_add_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_add_epi32(a0,b0);
a1 = _mm_add_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_add_epi32(a,b);
#endif
}
};
struct Sub{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_sub_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_sub_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_sub_epi32(a0,b0);
a1 = _mm_sub_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_sub_epi32(a,b);
#endif
}
};
struct MultComplex{
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
}
// Complex double
inline __m256d operator()(__m256d a, __m256d b){
//Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
//From intel optimisation reference guide
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
VSHUFPD (VEX.256 encoded version)
IF IMM0[0] = 0
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
IF IMM0[1] = 0
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
IF IMM0[2] = 0
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged
*/
__m256d ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01
ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_pd(ymm0,ymm1);
}
};
struct Mult{
// Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_mul_ps(a,b);
}
// Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_mul_pd(a,b);
}
// Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_mul_epi32(a0,b0);
a1 = _mm_mul_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_mul_epi32(a,b);
#endif
}
};
struct Conj{
// Complex single
inline __m256 operator()(__m256 in){
return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
}
// Complex double
inline __m256d operator()(__m256d in){
return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested
/*
// original
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5));
return _mm256_shuffle_pd(tmp,tmp,0x5);
*/
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
return _mm256_shuffle_pd(tmp,tmp,0x5);
}
};
struct TimesI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_shuffle_pd(in,in,0x5);
return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
}
};
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype a, vtype b, int perm) {
union {
__m256 f;
vtype v;
} conv;
conv.v = b;
switch (perm){
// 8x32 bits=>3 permutes
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
default: assert(0); break;
}
a = conv.v;
}
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // sse 128; paired complex single
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1); // avx 256; quad complex single
v1 = _mm256_add_ps(v1,v2);
return Grid::ComplexF(v1[0],v1[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; octo-double
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v1 = _mm256_add_ps(v1,v2);
Optimization::permute(v2,v1,2);
v1 = _mm256_add_ps(v1,v2);
return v1[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
__m256d v1;
Optimization::permute(v1,in,0); // sse 128; paired complex single
v1 = _mm256_add_pd(v1,in);
return Grid::ComplexD(v1[0],v1[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
__m256d v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad double
v1 = _mm256_add_pd(v1,in);
Optimization::permute(v2,v1,1);
v1 = _mm256_add_pd(v1,v2);
return v1[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m256 SIMD_Ftype; // Single precision type
typedef __m256d SIMD_Dtype; // Double precision type
typedef __m256i SIMD_Itype; // Integer type
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

323
lib/simd/Grid_knc.h Normal file
View File

@@ -0,0 +1,323 @@
//----------------------------------------------------------------------
/*! @file Grid_knc.h
@brief Optimization libraries for AVX512 instructions set for KNC
Using intrinsics
*/
// Time-stamp: <2015-05-22 17:12:44 neo>
//----------------------------------------------------------------------
#include <immintrin.h>
#ifndef KNC_ONLY_STORES
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
#endif
namespace Optimization {
struct Vsplat{
//Complex float
inline __m512 operator()(float a, float b){
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
}
// Real float
inline __m512 operator()(float a){
return _mm512_set1_ps(a);
}
//Complex double
inline __m512d operator()(double a, double b){
return _mm512_set_pd(b,a,b,a,b,a,b,a);
}
//Real double
inline __m512d operator()(double a){
return _mm512_set1_pd(a);
}
//Integer
inline __m512i operator()(Integer a){
return _mm512_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m512 a, float* F){
_mm512_store_ps(F,a);
}
//Double
inline void operator()(__m512d a, double* D){
_mm512_store_pd(D,a);
}
//Integer
inline void operator()(__m512i a, Integer* I){
_mm512_store_si512((__m512i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_storenrngo_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_storenrngo_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m512 operator()(Grid::ComplexF *a){
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m512d operator()(Grid::ComplexD *a){
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m512 operator()(float *a){
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m512d operator()(double *a){
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Integer
inline __m512i operator()(Integer *a){
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_add_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_add_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_sub_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_sub_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_sub_epi32(a,b);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_ps(ymm0,imag,ymm1);
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_pd(ymm0,imag,ymm1);
}
};
struct Mult{
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_mul_pd(a,b);
}
// Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_mullo_epi32(a,b);
}
};
struct Conj{
// Complex single
inline __m512 operator()(__m512 in){
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
}
// Complex double
inline __m512d operator()(__m512d in){
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
}
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
return _mm512_reduce_add_ps(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
return _mm512_reduce_add_pd(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

272
lib/simd/Grid_qpx.h Normal file
View File

@@ -0,0 +1,272 @@
//----------------------------------------------------------------------
/*! @file Grid_qpx.h
@brief Optimization libraries for QPX instructions set for BG/Q
Using intrinsics
*/
// Time-stamp: <2015-05-22 17:29:26 neo>
//----------------------------------------------------------------------
// lot of undefined functions
namespace Optimization {
struct Vsplat{
//Complex float
inline float operator()(float a, float b){
return {a,b,a,b};
}
// Real float
inline float operator()(float a){
return {a,a,a,a};
}
//Complex double
inline vector4double operator()(double a, double b){
return {a,b,a,b};
}
//Real double
inline vector4double operator()(double a){
return {a,a,a,a};
}
//Integer
inline int operator()(Integer a){
#error
}
};
struct Vstore{
//Float
inline void operator()(float a, float* F){
assert(0);
}
//Double
inline void operator()(vector4double a, double* D){
assert(0);
}
//Integer
inline void operator()(int a, Integer* I){
assert(0);
}
};
struct Vstream{
//Float
inline void operator()(float * a, float b){
assert(0);
}
//Double
inline void operator()(double * a, vector4double b){
assert(0);
}
};
struct Vset{
// Complex float
inline float operator()(Grid::ComplexF *a){
return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
}
// Complex double
inline vector4double operator()(Grid::ComplexD *a){
return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
}
// Real float
inline float operator()(float *a){
return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
}
// Real double
inline vector4double operator()(double *a){
return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
}
// Integer
inline int operator()(Integer *a){
#error
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline float operator()(float a, float b){
#error
}
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_add(a,b);
}
//Integer
inline int operator()(int a, int b){
#error
}
};
struct Sub{
//Complex/Real float
inline float operator()(float a, float b){
#error
}
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
#error
}
//Integer
inline floati operator()(int a, int b){
#error
}
};
struct MultComplex{
// Complex float
inline float operator()(float a, float b){
#error
}
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
#error
}
};
struct Mult{
// Real float
inline float operator()(float a, float b){
#error
}
// Real double
inline vector4double operator()(vector4double a, vector4double b){
#error
}
// Integer
inline int operator()(int a, int b){
#error
}
};
struct Conj{
// Complex single
inline float operator()(float in){
assert(0);
}
// Complex double
inline vector4double operator()(vector4double in){
assert(0);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float operator()(float in, float ret){
assert(0);
}
//Complex double
inline vector4double operator()(vector4double in, vector4double ret){
assert(0);
}
};
struct TimesI{
//Complex single
inline float operator()(float in, float ret){
}
//Complex double
inline vector4double operator()(vector4double in, vector4double ret){
}
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
assert(0);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
assert(0);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
assert(0);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
assert(0);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, floati>::operator()(float in){
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef float SIMD_Ftype __attribute__ ((vector_size (16))); // Single precision type
typedef vector4double SIMD_Dtype; // Double precision type
typedef int SIMD_Itype; // Integer type
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

View File

@@ -4,7 +4,7 @@
Using intrinsics
*/
// Time-stamp: <2015-05-20 16:45:39 neo>
// Time-stamp: <2015-05-21 18:06:30 neo>
//----------------------------------------------------------------------
#include <pmmintrin.h>
@@ -53,12 +53,12 @@ namespace Optimization {
struct Vstream{
//Float
inline void operator()(__m128 a, __m128 b){
_mm_stream_ps((float *)&a,b);
inline void operator()(float * a, __m128 b){
_mm_stream_ps(a,b);
}
//Double
inline void operator()(__m128d a, __m128d b){
_mm_stream_pd((double *)&a,b);
inline void operator()(double * a, __m128d b){
_mm_stream_pd(a,b);
}

View File

@@ -2,13 +2,23 @@
/*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-05-20 17:31:55 neo>
// Time-stamp: <2015-05-26 13:44:54 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES
#ifdef SSE4
#include "Grid_sse4.h"
#endif
#if defined (AVX1)|| defined (AVX2)
#include "Grid_avx.h"
#endif
#if defined AVX512
#include "Grid_knc.h"
#endif
#if defined QPX
#include "Grid_qpx.h"
#endif
namespace Grid {
@@ -25,8 +35,6 @@ namespace Grid {
template <typename Condition, typename ReturnType> using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType>>;
template <typename Condition, typename ReturnType> using NotEnableIf= Invoke<std::enable_if<!Condition::value, ReturnType>>;
////////////////////////////////////////////////////////
// Check for complexity with type traits
template <typename T> struct is_complex : std::false_type {};
@@ -36,18 +44,71 @@ namespace Grid {
// general forms to allow for vsplat syntax
// need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes
// use decltype?
template < class Out, class Input1, class Input2, class Operation >
Out binary(Input1 src_1, Input2 src_2, Operation op){
return op(src_1, src_2);
}
template < class SIMDout, class Input, class Operation >
SIMDout unary(Input src, Operation op){
template < class Out, class Input, class Operation >
Out unary(Input src, Operation op){
return op(src);
}
///////////////////////////////////////////////
//////////////////////////////////////////////////////////
// Permute
// Permute 0 every ABCDEFGH -> BA DC FE HG
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> EFGH ABCD
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
// Permute 4 possible on half precision @512bit vectors.
//////////////////////////////////////////////////////////
template<class vsimd>
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
union {
SIMD_Ftype f;
decltype(vsimd::v) v;
} conv;
conv.v = b.v;
switch (perm){
#if defined(AVX1)||defined(AVX2)
// 8x32 bits=>3 permutes
case 2:
conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1));
break;
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
#endif
#ifdef SSE4
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break;
#endif
#ifdef AVX512
// 16 floats=> permutes
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
#endif
#ifdef QPX
#error not implemented
#endif
default: assert(0); break;
}
y.v=conv.v;
};
///////////////////////////////////////
/*
@brief Grid_simd class for the SIMD vector type operations
*/
@@ -56,27 +117,34 @@ namespace Grid {
public:
typedef typename RealPart < Scalar_type >::type Real;
typedef Vector_type vector_type;
typedef Scalar_type scalar_type;
Vector_type v;
static inline int Nsimd(void) { return sizeof(Vector_type)/sizeof(Scalar_type);}
// Constructors
Grid_simd & operator = ( Zero & z){
vzero(*this);
return (*this);
}
Grid_simd(){};
Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;};
Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler
Grid_simd()=default;
Grid_simd(const Grid_simd& rhs):v(rhs.v){}; //compiles in movaps
Grid_simd(const Grid_simd&& rhs):v(rhs.v){};
//Enable if complex type
template < class S = Scalar_type >
Grid_simd(typename std::enable_if< is_complex < S >::value, S>::type a){
Grid_simd(const typename std::enable_if< is_complex < S >::value, S>::type a){
vsplat(*this,a);
};
Grid_simd(Real a){
Grid_simd(const Real a){
vsplat(*this,Scalar_type(a));
};
@@ -88,18 +156,25 @@ namespace Grid {
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
//not for integer types... FIXME
friend inline void mac (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
friend inline void mac (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ a,const Scalar_type *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) + (*r); }
//not for integer types...
template < class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 >
friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// if not complex overload here
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
// For complex types
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); }
@@ -107,6 +182,14 @@ namespace Grid {
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0,0.0); }// use xor?
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
// if not complex overload here
template < class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
template < class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
// For integral types
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
@@ -116,7 +199,7 @@ namespace Grid {
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
@@ -192,8 +275,9 @@ namespace Grid {
///////////////////////
// Vstream
///////////////////////
template < class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 >
friend inline void vstream(Grid_simd &out,const Grid_simd &in){
binary<void>(out.v, in.v, VstreamSIMD());
binary<void>((Real*)&out.v, in.v, VstreamSIMD());
}
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
@@ -291,7 +375,7 @@ namespace Grid {
// Unary negation
///////////////////////
friend inline Grid_simd operator -(const Grid_simd &r) {
vComplexF ret;
Grid_simd ret;
vzero(ret);
ret = ret - r;
return ret;
@@ -336,7 +420,7 @@ namespace Grid {
}
template<class scalar_type, class vector_type >
inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
template<class scalar_type, class vector_type >
@@ -354,33 +438,15 @@ namespace Grid {
// Define available types (now change names to avoid clashing with the rest of the code)
typedef Grid_simd< float , SIMD_Ftype > MyRealF;
typedef Grid_simd< double , SIMD_Dtype > MyRealD;
typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;
typedef Grid_simd< float , SIMD_Ftype > vRealF;
typedef Grid_simd< double , SIMD_Dtype > vRealD;
typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
typedef Grid_simd< Integer , SIMD_Itype > vInteger;
////////////////////////////////////////////////////////////////////
// Temporary hack to keep independent from the rest of the code
template<> struct isGridTensor<MyRealD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyRealF > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyComplexD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyComplexF > {
static const bool value = false;
static const bool notvalue = true;
};

View File

@@ -54,7 +54,7 @@ namespace Grid {
//////////////////////////////////
friend inline void vone(vComplexF &ret) { vsplat(ret,1.0,0.0); }
friend inline void vzero(vComplexF &ret) { vsplat(ret,0.0,0.0); }
friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0);}
friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0); }
////////////////////////////////////
// Arithmetic operator overloads +,-,*

View File

@@ -0,0 +1,167 @@
# ============================================================================
# http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
# ============================================================================
#
# SYNOPSIS
#
# AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional])
#
# DESCRIPTION
#
# Check for baseline language coverage in the compiler for the C++11
# standard; if necessary, add switches to CXXFLAGS to enable support.
#
# The first argument, if specified, indicates whether you insist on an
# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
# -std=c++11). If neither is specified, you get whatever works, with
# preference for an extended mode.
#
# The second argument, if specified 'mandatory' or if left unspecified,
# indicates that baseline C++11 support is required and that the macro
# should error out if no mode with that support is found. If specified
# 'optional', then configuration proceeds regardless, after defining
# HAVE_CXX11 if and only if a supporting mode is found.
#
# LICENSE
#
# Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
# Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
# Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov@google.com>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 11
m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[
template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
struct Base {
virtual void f() {}
};
struct Child : public Base {
virtual void f() override {}
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;
typedef check<int> check_type;
check_type c;
check_type&& cr = static_cast<check_type&&>(c);
auto d = a;
auto l = [](){};
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
struct use_l { use_l() { l(); } };
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
namespace test_template_alias_sfinae {
struct foo {};
template<typename T>
using member = typename T::member_type;
template<typename T>
void func(...) {}
template<typename T>
void func(member<T>*) {}
void test();
void test() {
func<foo>(0);
}
}
]])
AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
m4_if([$1], [], [],
[$1], [ext], [],
[$1], [noext], [],
[m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
[$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
[$2], [optional], [ax_cxx_compile_cxx11_required=false],
[m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])
AC_LANG_PUSH([C++])dnl
ac_success=no
AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
ax_cv_cxx_compile_cxx11,
[AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
[ax_cv_cxx_compile_cxx11=yes],
[ax_cv_cxx_compile_cxx11=no])])
if test x$ax_cv_cxx_compile_cxx11 = xyes; then
ac_success=yes
fi
m4_if([$1], [noext], [], [dnl
if test x$ac_success = xno; then
for switch in -std=gnu++11 -std=gnu++0x; do
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
$cachevar,
[ac_save_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $switch"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
[eval $cachevar=yes],
[eval $cachevar=no])
CXXFLAGS="$ac_save_CXXFLAGS"])
if eval test x\$$cachevar = xyes; then
CXXFLAGS="$CXXFLAGS $switch"
ac_success=yes
break
fi
done
fi])
m4_if([$1], [ext], [], [dnl
if test x$ac_success = xno; then
dnl HP's aCC needs +std=c++11 according to:
dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
for switch in -std=c++11 -std=c++0x +std=c++11; do
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
$cachevar,
[ac_save_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $switch"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
[eval $cachevar=yes],
[eval $cachevar=no])
CXXFLAGS="$ac_save_CXXFLAGS"])
if eval test x\$$cachevar = xyes; then
CXXFLAGS="$CXXFLAGS $switch"
ac_success=yes
break
fi
done
fi])
AC_LANG_POP([C++])
if test x$ax_cxx_compile_cxx11_required = xtrue; then
if test x$ac_success = xno; then
AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.])
fi
else
if test x$ac_success = xno; then
HAVE_CXX11=0
AC_MSG_NOTICE([No compiler with C++11 support was found])
else
HAVE_CXX11=1
AC_DEFINE(HAVE_CXX11,1,
[define if the compiler supports basic C++11 syntax])
fi
AC_SUBST(HAVE_CXX11)
fi
])

View File

@@ -104,6 +104,9 @@ int main (int argc, char ** argv)
random(FineRNG,scVec);
fflush(stdout);
cVec = cMat * cVec; // LatticeColourVector = LatticeColourMatrix * LatticeColourVector
sVec = sMat * sVec; // LatticeSpinVector = LatticeSpinMatrix * LatticeSpinVector
scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector
@@ -113,12 +116,14 @@ int main (int argc, char ** argv)
cMat = outerProduct(cVec,cVec);
scalar = localInnerProduct(cVec,cVec);
scalar += scalar;
scalar -= scalar;
scalar *= scalar;
add(scalar,scalar,scalar);
sub(scalar,scalar,scalar);
mult(scalar,scalar,scalar);
mac(scalar,scalar,scalar);
scalar = scalar+scalar;
scalar = scalar-scalar;
@@ -142,7 +147,7 @@ int main (int argc, char ** argv)
scalar=trace(scalar);
scalar=localInnerProduct(cVec,cVec);
scalar=localNorm2(cVec);
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conjugate
@@ -154,51 +159,11 @@ int main (int argc, char ** argv)
// localNorm2
// localInnerProduct
scMat = sMat*scMat; // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix
#ifdef SSE4
///////// Tests the new class Grid_simd
std::complex<double> ctest(3.0,2.0);
std::complex<float> ctestf(3.0,2.0);
MyComplexF TestMe1(1.0); // fills only real part
MyComplexD TestMe2(ctest);
MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
//MyRealF TestMe5(ctest);// Must generate compiler error
MyRealD TestRe1(2.0);
MyRealF TestRe2(3.0);
vone(TestRe2);
MyComplexF TestMe6(ctestf);
MyComplexF TestMe7(ctestf);
MyComplexD TheSum= TestMe2*TestMe3;
MyComplexF TheSumF= TestMe6*TestMe7;
double dsum[2];
_mm_store_pd(dsum, TheSum.v);
for (int i =0; i< 2; i++)
printf("%f\n", dsum[i]);
MyComplexD TheSumI = timesMinusI(TheSum);
MyComplexF TheSumIF = timesMinusI(TheSumF);
float fsum[4];
_mm_store_ps(fsum, TheSumF.v);
for (int i =0; i< 4; i++)
printf("%f\n", fsum[i]);
vstore(TheSumI, &ctest);
std::complex<float> sum = Reduce(TheSumF);
std::cout << ctest<< std::endl;
std::cout << sum<< std::endl;
#endif
///////////////////////
// Non-lattice (const objects) * Lattice
ColourMatrix cm;
SpinColourMatrix scm;
@@ -230,7 +195,7 @@ int main (int argc, char ** argv)
scMat = scMat*mydouble;
scMat = mydouble*scMat;
cMat = mydouble*cMat;
sMat = adj(sMat); // LatticeSpinMatrix adjoint
sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix
sMat = GammaFive*sMat; // SpinMatrix * LatticeSpinMatrix
@@ -241,6 +206,9 @@ int main (int argc, char ** argv)
scm=transpose(scm);
scm=transposeIndex<1>(scm);
// Foo = Foo+scalar; // LatticeColourMatrix+Scalar
// Foo = Foo*scalar; // LatticeColourMatrix*Scalar
// Foo = Foo-scalar; // LatticeColourMatrix-Scalar
@@ -280,7 +248,6 @@ int main (int argc, char ** argv)
pokeIndex<1> (c_m,c,0,0);
}
FooBar = Bar;
/*
@@ -333,14 +300,14 @@ int main (int argc, char ** argv)
// Lattice SU(3) x SU(3)
Fine.Barrier();
FooBar = Foo * Bar;
// Lattice 12x12 GEMM
scFooBar = scFoo * scBar;
// Benchmark some simple operations LatticeSU3 * Lattice SU3.
double t0,t1,flops;
double bytes;
int ncall=100;
int ncall=5000;
int Nc = Grid::QCD::Nc;
LatticeGaugeField U(&Fine);
@@ -352,19 +319,21 @@ int main (int argc, char ** argv)
if ( Fine.IsBoss() ) {
printf("%f flop and %f bytes\n",flops,bytes/ncall);
}
FooBar = Foo * Bar;
FooBar = Foo * Bar;
Fine.Barrier();
t0=usecond();
for(int i=0;i<ncall;i++){
Fine.Barrier();
mult(FooBar,Foo,Bar); // this is better
}
t1=usecond();
Fine.Barrier();
if ( Fine.IsBoss() ) {
#ifdef OMP
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
#endif
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
}
@@ -526,5 +495,9 @@ int main (int argc, char ** argv)
} // loop for lat
} // loop for omp
std::cout << sizeof(vComplexF) << std::endl;
Grid_finalize();
}

0
tests/InvSqrt.gnu Normal file
View File

View File

@@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
#
# Test code
#
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed Grid_simd_new
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed
Grid_main_SOURCES = Grid_main.cc
Grid_main_LDADD = -lGrid
@@ -34,5 +34,5 @@ Grid_stencil_LDADD = -lGrid
Grid_simd_SOURCES = Grid_simd.cc
Grid_simd_LDADD = -lGrid
Grid_simd_new_SOURCES = Grid_simd_new.cc
Grid_simd_new_LDADD = -lGrid
#Grid_simd_new_SOURCES = Grid_simd_new.cc
#Grid_simd_new_LDADD = -lGrid

2
tests/Sqrt.gnu Normal file
View File

@@ -0,0 +1,2 @@
f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));
f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));