Merge branch 'master' of https://github.com/paboyle/Grid

Conflicts: lib/Grid_simd.h
2025-06-16 06:47:06 +01:00 · 2015-05-26 20:04:08 +01:00
parent bfb1cd36e2 500f6ed0c5
commit 48bb3ab4e7
28 changed files with 1634 additions and 294 deletions
--- a/Makefile.in
+++ b/Makefile.in
@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.15 from Makefile.am.
+# Makefile.in generated by automake 1.14.1 from Makefile.am.
 # @configure_input@
-# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -14,17 +14,7 @@
@SET_MAKE@
 VPATH = @srcdir@
-am__is_gnu_make = { \
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
  if test -z '$(MAKELEVEL)'; then \
    false; \
  elif test -n '$(MAKE_HOST)'; then \
    true; \
  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
    true; \
  else \
    false; \
  fi; \
 }
 am__make_running_with_option = \
  case $${target_option-} in \
      ?) ;; \
@ -89,12 +79,15 @@ build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 subdir = .
 DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
 	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
 	compile config.guess config.sub depcomp install-sh missing
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
 	$(am__configure_deps) $(am__DIST_COMMON)
 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
 configure.lineno config.status.lineno
 mkinstalldirs = $(install_sh) -d
@ -157,9 +150,6 @@ ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
 DIST_SUBDIRS = $(SUBDIRS)
 am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
 	INSTALL NEWS README TODO compile config.guess config.sub \
 	depcomp install-sh missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@ -223,6 +213,7 @@ ECHO_T = @ECHO_T@
 EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 GREP = @GREP@
 HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@ -325,6 +316,7 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
 	$(am__cd) $(top_srcdir) && \
 	  $(AUTOMAKE) --gnu Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \
@ -531,15 +523,15 @@ dist-xz: distdir
 	$(am__post_remove_distdir)
 dist-tarZ: distdir
-	@echo WARNING: "Support for distribution archives compressed with" \
+	@echo WARNING: "Support for shar distribution archives is" \
-		       "legacy program 'compress' is deprecated." >&2
+	               "deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
 	$(am__post_remove_distdir)
 dist-shar: distdir
-	@echo WARNING: "Support for shar distribution archives is" \
+	@echo WARNING: "Support for distribution archives compressed with" \
-	               "deprecated." >&2
+		       "legacy program 'compress' is deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
 	$(am__post_remove_distdir)
@ -575,17 +567,17 @@ distcheck: dist
 	esac
 	chmod -R a-w $(distdir)
 	chmod u+w $(distdir)
-	mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
+	mkdir $(distdir)/_build $(distdir)/_inst
 	chmod a-w $(distdir)
 	test -d $(distdir)/_build || exit 0; \
 	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
 	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
 	  && am__cwd=`pwd` \
-	  && $(am__cd) $(distdir)/_build/sub \
+	  && $(am__cd) $(distdir)/_build \
-	  && ../../configure \
+	  && ../configure \
 	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
 	    $(DISTCHECK_CONFIGURE_FLAGS) \
-	    --srcdir=../.. --prefix="$$dc_install_base" \
+	    --srcdir=.. --prefix="$$dc_install_base" \
 	  && $(MAKE) $(AM_MAKEFLAGS) \
 	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
 	  && $(MAKE) $(AM_MAKEFLAGS) check \
@ -759,8 +751,6 @@ uninstall-am:
 	maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
 	pdf-am ps ps-am tags tags-am uninstall uninstall-am
 .PRECIOUS: Makefile
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
--- a/4
+++ b/4
@ -1,8 +1,8 @@
 ================================================================
 *** Hacks and bug fixes to clean up and Audits
 ================================================================
-* Base class to share common code between vRealF, VComplexF etc... 
+* Base class to share common code between vRealF, VComplexF etc... done
-  - Performance check on Guido's reimplementation strategy
+  - Performance check on Guido's reimplementation strategy  - (GUIDO) tested and no difference was found, merged
 * FIXME audit
--- a/aclocal.m4
+++ b/aclocal.m4
@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.15 -*- Autoconf -*-
+# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
-# Copyright (C) 1996-2014 Free Software Foundation, Inc.
+# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])
-# Copyright (C) 2002-2014 Free Software Foundation, Inc.
+# Copyright (C) 2002-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
 # generated from the m4 files accompanying Automake X.Y.
 # (This private macro should not be called outside this file.)
 AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.15'
+[am__api_version='1.14'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.15], [],
+m4_if([$1], [1.14.1], [],
      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])
@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.15])dnl
+[AM_AUTOMAKE_VERSION([1.14.1])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
-# Copyright (C) 2001-2014 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -103,14 +103,15 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 # configured tree to be moved without reconfiguration.
 AC_DEFUN([AM_AUX_DIR_EXPAND],
-[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+[dnl Rely on autoconf to set up CDPATH properly.
-# Expand $ac_aux_dir to an absolute path.
+AC_PREREQ([2.50])dnl
-am_aux_dir=`cd "$ac_aux_dir" && pwd`
+# expand $ac_aux_dir to an absolute path
 am_aux_dir=`cd $ac_aux_dir && pwd`
 ])
 # AM_CONDITIONAL                                            -*- Autoconf -*-
-# Copyright (C) 1997-2014 Free Software Foundation, Inc.
+# Copyright (C) 1997-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -141,7 +142,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])
-# Copyright (C) 1999-2014 Free Software Foundation, Inc.
+# Copyright (C) 1999-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -332,7 +333,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
 # Generate code to set up dependency tracking.              -*- Autoconf -*-
-# Copyright (C) 1999-2014 Free Software Foundation, Inc.
+# Copyright (C) 1999-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -408,7 +409,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 # Do all the work for Automake.                             -*- Autoconf -*-
-# Copyright (C) 1996-2014 Free Software Foundation, Inc.
+# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -498,8 +499,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
-# We need awk for the "check" target (and possibly the TAP driver).  The
+# We need awk for the "check" target.  The system "awk" is bad on
-# system "awk" is bad on some platforms.
+# some platforms.
 AC_REQUIRE([AC_PROG_AWK])dnl
 AC_REQUIRE([AC_PROG_MAKE_SET])dnl
 AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@ -572,11 +573,7 @@ to "yes", and re-run configure.
 END
    AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
  fi
-fi
+fi])
 dnl The trailing newline in this macro's definition is deliberate, for
 dnl backward compatibility and to allow trailing 'dnl'-style comments
 dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
 ])
 dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@ -605,7 +602,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
-# Copyright (C) 2001-2014 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -616,7 +613,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
 # Define $install_sh.
 AC_DEFUN([AM_PROG_INSTALL_SH],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-if test x"${install_sh+set}" != xset; then
+if test x"${install_sh}" != xset; then
  case $am_aux_dir in
  *\ * | *\	*)
    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@ -626,7 +623,7 @@ if test x"${install_sh+set}" != xset; then
 fi
 AC_SUBST([install_sh])])
-# Copyright (C) 2003-2014 Free Software Foundation, Inc.
+# Copyright (C) 2003-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -647,7 +644,7 @@ AC_SUBST([am__leading_dot])])
 # Check to see how 'make' treats includes.	            -*- Autoconf -*-
-# Copyright (C) 2001-2014 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -697,7 +694,7 @@ rm -f confinc confmf
 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
-# Copyright (C) 1997-2014 Free Software Foundation, Inc.
+# Copyright (C) 1997-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -736,7 +733,7 @@ fi
 # Helper functions for option handling.                     -*- Autoconf -*-
-# Copyright (C) 2001-2014 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -767,7 +764,7 @@ AC_DEFUN([_AM_IF_OPTION],
 # Check to make sure that the build environment is sane.    -*- Autoconf -*-
-# Copyright (C) 1996-2014 Free Software Foundation, Inc.
+# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -848,7 +845,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])
-# Copyright (C) 2009-2014 Free Software Foundation, Inc.
+# Copyright (C) 2009-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -908,7 +905,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])
-# Copyright (C) 2001-2014 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -936,7 +933,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])
-# Copyright (C) 2006-2014 Free Software Foundation, Inc.
+# Copyright (C) 2006-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -955,7 +952,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 # Check how to create a tarball.                            -*- Autoconf -*-
-# Copyright (C) 2004-2014 Free Software Foundation, Inc.
+# Copyright (C) 2004-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -1086,3 +1083,4 @@ AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 m4_include([m4/ax_cxx_compile_stdcxx_11.m4])
--- a/config.guess
+++ b/config.guess
@ -1 +1 @@
-/opt/local/share/automake-1.15/config.guess
+/usr/share/automake-1.14/config.guess
--- a/config.sub
+++ b/config.sub
@ -1 +1 @@
-/opt/local/share/automake-1.15/config.sub
+/usr/share/automake-1.14/config.sub
--- a/199
+++ b/199
@ -633,6 +633,7 @@ BUILD_COMMS_MPI_TRUE
 EGREP
 GREP
 CXXCPP
 HAVE_CXX11
 RANLIB
 OPENMP_CXXFLAGS
 am__fastdepCXX_FALSE
@ -2466,7 +2467,7 @@ test -n "$target_alias" &&
    NONENONEs,x,x, &&
  program_prefix=${target_alias}-
-am__api_version='1.15'
+am__api_version='1.14'
 # Find a good install program.  We prefer a C program (faster),
 # so one script is as good as another.  But avoid the broken or
@ -2638,8 +2639,8 @@ test "$program_suffix" != NONE &&
 ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
 program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
-# Expand $ac_aux_dir to an absolute path.
+# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd "$ac_aux_dir" && pwd`
+am_aux_dir=`cd $ac_aux_dir && pwd`
 if test x"${MISSING+set}" != xset; then
  case $am_aux_dir in
@ -2658,7 +2659,7 @@ else
 $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
 fi
-if test x"${install_sh+set}" != xset; then
+if test x"${install_sh}" != xset; then
  case $am_aux_dir in
  *\ * | *\	*)
    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@ -2986,8 +2987,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 mkdir_p='$(MKDIR_P)'
-# We need awk for the "check" target (and possibly the TAP driver).  The
+# We need awk for the "check" target.  The system "awk" is bad on
-# system "awk" is bad on some platforms.
+# some platforms.
 # Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AMTAR='$${TAR-tar}'
@ -3046,7 +3047,6 @@ END
 fi
 ac_config_headers="$ac_config_headers lib/Grid_config.h"
 # Check whether --enable-silent-rules was given.
@ -3966,6 +3966,191 @@ else
  RANLIB="$ac_cv_prog_RANLIB"
 fi
    ax_cxx_compile_cxx11_required=true
  ac_ext=cpp
 ac_cpp='$CXXCPP $CPPFLAGS'
 ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
  ac_success=no
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features by default" >&5
 $as_echo_n "checking whether $CXX supports C++11 features by default... " >&6; }
 if ${ax_cv_cxx_compile_cxx11+:} false; then :
  $as_echo_n "(cached) " >&6
 else
  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
  template <typename T>
    struct check
    {
      static_assert(sizeof(int) <= sizeof(T), "not big enough");
    };
    struct Base {
    virtual void f() {}
    };
    struct Child : public Base {
    virtual void f() override {}
    };
    typedef check<check<bool>> right_angle_brackets;
    int a;
    decltype(a) b;
    typedef check<int> check_type;
    check_type c;
    check_type&& cr = static_cast<check_type&&>(c);
    auto d = a;
    auto l = [](){};
    // Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
    struct use_l { use_l() { l(); } };
    // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
    // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
    namespace test_template_alias_sfinae {
        struct foo {};
        template<typename T>
        using member = typename T::member_type;
        template<typename T>
        void func(...) {}
        template<typename T>
        void func(member<T>*) {}
        void test();
        void test() {
            func<foo>(0);
        }
    }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
  ax_cv_cxx_compile_cxx11=yes
 else
  ax_cv_cxx_compile_cxx11=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_cxx_compile_cxx11" >&5
 $as_echo "$ax_cv_cxx_compile_cxx11" >&6; }
  if test x$ax_cv_cxx_compile_cxx11 = xyes; then
    ac_success=yes
  fi
    if test x$ac_success = xno; then
            for switch in -std=c++11 -std=c++0x +std=c++11; do
      cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
 $as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
 if eval \${$cachevar+:} false; then :
  $as_echo_n "(cached) " >&6
 else
  ac_save_CXXFLAGS="$CXXFLAGS"
         CXXFLAGS="$CXXFLAGS $switch"
         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
  template <typename T>
    struct check
    {
      static_assert(sizeof(int) <= sizeof(T), "not big enough");
    };
    struct Base {
    virtual void f() {}
    };
    struct Child : public Base {
    virtual void f() override {}
    };
    typedef check<check<bool>> right_angle_brackets;
    int a;
    decltype(a) b;
    typedef check<int> check_type;
    check_type c;
    check_type&& cr = static_cast<check_type&&>(c);
    auto d = a;
    auto l = [](){};
    // Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
    struct use_l { use_l() { l(); } };
    // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
    // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
    namespace test_template_alias_sfinae {
        struct foo {};
        template<typename T>
        using member = typename T::member_type;
        template<typename T>
        void func(...) {}
        template<typename T>
        void func(member<T>*) {}
        void test();
        void test() {
            func<foo>(0);
        }
    }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
  eval $cachevar=yes
 else
  eval $cachevar=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
         CXXFLAGS="$ac_save_CXXFLAGS"
 fi
 eval ac_res=\$$cachevar
 	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
 $as_echo "$ac_res" >&6; }
      if eval test x\$$cachevar = xyes; then
        CXXFLAGS="$CXXFLAGS $switch"
        ac_success=yes
        break
      fi
    done
  fi
  ac_ext=cpp
 ac_cpp='$CXXCPP $CPPFLAGS'
 ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
  if test x$ax_cxx_compile_cxx11_required = xtrue; then
    if test x$ac_success = xno; then
      as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5
    fi
  else
    if test x$ac_success = xno; then
      HAVE_CXX11=0
      { $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
 $as_echo "$as_me: No compiler with C++11 support was found" >&6;}
    else
      HAVE_CXX11=1
 $as_echo "#define HAVE_CXX11 1" >>confdefs.h
    fi
  fi
 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)
--- a/configure.ac
+++ b/configure.ac
@ -3,9 +3,9 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-05-19 13:51:08 neo>
+# Time-stamp: <2015-05-25 14:54:34 neo>
-AC_PREREQ([2.69])
+AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
 AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE(subdir-objects)
@ -26,6 +26,8 @@ AC_LANG(C++)
 AC_PROG_CXX
 AC_OPENMP
 AC_PROG_RANLIB
 AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)
--- a/lib/Grid_config.h
+++ b/lib/Grid_config.h
@ -16,6 +16,9 @@
 /* GRID_COMMS_NONE */
 #define GRID_COMMS_NONE 1
 /* define if the compiler supports basic C++11 syntax */
 /* #undef HAVE_CXX11 */
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #define HAVE_DECL_BE64TOH 1
--- a/lib/Grid_config.h.in
+++ b/lib/Grid_config.h.in
@ -15,6 +15,9 @@
 /* GRID_COMMS_NONE */
 #undef GRID_COMMS_NONE
 /* define if the compiler supports basic C++11 syntax */
 #undef HAVE_CXX11
 /* Define to 1 if you have the declaration of `be64toh', and to 0 if you
   don't. */
 #undef HAVE_DECL_BE64TOH
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@ -13,28 +13,6 @@
 typedef uint32_t Integer;
 #ifdef SSE4
 #include <pmmintrin.h>
 #endif
 #if defined(AVX1) || defined (AVX2)
 #include <immintrin.h>
 // _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
 #ifndef _mm256_set_m128i
 #define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
 #endif
 #endif
 #ifdef AVX512
 #include <immintrin.h>
 #ifndef KNC_ONLY_STORES
 #define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
 #define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
 #endif
 #endif
 namespace Grid {
  typedef  float  RealF;
@ -66,45 +44,49 @@ namespace Grid {
  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
  inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
-
+  
-    ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
-    //Provide support functions for basic real and complex data types required by Grid
+  //Provide support functions for basic real and complex data types required by Grid
-    //Single and double precision versions. Should be able to template this once only.
+  //Single and double precision versions. Should be able to template this once only.
-    ////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////////////////////////////////////////////////////////////
-    inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
+  inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
-    inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
+  inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
-    inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
+  inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
-    inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
+  inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
-    // conjugate already supported for complex
+  // conjugate already supported for complex
-    
+  
-    inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
+  inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
-    inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
+  inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
-    inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
+  inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
-    inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
+  inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
-
+  
-    //conjugate already supported for complex
+  //conjugate already supported for complex
-
+  
-    inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
+  inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
-    inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
+  inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
-    inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
+  inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
-    inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+  inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
-    inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
+  inline void timesI(ComplexF &ret,const ComplexF &r)     { ret = timesI(r);}
-    inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
+  inline void timesI(ComplexD &ret,const ComplexD &r)     { ret = timesI(r);}
-    inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
+  inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
-    inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
+  inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
-
+  
-    inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){  *y = (*a) * (*x)+(*y);}
+  inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){  *y = (*a) * (*x)+(*y);}
-    inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
+  inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
-    inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
+  inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
-    inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
+  inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
-    
+  
-    inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){  *y = (*a) * (*x)+(*y); }
+  inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){  *y = (*a) * (*x)+(*y); }
-    inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
+  inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
-    inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
+  inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
-    inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
+  inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
-    
+  
-
+  inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
-
+  inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
  inline void vstream(RealF &l, const RealF &r){ l=r;}
  inline void vstream(RealD &l, const RealD &r){ l=r;}
  class Zero{};
  static Zero zero;
  template<class itype> inline void zeroit(itype &arg){ arg=zero;};
@ -113,7 +95,6 @@ namespace Grid {
  template<>            inline void zeroit(RealF &arg){ arg=0; };
  template<>            inline void zeroit(RealD &arg){ arg=0; };
 #if defined (SSE4)
    typedef __m128 fvec;
    typedef __m128d dvec;
@ -245,56 +226,12 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
      default: assert(0); break;
      }
    };
 };
-#include <simd/Grid_vInteger.h>
+#include <simd/Grid_vector_types.h>
 #include <simd/Grid_vRealF.h>
 #include <simd/Grid_vRealD.h>
 #include <simd/Grid_vComplexF.h>
 #include <simd/Grid_vComplexD.h>
 namespace Grid {
  // NB: Template the following on "type Complex" and then implement *,+,- for 
  // ComplexF, ComplexD, RealF, RealD above to
  // get full generality of binops with scalars.
   inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
   inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
   inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
   inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
   inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
   inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
   inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
   inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
   inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
   inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
   inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
  // Default precision
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
  typedef vRealD vReal;
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -93,12 +93,10 @@ nobase_include_HEADERS = algorithms/approx/bigfloat.h		\
 	qcd/Grid_qcd_2spinor.h					\
 	qcd/Grid_qcd_dirac.h					\
 	qcd/Grid_qcd_wilson_dop.h				\
 	simd/Grid_vComplexD.h					\
 	simd/Grid_vComplexF.h					\
 	simd/Grid_vInteger.h					\
 	simd/Grid_vRealD.h					\
 	simd/Grid_vRealF.h					\
 	simd/Grid_vector_types.h				\
-	simd/Grid_sse4.h					
+	simd/Grid_sse4.h					\
 	simd/Grid_avx.h						\
 	simd/Grid_knc.h					
--- a/lib/cshift/Grid_cshift_common.h
+++ b/lib/cshift/Grid_cshift_common.h
@ -154,26 +154,35 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
    cbmask=0x3;
  }
  int ro  = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int lo  = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane 
 PARALLEL_NESTED_LOOP2
  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-      
+      /*
      int o =n*rhs._grid->_slice_stride[dimension];
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
      if ( ocb&cbmask ) {
 	lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
      }
-      
+      */
      int o =n*rhs._grid->_slice_stride[dimension]+b;
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
      if ( ocb&cbmask ) {
 	//lhs._odata[lo+o]=rhs._odata[ro+o];
 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
      }
    }
  }
 }
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
  int rd = rhs._grid->_rdimensions[dimension];
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@ -212,6 +212,16 @@ public:
  iMatrix(const Zero &z){ *this = zero; };
  iMatrix() =default;
  iMatrix& operator=(const iMatrix& rhs){
    for(int i=0;i<N;i++)
      for(int j=0;j<N;j++)
 	vstream(_internal[i][j],rhs._internal[i][j]);
    return *this;
  }; 
  iMatrix(scalar_type s)  { (*this) = s ;};// recurse down and hit the constructor for vector_type
  /*
@ -220,6 +230,9 @@ public:
  iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> &copyme) = default;
  iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
  */
  iMatrix<vtype,N> & operator= (const Zero &hero){
    zeroit(*this);
    return *this;
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@ -0,0 +1,399 @@
 //----------------------------------------------------------------------
 /*! @file Grid_avx.h
  @brief Optimization libraries for AVX1/2 instructions set
  Using intrinsics
 */
 // Time-stamp: <2015-05-22 18:58:27 neo>
 //----------------------------------------------------------------------
 #include <immintrin.h>
 // _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
 #ifndef _mm256_set_m128i
 #define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
 #endif
 namespace Optimization {
  struct Vsplat{
    //Complex float
    inline __m256 operator()(float a, float b){
      return _mm256_set_ps(b,a,b,a,b,a,b,a);
    }
    // Real float
    inline __m256 operator()(float a){
      return _mm256_set_ps(a,a,a,a,a,a,a,a);
    }
    //Complex double
    inline __m256d operator()(double a, double b){
      return _mm256_set_pd(b,a,b,a);
    }
    //Real double
    inline __m256d operator()(double a){
      return _mm256_set_pd(a,a,a,a);
    }
    //Integer
    inline __m256i operator()(Integer a){
      return _mm256_set1_epi32(a);
    }
  };
  struct Vstore{
    //Float 
    inline void operator()(__m256 a, float* F){
      _mm256_store_ps(F,a);
    }
    //Double
    inline void operator()(__m256d a, double* D){
      _mm256_store_pd(D,a);
    }
    //Integer
    inline void operator()(__m256i a, Integer* I){
      _mm256_store_si256((__m256i*)I,a);
    }
  };
  struct Vstream{
    //Float
    inline void operator()(float * a, __m256 b){
      _mm256_stream_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m256d b){
      _mm256_stream_pd(a,b);
    }
  };
  struct Vset{
    // Complex float 
    inline __m256 operator()(Grid::ComplexF *a){
      return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Complex double 
    inline __m256d operator()(Grid::ComplexD *a){
      return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Real float 
    inline __m256 operator()(float *a){
      return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Real double
    inline __m256d operator()(double *a){
      return _mm256_set_pd(a[3],a[2],a[1],a[0]);
    }
    // Integer
    inline __m256i operator()(Integer *a){
      return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
  };
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
    //General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    //Complex/Real float
    inline __m256 operator()(__m256 a, __m256 b){
      return _mm256_add_ps(a,b);
    }
    //Complex/Real double
    inline __m256d operator()(__m256d a, __m256d b){
      return _mm256_add_pd(a,b);
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
 #if defined (AVX1) 
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
          b0 = _mm256_extractf128_si256(b,0);
          a1 = _mm256_extractf128_si256(a,1);
          b1 = _mm256_extractf128_si256(b,1);
          a0 = _mm_add_epi32(a0,b0);
          a1 = _mm_add_epi32(a1,b1);
          return _mm256_set_m128i(a1,a0);
 #endif
 #if defined (AVX2)
            return _mm256_add_epi32(a,b);
 #endif
    }
  };
  struct Sub{
    //Complex/Real float
    inline __m256 operator()(__m256 a, __m256 b){
      return _mm256_sub_ps(a,b);
    }
    //Complex/Real double
    inline __m256d operator()(__m256d a, __m256d b){
      return _mm256_sub_pd(a,b);
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
 #if defined (AVX1) 
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
          b0 = _mm256_extractf128_si256(b,0);
          a1 = _mm256_extractf128_si256(a,1);
          b1 = _mm256_extractf128_si256(b,1);
          a0 = _mm_sub_epi32(a0,b0);
          a1 = _mm_sub_epi32(a1,b1);
          return _mm256_set_m128i(a1,a0);
 #endif
 #if defined (AVX2)
            return _mm256_sub_epi32(a,b);
 #endif
    }
  };
  struct MultComplex{
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
      __m256 ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
      // FIXME AVX2 could MAC
      ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
      ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
      ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
      return _mm256_addsub_ps(ymm0,ymm1);  
    }
    // Complex double
    inline __m256d operator()(__m256d a, __m256d b){
      //Multiplication of (ak+ibk)*(ck+idk)
      // a + i b can be stored as a data structure
      //From intel optimisation reference guide
      /*
 	movsldup xmm0, Src1; load real parts into the destination,
 	; a1, a1, a0, a0
 	movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
 	mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
 	shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
 	movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
 	mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
 	addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
 	VSHUFPD (VEX.256 encoded version)
 	IF IMM0[0] = 0
 	THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
 	IF IMM0[1] = 0
 	THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
 	IF IMM0[2] = 0
 	THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
 	IF IMM0[3] = 0
 	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
      */
      __m256d ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
      ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi  b'01,01 
      ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai  b'11,11
      ymm1 = _mm256_mul_pd(ymm1,ymm2);   // ymm1 <- br ai, ai bi
      return _mm256_addsub_pd(ymm0,ymm1);
    }
  };
  struct Mult{
    // Real float
    inline __m256 operator()(__m256 a, __m256 b){
      return _mm256_mul_ps(a,b);
    }
    // Real double
    inline __m256d operator()(__m256d a, __m256d b){
      return _mm256_mul_pd(a,b);
    }
    // Integer
    inline __m256i operator()(__m256i a, __m256i b){
 #if defined (AVX1) 
      __m128i a0,a1;
      __m128i b0,b1;
      a0 = _mm256_extractf128_si256(a,0);
      b0 = _mm256_extractf128_si256(b,0);
      a1 = _mm256_extractf128_si256(a,1);
      b1 = _mm256_extractf128_si256(b,1);
      a0 = _mm_mul_epi32(a0,b0);
      a1 = _mm_mul_epi32(a1,b1);
      return _mm256_set_m128i(a1,a0);
 #endif
 #if defined (AVX2)
      return _mm256_mul_epi32(a,b);
 #endif
    }
  };
  struct Conj{
    // Complex single
    inline __m256 operator()(__m256 in){
      return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
    }
    // Complex double
    inline __m256d operator()(__m256d in){
      return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested
      /*
 	// original 
 	//      addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
 	__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5));
 	return _mm256_shuffle_pd(tmp,tmp,0x5);
      */
    }
    // do not define for integer input
  };
  struct TimesMinusI{
    //Complex single
    inline __m256 operator()(__m256 in, __m256 ret){
      __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in);   // r,-i
      return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
    }
    //Complex double
    inline __m256d operator()(__m256d in, __m256d ret){
      __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
      return _mm256_shuffle_pd(tmp,tmp,0x5);
    }
  };
  struct TimesI{
    //Complex single
    inline __m256 operator()(__m256 in, __m256 ret){
      __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
      return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);          // i,-r
    }
    //Complex double
    inline __m256d operator()(__m256d in, __m256d ret){
      __m256d tmp = _mm256_shuffle_pd(in,in,0x5);
      return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
    }
  };
  //////////////////////////////////////////////
  // Some Template specialization
  template < typename vtype > 
    void permute(vtype a, vtype b, int perm) {
    union { 
      __m256 f;
      vtype v;
    } conv;
    conv.v = b;
    switch (perm){
      // 8x32 bits=>3 permutes
    case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
    case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
    case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
    default: assert(0); break;
    }
    a = conv.v;
  }
  //Complex float Reduce
  template<>
    inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
    __m256 v1,v2;
    Optimization::permute(v1,in,0); // sse 128; paired complex single
    v1 = _mm256_add_ps(v1,in);
    Optimization::permute(v2,v1,1); // avx 256; quad complex single
    v1 = _mm256_add_ps(v1,v2);
    return Grid::ComplexF(v1[0],v1[1]);
  }
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
    __m256 v1,v2;
    Optimization::permute(v1,in,0); // avx 256; octo-double
    v1 = _mm256_add_ps(v1,in);
    Optimization::permute(v2,v1,1); 
    v1 = _mm256_add_ps(v1,v2);
    Optimization::permute(v2,v1,2); 
    v1 = _mm256_add_ps(v1,v2);
    return v1[0];
  }
  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
    __m256d v1;
    Optimization::permute(v1,in,0); // sse 128; paired complex single
    v1 = _mm256_add_pd(v1,in);
    return Grid::ComplexD(v1[0],v1[1]);
  }
  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
    __m256d v1,v2;
    Optimization::permute(v1,in,0); // avx 256; quad double
    v1 = _mm256_add_pd(v1,in);
    Optimization::permute(v2,v1,1); 
    v1 = _mm256_add_pd(v1,v2);
    return v1[0];
  }
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
    // FIXME unimplemented
    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
 }
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
 namespace Grid {
  typedef __m256  SIMD_Ftype; // Single precision type
  typedef __m256d SIMD_Dtype; // Double precision type
  typedef __m256i SIMD_Itype; // Integer type
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
 }
--- a/lib/simd/Grid_knc.h
+++ b/lib/simd/Grid_knc.h
@ -0,0 +1,323 @@
 //----------------------------------------------------------------------
 /*! @file Grid_knc.h
  @brief Optimization libraries for AVX512 instructions set for KNC
  Using intrinsics
 */
 // Time-stamp: <2015-05-22 17:12:44 neo>
 //----------------------------------------------------------------------
 #include <immintrin.h>
 #ifndef KNC_ONLY_STORES
 #define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
 #define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
 #endif
 namespace Optimization {
  struct Vsplat{
    //Complex float
    inline __m512 operator()(float a, float b){
      return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
    }
    // Real float
    inline __m512 operator()(float a){
      return _mm512_set1_ps(a);
    }
    //Complex double
    inline __m512d operator()(double a, double b){
      return _mm512_set_pd(b,a,b,a,b,a,b,a);
    }
    //Real double
    inline __m512d operator()(double a){
      return _mm512_set1_pd(a);
    }
    //Integer
    inline __m512i operator()(Integer a){
      return _mm512_set1_epi32(a);
    }
  };
  struct Vstore{
    //Float 
    inline void operator()(__m512 a, float* F){
      _mm512_store_ps(F,a);
    }
    //Double
    inline void operator()(__m512d a, double* D){
      _mm512_store_pd(D,a);
    }
    //Integer
    inline void operator()(__m512i a, Integer* I){
      _mm512_store_si512((__m512i *)I,a);
    }
  };
  struct Vstream{
    //Float
    inline void operator()(float * a, __m512 b){
      _mm512_storenrngo_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m512d b){
      _mm512_storenrngo_pd(a,b);
    }
  };
  struct Vset{
    // Complex float 
    inline __m512 operator()(Grid::ComplexF *a){
      return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
 			   a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
 			   a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
 			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Complex double 
    inline __m512d operator()(Grid::ComplexD *a){
      return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
 			   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
    }
    // Real float 
    inline __m512 operator()(float *a){
      return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
 			    a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Real double
    inline __m512d operator()(double *a){
      return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
    // Integer
    inline __m512i operator()(Integer *a){
      return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
 			       a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
    }
  };
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
    //General form must generate error if compiled
    inline Out_type operator()(In_type in){
      printf("Error, using wrong Reduce function\n");
      exit(1);
      return 0;
    }
  };
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    //Complex/Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_add_ps(a,b);
    }
    //Complex/Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_add_pd(a,b);
    }
    //Integer
    inline __m512i operator()(__m512i a, __m512i b){
      return _mm512_add_epi32(a,b);
    }
  };
  struct Sub{
    //Complex/Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_sub_ps(a,b);
    }
    //Complex/Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_sub_pd(a,b);
    }
    //Integer
    inline __m512i operator()(__m512i a, __m512i b){
      return _mm512_sub_epi32(a,b);
    }
  };
  struct MultComplex{
    // Complex float
    inline __m512 operator()(__m512 a, __m512 b){
      __m512 vzero,ymm0,ymm1,real, imag;
      vzero = _mm512_setzero_ps();
      ymm0  = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // 
      real  = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
      imag  = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
      ymm1  = _mm512_mul_ps(real, b);
      ymm0  = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
      return _mm512_fmadd_ps(ymm0,imag,ymm1);
    }
    // Complex double
    inline __m512d operator()(__m512d a, __m512d b){
      /* This is from
       * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets 
       * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
       * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
       * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
       * booktitle = {Proceedings of the International Conference on Supercomputing},
       * series = {ICS '11},
       * year = {2011},
       * isbn = {978-1-4503-0102-2},
       * location = {Tucson, Arizona, USA},
       * pages = {265--274},
       * numpages = {10},
       * url = {http://doi.acm.org/10.1145/1995896.1995938},
       * doi = {10.1145/1995896.1995938},
       * acmid = {1995938},
       * publisher = {ACM},
       * address = {New York, NY, USA},
       * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
       *                } 
       */
      __m512d vzero,ymm0,ymm1,real,imag;
      vzero =_mm512_setzero_pd();
      ymm0 =  _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // 
      real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
      imag =  _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
      ymm1 =  _mm512_mul_pd(real, b);
      ymm0 =  _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
      return  _mm512_fmadd_pd(ymm0,imag,ymm1);
    }
  };
  struct Mult{
    // Real float
    inline __m512 operator()(__m512 a, __m512 b){
      return _mm512_mul_ps(a,b);
    }
    // Real double
    inline __m512d operator()(__m512d a, __m512d b){
      return _mm512_mul_pd(a,b);
    }
    // Integer
    inline __m512i operator()(__m512i a, __m512i b){
      return _mm512_mullo_epi32(a,b);
    }
  };
  struct Conj{
    // Complex single
    inline __m512 operator()(__m512 in){
      return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag  
    }
    // Complex double
    inline __m512d operator()(__m512d in){
      return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
    }
    // do not define for integer input
  };
  struct TimesMinusI{
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
      return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
      return  _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
    }
  };
  struct TimesI{
    //Complex single
    inline __m512 operator()(__m512 in, __m512 ret){
      __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
    }
    //Complex double
    inline __m512d operator()(__m512d in, __m512d ret){
      __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
    }
  };
  //////////////////////////////////////////////
  // Some Template specialization
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
    return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
  }
  //Real float Reduce
  template<>
  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
    return _mm512_reduce_add_ps(in);
  }
  //Complex double Reduce
  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
    return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
  }
  //Real double Reduce
  template<>
  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
    return _mm512_reduce_add_pd(in);
  }
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
    // FIXME unimplemented
    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
 }
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
 namespace Grid {
  typedef __m512 SIMD_Ftype;  // Single precision type
  typedef __m512d SIMD_Dtype; // Double precision type
  typedef __m512i SIMD_Itype; // Integer type
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
 }
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@ -0,0 +1,272 @@
 //----------------------------------------------------------------------
 /*! @file Grid_qpx.h
  @brief Optimization libraries for QPX instructions set for BG/Q
  Using intrinsics
 */
 // Time-stamp: <2015-05-22 17:29:26 neo>
 //----------------------------------------------------------------------
 // lot of undefined functions
 namespace Optimization {
  struct Vsplat{
    //Complex float
    inline float operator()(float a, float b){
      return {a,b,a,b};
    }
    // Real float
    inline float operator()(float a){
      return {a,a,a,a};
    }
    //Complex double
    inline vector4double operator()(double a, double b){
      return {a,b,a,b};
    }
    //Real double
    inline vector4double operator()(double a){
      return {a,a,a,a};
    }
    //Integer
    inline int operator()(Integer a){
 #error
    }
  };
  struct Vstore{
    //Float 
    inline void operator()(float a, float* F){
      assert(0);
    }
    //Double
    inline void operator()(vector4double a, double* D){
      assert(0);
    }
    //Integer
    inline void operator()(int a, Integer* I){
      assert(0);
    }
  };
  struct Vstream{
    //Float
    inline void operator()(float * a, float b){
      assert(0);
    }
    //Double
    inline void operator()(double * a, vector4double b){
      assert(0);
    }
  };
  struct Vset{
    // Complex float 
    inline float operator()(Grid::ComplexF *a){
      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
    }
    // Complex double 
    inline vector4double operator()(Grid::ComplexD *a){
      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
    }
    // Real float 
    inline float operator()(float *a){
      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
    }
    // Real double
    inline vector4double operator()(double *a){
      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
    }
    // Integer
    inline int operator()(Integer *a){
 #error
    }
  };
  template <typename Out_type, typename In_type>
    struct Reduce{
      //Need templated class to overload output type
      //General form must generate error if compiled
      inline Out_type operator()(In_type in){
 	printf("Error, using wrong Reduce function\n");
 	exit(1);
 	return 0;
      }
    };
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    //Complex/Real float
    inline float operator()(float a, float b){
 #error
    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_add(a,b);
    }
    //Integer
    inline int operator()(int a, int b){
 #error
    }
  };
  struct Sub{
    //Complex/Real float
    inline float operator()(float a, float b){
 #error
    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
 #error
    }
    //Integer
    inline floati operator()(int a, int b){
 #error
    }
  };
  struct MultComplex{
    // Complex float
    inline float operator()(float a, float b){
 #error
    }
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
 #error
    }
  };
  struct Mult{
    // Real float
    inline float operator()(float a, float b){
 #error
    }
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
 #error
    }
    // Integer
    inline int operator()(int a, int b){
 #error
    }
  };
  struct Conj{
    // Complex single
    inline float operator()(float in){
      assert(0);
    }
    // Complex double
    inline vector4double operator()(vector4double in){
      assert(0);
    }
    // do not define for integer input
  };
  struct TimesMinusI{
    //Complex single
    inline float operator()(float in, float ret){
      assert(0);
    }
    //Complex double
    inline vector4double operator()(vector4double in, vector4double ret){
      assert(0);
    }
  };
  struct TimesI{
    //Complex single
    inline float operator()(float in, float ret){
    }
    //Complex double
    inline vector4double operator()(vector4double in, vector4double ret){
    }
  };
  //////////////////////////////////////////////
  // Some Template specialization
  //Complex float Reduce
  template<>
    inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
    assert(0);
  }
  //Real float Reduce
  template<>
    inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
    assert(0);
  }
  //Complex double Reduce
  template<>
    inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
    assert(0);
  }
  //Real double Reduce
  template<>
    inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
    assert(0);
  }
  //Integer Reduce
  template<>
    inline Integer Reduce<Integer, floati>::operator()(float in){
    assert(0);
  }
 }
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
 namespace Grid {
  typedef float SIMD_Ftype  __attribute__ ((vector_size (16)));         // Single precision type
  typedef vector4double SIMD_Dtype; // Double precision type
  typedef int SIMD_Itype;           // Integer type
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
 }
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@ -4,7 +4,7 @@
  Using intrinsics
 */
-// Time-stamp: <2015-05-20 16:45:39 neo>
+// Time-stamp: <2015-05-21 18:06:30 neo>
 //----------------------------------------------------------------------
 #include <pmmintrin.h>
@ -53,12 +53,12 @@ namespace Optimization {
  struct Vstream{
    //Float
-    inline void operator()(__m128 a, __m128 b){
+    inline void operator()(float * a, __m128 b){
-      _mm_stream_ps((float *)&a,b);
+      _mm_stream_ps(a,b);
    }
    //Double
-    inline void operator()(__m128d a, __m128d b){
+    inline void operator()(double * a, __m128d b){
-      _mm_stream_pd((double *)&a,b);
+      _mm_stream_pd(a,b);
    }
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@ -2,13 +2,23 @@
 /*! @file Grid_vector_types.h
  @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-20 17:31:55 neo>
+// Time-stamp: <2015-05-26 13:44:54 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
 #ifdef SSE4
 #include "Grid_sse4.h"
-
+#endif
 #if defined (AVX1)|| defined (AVX2)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
 #include "Grid_knc.h"
 #endif
 #if defined QPX
 #include "Grid_qpx.h"
 #endif
 namespace Grid {
@ -25,8 +35,6 @@ namespace Grid {
  template <typename Condition, typename ReturnType> using EnableIf   =    Invoke<std::enable_if<Condition::value, ReturnType>>;
  template <typename Condition, typename ReturnType> using NotEnableIf=    Invoke<std::enable_if<!Condition::value, ReturnType>>;
  ////////////////////////////////////////////////////////
  // Check for complexity with type traits
  template <typename T>     struct is_complex : std::false_type {};
@ -36,18 +44,71 @@ namespace Grid {
  // general forms to allow for vsplat syntax
  // need explicit declaration of types when used since
  // clang cannot automatically determine the output type sometimes
  // use decltype?
  template < class Out, class Input1, class Input2, class Operation > 
    Out binary(Input1 src_1, Input2 src_2, Operation op){
    return op(src_1, src_2);
  } 
-  template < class SIMDout, class Input, class Operation > 
+  template < class Out, class Input, class Operation > 
-    SIMDout unary(Input src, Operation op){
+    Out unary(Input src, Operation op){
    return op(src);
  } 
  ///////////////////////////////////////////////
 //////////////////////////////////////////////////////////
 // Permute
 // Permute 0 every ABCDEFGH -> BA DC FE HG
 // Permute 1 every ABCDEFGH -> CD AB GH EF
 // Permute 2 every ABCDEFGH -> EFGH ABCD
 // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
 // Permute 4 possible on half precision @512bit vectors.
 //////////////////////////////////////////////////////////
 template<class vsimd>
 inline void Gpermute(vsimd &y,const vsimd &b,int perm){
 	union { 
 	  SIMD_Ftype f;
 	  decltype(vsimd::v) v;
 	} conv;
 	conv.v = b.v;
      switch (perm){
 #if defined(AVX1)||defined(AVX2)
      // 8x32 bits=>3 permutes
      case 2: 
 	conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); 
 	break;
      case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
      case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
 #endif
 #ifdef SSE4
      case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
      case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break;
 #endif
 #ifdef AVX512
 	// 16 floats=> permutes
        // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo 
        // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn 
        // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
        // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
      case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
      case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
      case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
      case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
 #endif
 #ifdef QPX
 #error not implemented
 #endif
      default: assert(0); break;
      }
      y.v=conv.v;
 };
 ///////////////////////////////////////
  /*
    @brief Grid_simd class for the SIMD vector type operations
   */
@ -56,27 +117,34 @@ namespace Grid {
  public:
    typedef typename RealPart < Scalar_type >::type Real; 
    typedef Vector_type     vector_type;
    typedef Scalar_type     scalar_type;
    Vector_type v;
-
+    
    static inline int Nsimd(void) { return sizeof(Vector_type)/sizeof(Scalar_type);}
-
+    
    // Constructors
    Grid_simd & operator = ( Zero & z){
      vzero(*this);
      return (*this);
    }
    Grid_simd(){};
    Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;};
    Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler
    Grid_simd()=default; 
    Grid_simd(const Grid_simd& rhs):v(rhs.v){};    //compiles in movaps
    Grid_simd(const Grid_simd&& rhs):v(rhs.v){};  
    //Enable if complex type
    template < class S = Scalar_type > 
-    Grid_simd(typename std::enable_if< is_complex < S >::value, S>::type a){
+    Grid_simd(const typename std::enable_if< is_complex < S >::value, S>::type a){
      vsplat(*this,a);
    };
-    Grid_simd(Real a){
+    Grid_simd(const Real a){
      vsplat(*this,Scalar_type(a));
    };
@ -88,18 +156,25 @@ namespace Grid {
    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
-    //not for integer types... FIXME
+
    friend inline void mac (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ a,const Grid_simd   *__restrict__ x){ *y = (*a)*(*x)+(*y); };
    friend inline void mult(Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd   *__restrict__ r){ *y = (*l) * (*r); }
    friend inline void sub (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd   *__restrict__ r){ *y = (*l) - (*r); }
    friend inline void add (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd   *__restrict__ r){ *y = (*l) + (*r); }
    friend inline void mac (Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ a,const Scalar_type *__restrict__ x){ *y = (*a)*(*x)+(*y); };
    friend inline void mult(Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) * (*r); }
    friend inline void sub (Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) - (*r); }
    friend inline void add (Grid_simd *__restrict__ y,const Grid_simd   *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) + (*r); }
    //not for integer types... 
    template <  class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 > 
    friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
    ///////////////////////////////////////////////
    // Initialise to 1,0,i for the correct types
    ///////////////////////////////////////////////
    // if not complex overload here 
    template <  class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 > 
      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0); }
    template <  class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 > 
      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0); }
    // For complex types
    template <  class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 > 
      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0,0.0); }
@ -107,6 +182,14 @@ namespace Grid {
      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0,0.0); }// use xor?
    template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 > 
      friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);} 
    // if not complex overload here 
    template <  class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 > 
      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0); }
    template <  class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 > 
      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0); }
    // For integral types
    template <  class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 > 
@ -116,7 +199,7 @@ namespace Grid {
    template <  class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 > 
      friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
    template <  class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 > 
-      friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
+      friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);}
    ////////////////////////////////////
    // Arithmetic operator overloads +,-,*
@ -192,8 +275,9 @@ namespace Grid {
    ///////////////////////
    // Vstream
    ///////////////////////
    template <  class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 > 
    friend inline void vstream(Grid_simd &out,const Grid_simd &in){
-      binary<void>(out.v, in.v, VstreamSIMD());
+      binary<void>((Real*)&out.v, in.v, VstreamSIMD());
    }
    template <  class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 > 
@ -291,7 +375,7 @@ namespace Grid {
    // Unary negation
    ///////////////////////
    friend inline Grid_simd operator -(const Grid_simd &r) {
-      vComplexF ret;
+      Grid_simd ret;
      vzero(ret);
      ret = ret - r;
      return ret;
@ -336,7 +420,7 @@ namespace Grid {
  }
  template<class scalar_type, class vector_type >
-  inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
+    inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
  template<class scalar_type, class vector_type >
@ -354,33 +438,15 @@ namespace Grid {
  // Define available types (now change names to avoid clashing with the rest of the code)
-  typedef Grid_simd< float                 , SIMD_Ftype > MyRealF;
+  typedef Grid_simd< float                 , SIMD_Ftype > vRealF;
-  typedef Grid_simd< double                , SIMD_Dtype > MyRealD;
+  typedef Grid_simd< double                , SIMD_Dtype > vRealD;
-  typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
+  typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
-  typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;
+  typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
  typedef Grid_simd< Integer               , SIMD_Itype > vInteger;
  ////////////////////////////////////////////////////////////////////
  // Temporary hack to keep independent from the rest of the code
  template<> struct isGridTensor<MyRealD > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<MyRealF > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<MyComplexD > {
    static const bool value = false;
    static const bool notvalue = true;
  };
  template<> struct isGridTensor<MyComplexF > {
    static const bool value = false;
    static const bool notvalue = true;
  };
--- a/lib/simd/Old/Grid_vComplexD.h
+++ b/lib/simd/Old/Grid_vComplexD.h
--- a/lib/simd/Old/Grid_vComplexF.h
+++ b/lib/simd/Old/Grid_vComplexF.h
@ -54,7 +54,7 @@ namespace Grid {
        //////////////////////////////////
        friend inline void vone(vComplexF &ret)      { vsplat(ret,1.0,0.0); }
        friend inline void vzero(vComplexF &ret)     { vsplat(ret,0.0,0.0); }
-        friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0);}
+        friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0); }
        ////////////////////////////////////
        // Arithmetic operator overloads +,-,*
--- a/lib/simd/Old/Grid_vInteger.h
+++ b/lib/simd/Old/Grid_vInteger.h
--- a/lib/simd/Old/Grid_vRealD.h
+++ b/lib/simd/Old/Grid_vRealD.h
--- a/lib/simd/Old/Grid_vRealF.h
+++ b/lib/simd/Old/Grid_vRealF.h
--- a/m4/ax_cxx_compile_stdcxx_11.m4
+++ b/m4/ax_cxx_compile_stdcxx_11.m4
@ -0,0 +1,167 @@
 # ============================================================================
 #  http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
 # ============================================================================
 #
 # SYNOPSIS
 #
 #   AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional])
 #
 # DESCRIPTION
 #
 #   Check for baseline language coverage in the compiler for the C++11
 #   standard; if necessary, add switches to CXXFLAGS to enable support.
 #
 #   The first argument, if specified, indicates whether you insist on an
 #   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
 #   -std=c++11).  If neither is specified, you get whatever works, with
 #   preference for an extended mode.
 #
 #   The second argument, if specified 'mandatory' or if left unspecified,
 #   indicates that baseline C++11 support is required and that the macro
 #   should error out if no mode with that support is found.  If specified
 #   'optional', then configuration proceeds regardless, after defining
 #   HAVE_CXX11 if and only if a supporting mode is found.
 #
 # LICENSE
 #
 #   Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
 #   Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
 #   Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
 #   Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov@google.com>
 #
 #   Copying and distribution of this file, with or without modification, are
 #   permitted in any medium without royalty provided the copyright notice
 #   and this notice are preserved. This file is offered as-is, without any
 #   warranty.
 #serial 11
 m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[
  template <typename T>
    struct check
    {
      static_assert(sizeof(int) <= sizeof(T), "not big enough");
    };
    struct Base {
    virtual void f() {}
    };
    struct Child : public Base {
    virtual void f() override {}
    };
    typedef check<check<bool>> right_angle_brackets;
    int a;
    decltype(a) b;
    typedef check<int> check_type;
    check_type c;
    check_type&& cr = static_cast<check_type&&>(c);
    auto d = a;
    auto l = [](){};
    // Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
    struct use_l { use_l() { l(); } };
    // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
    // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
    namespace test_template_alias_sfinae {
        struct foo {};
        template<typename T>
        using member = typename T::member_type;
        template<typename T>
        void func(...) {}
        template<typename T>
        void func(member<T>*) {}
        void test();
        void test() {
            func<foo>(0);
        }
    }
 ]])
 AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
  m4_if([$1], [], [],
        [$1], [ext], [],
        [$1], [noext], [],
        [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
  m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
        [$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
        [$2], [optional], [ax_cxx_compile_cxx11_required=false],
        [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])
  AC_LANG_PUSH([C++])dnl
  ac_success=no
  AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
  ax_cv_cxx_compile_cxx11,
  [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
    [ax_cv_cxx_compile_cxx11=yes],
    [ax_cv_cxx_compile_cxx11=no])])
  if test x$ax_cv_cxx_compile_cxx11 = xyes; then
    ac_success=yes
  fi
  m4_if([$1], [noext], [], [dnl
  if test x$ac_success = xno; then
    for switch in -std=gnu++11 -std=gnu++0x; do
      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
      AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
                     $cachevar,
        [ac_save_CXXFLAGS="$CXXFLAGS"
         CXXFLAGS="$CXXFLAGS $switch"
         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
          [eval $cachevar=yes],
          [eval $cachevar=no])
         CXXFLAGS="$ac_save_CXXFLAGS"])
      if eval test x\$$cachevar = xyes; then
        CXXFLAGS="$CXXFLAGS $switch"
        ac_success=yes
        break
      fi
    done
  fi])
  m4_if([$1], [ext], [], [dnl
  if test x$ac_success = xno; then
    dnl HP's aCC needs +std=c++11 according to:
    dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
    for switch in -std=c++11 -std=c++0x +std=c++11; do
      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
      AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
                     $cachevar,
        [ac_save_CXXFLAGS="$CXXFLAGS"
         CXXFLAGS="$CXXFLAGS $switch"
         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
          [eval $cachevar=yes],
          [eval $cachevar=no])
         CXXFLAGS="$ac_save_CXXFLAGS"])
      if eval test x\$$cachevar = xyes; then
        CXXFLAGS="$CXXFLAGS $switch"
        ac_success=yes
        break
      fi
    done
  fi])
  AC_LANG_POP([C++])
  if test x$ax_cxx_compile_cxx11_required = xtrue; then
    if test x$ac_success = xno; then
      AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.])
    fi
  else
    if test x$ac_success = xno; then
      HAVE_CXX11=0
      AC_MSG_NOTICE([No compiler with C++11 support was found])
    else
      HAVE_CXX11=1
      AC_DEFINE(HAVE_CXX11,1,
                [define if the compiler supports basic C++11 syntax])
    fi
    AC_SUBST(HAVE_CXX11)
  fi
 ])
--- a/tests/Grid_main.cc
+++ b/tests/Grid_main.cc
@ -104,6 +104,9 @@ int main (int argc, char ** argv)
    random(FineRNG,scVec);
    fflush(stdout);
    cVec = cMat * cVec;  // LatticeColourVector     = LatticeColourMatrix     * LatticeColourVector
    sVec = sMat * sVec;  // LatticeSpinVector       = LatticeSpinMatrix       * LatticeSpinVector
    scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector
@ -113,12 +116,14 @@ int main (int argc, char ** argv)
    cMat = outerProduct(cVec,cVec);
    scalar = localInnerProduct(cVec,cVec);
    scalar += scalar;
    scalar -= scalar;
    scalar *= scalar;
    add(scalar,scalar,scalar);
    sub(scalar,scalar,scalar);
    mult(scalar,scalar,scalar);
    mac(scalar,scalar,scalar);
    scalar = scalar+scalar;
    scalar = scalar-scalar;
@ -142,7 +147,7 @@ int main (int argc, char ** argv)
    scalar=trace(scalar);
    scalar=localInnerProduct(cVec,cVec);
    scalar=localNorm2(cVec);
-
+    
 //     -=,+=,*=,()
 //     add,+,sub,-,mult,mac,*
 //     adj,conjugate
@ -154,51 +159,11 @@ int main (int argc, char ** argv)
 //     localNorm2
 //     localInnerProduct
    scMat = sMat*scMat;  // LatticeSpinColourMatrix = LatticeSpinMatrix       * LatticeSpinColourMatrix
 #ifdef SSE4
    ///////// Tests the new class Grid_simd 
    std::complex<double> ctest(3.0,2.0);
    std::complex<float> ctestf(3.0,2.0);
    MyComplexF TestMe1(1.0); // fills only real part
    MyComplexD TestMe2(ctest);
    MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
    //MyRealF TestMe5(ctest);// Must generate compiler error
    MyRealD TestRe1(2.0); 
    MyRealF TestRe2(3.0); 
    vone(TestRe2);
    MyComplexF TestMe6(ctestf);
    MyComplexF TestMe7(ctestf);  
    MyComplexD TheSum= TestMe2*TestMe3;
    MyComplexF TheSumF= TestMe6*TestMe7;
    double dsum[2];
    _mm_store_pd(dsum, TheSum.v);
    for (int i =0; i< 2; i++)
      printf("%f\n", dsum[i]);
    MyComplexD TheSumI = timesMinusI(TheSum);
    MyComplexF TheSumIF = timesMinusI(TheSumF);
    float fsum[4];
    _mm_store_ps(fsum, TheSumF.v);
    for (int i =0; i< 4; i++)
      printf("%f\n", fsum[i]);
    vstore(TheSumI, &ctest);
    std::complex<float> sum = Reduce(TheSumF);
    std::cout << ctest<< std::endl;
    std::cout << sum<< std::endl;
 #endif
    ///////////////////////
    // Non-lattice (const objects) * Lattice
    ColourMatrix cm;
    SpinColourMatrix scm;
@ -230,7 +195,7 @@ int main (int argc, char ** argv)
    scMat = scMat*mydouble;
    scMat = mydouble*scMat;
    cMat = mydouble*cMat;
-    
+  
    sMat = adj(sMat);       // LatticeSpinMatrix adjoint
    sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix
    sMat = GammaFive*sMat;  // SpinMatrix * LatticeSpinMatrix
@ -241,6 +206,9 @@ int main (int argc, char ** argv)
    scm=transpose(scm);
    scm=transposeIndex<1>(scm);
 //    Foo = Foo+scalar; // LatticeColourMatrix+Scalar
 //    Foo = Foo*scalar; // LatticeColourMatrix*Scalar
 //    Foo = Foo-scalar; // LatticeColourMatrix-Scalar
@ -280,7 +248,6 @@ int main (int argc, char ** argv)
      pokeIndex<1> (c_m,c,0,0);
    }
    FooBar = Bar;
    /*
@ -333,14 +300,14 @@ int main (int argc, char ** argv)
    // Lattice SU(3) x SU(3)
    Fine.Barrier();
    FooBar = Foo * Bar;
-    
+
    // Lattice 12x12 GEMM
    scFooBar = scFoo * scBar;
-    
+
    // Benchmark some simple operations LatticeSU3 * Lattice SU3.
    double t0,t1,flops;
    double bytes;
-    int ncall=100;
+    int ncall=5000;
    int Nc = Grid::QCD::Nc;
    LatticeGaugeField U(&Fine);
@ -352,19 +319,21 @@ int main (int argc, char ** argv)
    if ( Fine.IsBoss() ) {
      printf("%f flop and %f bytes\n",flops,bytes/ncall);
    }
-        FooBar = Foo * Bar;
+    FooBar = Foo * Bar;
    Fine.Barrier();
    t0=usecond();
    for(int i=0;i<ncall;i++){
      Fine.Barrier();
      mult(FooBar,Foo,Bar); // this is better
    }
    t1=usecond();
    Fine.Barrier();
    if ( Fine.IsBoss() ) {
 #ifdef OMP
      printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
 #endif
      printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
      printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
      printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
    }
@ -526,5 +495,9 @@ int main (int argc, char ** argv)
   } // loop for lat
 } // loop for omp
 std::cout << sizeof(vComplexF) << std::endl;
 Grid_finalize();
 }
--- a/tests/InvSqrt.gnu
+++ b/tests/InvSqrt.gnu
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
-bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez Grid_rng_fixed Grid_simd_new
+bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez Grid_rng_fixed 
 Grid_main_SOURCES = Grid_main.cc
 Grid_main_LDADD = -lGrid
@ -34,5 +34,5 @@ Grid_stencil_LDADD = -lGrid
 Grid_simd_SOURCES = Grid_simd.cc
 Grid_simd_LDADD = -lGrid
-Grid_simd_new_SOURCES = Grid_simd_new.cc
+#Grid_simd_new_SOURCES = Grid_simd_new.cc
-Grid_simd_new_LDADD = -lGrid
+#Grid_simd_new_LDADD = -lGrid
--- a/tests/Sqrt.gnu
+++ b/tests/Sqrt.gnu
@ -0,0 +1,2 @@
 f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));
 f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));
		`@ -1 +1 @@`
			`/opt/local/share/automake-1.15/config.guess`				`/usr/share/automake-1.14/config.guess`
		`@ -1 +1 @@`
			`/opt/local/share/automake-1.15/config.sub`				`/usr/share/automake-1.14/config.sub`
		`@ -0,0 +1,2 @@`
							`f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));`
							`f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));`