Mirror of https://github.com/paboyle/Grid.git

commit 48bb3ab4e7
Author: Peter Boyle
Date:   2015-05-26 20:04:08 +01:00

Conflicts:
	lib/Grid_simd.h

28 changed files with 1634 additions and 294 deletions

Makefile.in

@ -1,7 +1,7 @@
# Makefile.in generated by automake 1.15 from Makefile.am.
# Makefile.in generated by automake 1.14.1 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2014 Free Software Foundation, Inc.
# Copyright (C) 1994-2013 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -14,17 +14,7 @@
@SET_MAKE@
VPATH = @srcdir@
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
@ -89,12 +79,15 @@ build_triplet = @build@
host_triplet = @host@
target_triplet = @target@
subdir = .
DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
compile config.guess config.sub depcomp install-sh missing
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
$(am__configure_deps) $(am__DIST_COMMON)
am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
configure.lineno config.status.lineno
mkinstalldirs = $(install_sh) -d
@ -157,9 +150,6 @@ ETAGS = etags
CTAGS = ctags
CSCOPE = cscope
DIST_SUBDIRS = $(SUBDIRS)
am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
INSTALL NEWS README TODO compile config.guess config.sub \
depcomp install-sh missing
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
distdir = $(PACKAGE)-$(VERSION)
top_distdir = $(distdir)
@ -223,6 +213,7 @@ ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@ -325,6 +316,7 @@ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
$(am__cd) $(top_srcdir) && \
$(AUTOMAKE) --gnu Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
*config.status*) \
@ -531,15 +523,15 @@ dist-xz: distdir
$(am__post_remove_distdir)
dist-tarZ: distdir
@echo WARNING: "Support for distribution archives compressed with" \
"legacy program 'compress' is deprecated." >&2
@echo WARNING: "Support for shar distribution archives is" \
"deprecated." >&2
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
$(am__post_remove_distdir)
dist-shar: distdir
@echo WARNING: "Support for shar distribution archives is" \
"deprecated." >&2
@echo WARNING: "Support for distribution archives compressed with" \
"legacy program 'compress' is deprecated." >&2
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
$(am__post_remove_distdir)
@ -575,17 +567,17 @@ distcheck: dist
esac
chmod -R a-w $(distdir)
chmod u+w $(distdir)
mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
mkdir $(distdir)/_build $(distdir)/_inst
chmod a-w $(distdir)
test -d $(distdir)/_build || exit 0; \
dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
&& dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
&& am__cwd=`pwd` \
&& $(am__cd) $(distdir)/_build/sub \
&& ../../configure \
&& $(am__cd) $(distdir)/_build \
&& ../configure \
$(AM_DISTCHECK_CONFIGURE_FLAGS) \
$(DISTCHECK_CONFIGURE_FLAGS) \
--srcdir=../.. --prefix="$$dc_install_base" \
--srcdir=.. --prefix="$$dc_install_base" \
&& $(MAKE) $(AM_MAKEFLAGS) \
&& $(MAKE) $(AM_MAKEFLAGS) dvi \
&& $(MAKE) $(AM_MAKEFLAGS) check \
@ -759,8 +751,6 @@ uninstall-am:
maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
pdf-am ps ps-am tags tags-am uninstall uninstall-am
.PRECIOUS: Makefile
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.

TODO (4 lines changed)

@ -1,8 +1,8 @@
================================================================
*** Hacks and bug fixes to clean up and Audits
================================================================
* Base class to share common code between vRealF, VComplexF etc...
- Performance check on Guido's reimplementation strategy
* Base class to share common code between vRealF, VComplexF etc... done
- Performance check on Guido's reimplementation strategy - (GUIDO) tested and no difference was found, merged
* FIXME audit
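
The first item refers to the new simd/Grid_vector_types.h brought in by this commit: the separate vRealF/vRealD/vComplexF/vComplexD headers are dropped (see the lib/Makefile.am and lib/Grid_simd.h hunks below) in favour of one templated Grid_simd class layered over per-architecture functors. A minimal, self-contained sketch of that pattern follows; the member layout, the stand-in functors and main() are illustrative only, not the actual Grid implementation.

#include <cstdio>

namespace Sketch {

  // Stand-ins for the per-ISA functors (SumSIMD, MultSIMD, ...) that
  // Grid_sse4.h / Grid_avx.h / Grid_knc.h select at compile time.
  struct SumSIMD  { template<class V> V operator()(V a, V b) const { return a + b; } };
  struct MultSIMD { template<class V> V operator()(V a, V b) const { return a * b; } };

  // One templated wrapper instead of four hand-written vector classes.
  template <class Scalar, class Vector>
  struct Grid_simd {
    Vector v;   // the underlying SIMD register (a plain scalar stand-in here)

    friend Grid_simd operator+(Grid_simd a, Grid_simd b) {
      return Grid_simd{ SumSIMD()(a.v, b.v) };
    }
    friend Grid_simd operator*(Grid_simd a, Grid_simd b) {
      return Grid_simd{ MultSIMD()(a.v, b.v) };
    }
  };

  // The old concrete classes reduce to aliases of the one template.
  typedef Grid_simd<float,  float>  vRealF;
  typedef Grid_simd<double, double> vRealD;
}

int main() {
  Sketch::vRealF a{2.0f}, b{3.0f};
  std::printf("%f\n", static_cast<double>((a * b + a).v));   // prints 8.000000
  return 0;
}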

aclocal.m4 (vendored, 62 lines changed)

@ -1,6 +1,6 @@
# generated automatically by aclocal 1.15 -*- Autoconf -*-
# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
# Copyright (C) 2002-2014 Free Software Foundation, Inc.
# Copyright (C) 2002-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
# generated from the m4 files accompanying Automake X.Y.
# (This private macro should not be called outside this file.)
AC_DEFUN([AM_AUTOMAKE_VERSION],
[am__api_version='1.15'
[am__api_version='1.14'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
m4_if([$1], [1.15], [],
m4_if([$1], [1.14.1], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
[AM_AUTOMAKE_VERSION([1.15])dnl
[AM_AUTOMAKE_VERSION([1.14.1])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -103,14 +103,15 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# configured tree to be moved without reconfiguration.
AC_DEFUN([AM_AUX_DIR_EXPAND],
[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
# Expand $ac_aux_dir to an absolute path.
am_aux_dir=`cd "$ac_aux_dir" && pwd`
[dnl Rely on autoconf to set up CDPATH properly.
AC_PREREQ([2.50])dnl
# expand $ac_aux_dir to an absolute path
am_aux_dir=`cd $ac_aux_dir && pwd`
])
# AM_CONDITIONAL -*- Autoconf -*-
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -141,7 +142,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -332,7 +333,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
# Copyright (C) 1999-2014 Free Software Foundation, Inc.
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -408,7 +409,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -498,8 +499,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
# We need awk for the "check" target (and possibly the TAP driver). The
# system "awk" is bad on some platforms.
# We need awk for the "check" target. The system "awk" is bad on
# some platforms.
AC_REQUIRE([AC_PROG_AWK])dnl
AC_REQUIRE([AC_PROG_MAKE_SET])dnl
AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@ -572,11 +573,7 @@ to "yes", and re-run configure.
END
AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
fi
fi
dnl The trailing newline in this macro's definition is deliberate, for
dnl backward compatibility and to allow trailing 'dnl'-style comments
dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
])
fi])
dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not
dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@ -605,7 +602,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -616,7 +613,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
# Define $install_sh.
AC_DEFUN([AM_PROG_INSTALL_SH],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
if test x"${install_sh+set}" != xset; then
if test x"${install_sh}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@ -626,7 +623,7 @@ if test x"${install_sh+set}" != xset; then
fi
AC_SUBST([install_sh])])
# Copyright (C) 2003-2014 Free Software Foundation, Inc.
# Copyright (C) 2003-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -647,7 +644,7 @@ AC_SUBST([am__leading_dot])])
# Check to see how 'make' treats includes. -*- Autoconf -*-
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -697,7 +694,7 @@ rm -f confinc confmf
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
# Copyright (C) 1997-2014 Free Software Foundation, Inc.
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -736,7 +733,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -767,7 +764,7 @@ AC_DEFUN([_AM_IF_OPTION],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
# Copyright (C) 1996-2014 Free Software Foundation, Inc.
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -848,7 +845,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
# Copyright (C) 2009-2014 Free Software Foundation, Inc.
# Copyright (C) 2009-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -908,7 +905,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
# Copyright (C) 2001-2014 Free Software Foundation, Inc.
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -936,7 +933,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
# Copyright (C) 2006-2014 Free Software Foundation, Inc.
# Copyright (C) 2006-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -955,7 +952,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
# Copyright (C) 2004-2014 Free Software Foundation, Inc.
# Copyright (C) 2004-2013 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@ -1086,3 +1083,4 @@ AC_SUBST([am__tar])
AC_SUBST([am__untar])
]) # _AM_PROG_TAR
m4_include([m4/ax_cxx_compile_stdcxx_11.m4])

config.guess (vendored, 2 lines changed)

@ -1 +1 @@
/opt/local/share/automake-1.15/config.guess
/usr/share/automake-1.14/config.guess

config.sub (vendored, 2 lines changed)

@ -1 +1 @@
/opt/local/share/automake-1.15/config.sub
/usr/share/automake-1.14/config.sub

configure (vendored, 199 lines changed)

@ -633,6 +633,7 @@ BUILD_COMMS_MPI_TRUE
EGREP
GREP
CXXCPP
HAVE_CXX11
RANLIB
OPENMP_CXXFLAGS
am__fastdepCXX_FALSE
@ -2466,7 +2467,7 @@ test -n "$target_alias" &&
NONENONEs,x,x, &&
program_prefix=${target_alias}-
am__api_version='1.15'
am__api_version='1.14'
# Find a good install program. We prefer a C program (faster),
# so one script is as good as another. But avoid the broken or
@ -2638,8 +2639,8 @@ test "$program_suffix" != NONE &&
ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
# Expand $ac_aux_dir to an absolute path.
am_aux_dir=`cd "$ac_aux_dir" && pwd`
# expand $ac_aux_dir to an absolute path
am_aux_dir=`cd $ac_aux_dir && pwd`
if test x"${MISSING+set}" != xset; then
case $am_aux_dir in
@ -2658,7 +2659,7 @@ else
$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
fi
if test x"${install_sh+set}" != xset; then
if test x"${install_sh}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@ -2986,8 +2987,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
mkdir_p='$(MKDIR_P)'
# We need awk for the "check" target (and possibly the TAP driver). The
# system "awk" is bad on some platforms.
# We need awk for the "check" target. The system "awk" is bad on
# some platforms.
# Always define AMTAR for backward compatibility. Yes, it's still used
# in the wild :-( We should find a proper way to deprecate it ...
AMTAR='$${TAR-tar}'
@ -3046,7 +3047,6 @@ END
fi
ac_config_headers="$ac_config_headers lib/Grid_config.h"
# Check whether --enable-silent-rules was given.
@ -3966,6 +3966,191 @@ else
RANLIB="$ac_cv_prog_RANLIB"
fi
ax_cxx_compile_cxx11_required=true
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
ac_success=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features by default" >&5
$as_echo_n "checking whether $CXX supports C++11 features by default... " >&6; }
if ${ax_cv_cxx_compile_cxx11+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
struct Base {
virtual void f() {}
};
struct Child : public Base {
virtual void f() override {}
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;
typedef check<int> check_type;
check_type c;
check_type&& cr = static_cast<check_type&&>(c);
auto d = a;
auto l = [](){};
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
struct use_l { use_l() { l(); } };
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
namespace test_template_alias_sfinae {
struct foo {};
template<typename T>
using member = typename T::member_type;
template<typename T>
void func(...) {}
template<typename T>
void func(member<T>*) {}
void test();
void test() {
func<foo>(0);
}
}
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
ax_cv_cxx_compile_cxx11=yes
else
ax_cv_cxx_compile_cxx11=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_cxx_compile_cxx11" >&5
$as_echo "$ax_cv_cxx_compile_cxx11" >&6; }
if test x$ax_cv_cxx_compile_cxx11 = xyes; then
ac_success=yes
fi
if test x$ac_success = xno; then
for switch in -std=c++11 -std=c++0x +std=c++11; do
cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
if eval \${$cachevar+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_save_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $switch"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
struct Base {
virtual void f() {}
};
struct Child : public Base {
virtual void f() override {}
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;
typedef check<int> check_type;
check_type c;
check_type&& cr = static_cast<check_type&&>(c);
auto d = a;
auto l = [](){};
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
struct use_l { use_l() { l(); } };
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
namespace test_template_alias_sfinae {
struct foo {};
template<typename T>
using member = typename T::member_type;
template<typename T>
void func(...) {}
template<typename T>
void func(member<T>*) {}
void test();
void test() {
func<foo>(0);
}
}
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
eval $cachevar=yes
else
eval $cachevar=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CXXFLAGS="$ac_save_CXXFLAGS"
fi
eval ac_res=\$$cachevar
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
$as_echo "$ac_res" >&6; }
if eval test x\$$cachevar = xyes; then
CXXFLAGS="$CXXFLAGS $switch"
ac_success=yes
break
fi
done
fi
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
if test x$ax_cxx_compile_cxx11_required = xtrue; then
if test x$ac_success = xno; then
as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5
fi
else
if test x$ac_success = xno; then
HAVE_CXX11=0
{ $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
$as_echo "$as_me: No compiler with C++11 support was found" >&6;}
else
HAVE_CXX11=1
$as_echo "#define HAVE_CXX11 1" >>confdefs.h
fi
fi
# Checks for libraries.
#AX_GCC_VAR_ATTRIBUTE(aligned)

configure.ac

@ -3,9 +3,9 @@
#
# Project Grid package
#
# Time-stamp: <2015-05-19 13:51:08 neo>
# Time-stamp: <2015-05-25 14:54:34 neo>
AC_PREREQ([2.69])
AC_PREREQ([2.63])
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
AC_CANONICAL_SYSTEM
AM_INIT_AUTOMAKE(subdir-objects)
@ -26,6 +26,8 @@ AC_LANG(C++)
AC_PROG_CXX
AC_OPENMP
AC_PROG_RANLIB
AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
# Checks for libraries.
#AX_GCC_VAR_ATTRIBUTE(aligned)
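
The AX_CXX_COMPILE_STDCXX_11(noext, mandatory) line added above makes configure probe -std=c++11, -std=c++0x and +std=c++11 in turn, append the first switch that works to CXXFLAGS, and abort when none does (see the generated configure hunk earlier in this commit). Only a non-mandatory check would instead record the result as HAVE_CXX11 in lib/Grid_config.h; a hypothetical guard on that optional define would look like the sketch below (illustrative, not taken from Grid).

/* Hypothetical: with an optional (non-mandatory) C++11 check, configure would
   define HAVE_CXX11 in lib/Grid_config.h rather than abort, and sources could
   guard C++11-only constructs on it. With the mandatory check used here,
   C++11 support can simply be assumed. */
#if defined(HAVE_CXX11)
static_assert(sizeof(int) <= sizeof(long), "C++11 static_assert is available");
#else
/* pre-C++11 fallback path would go here */
#endif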

lib/Grid_config.h

@ -16,6 +16,9 @@
/* GRID_COMMS_NONE */
#define GRID_COMMS_NONE 1
/* define if the compiler supports basic C++11 syntax */
/* #undef HAVE_CXX11 */
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
don't. */
#define HAVE_DECL_BE64TOH 1

lib/Grid_config.h.in

@ -15,6 +15,9 @@
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
/* define if the compiler supports basic C++11 syntax */
#undef HAVE_CXX11
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
don't. */
#undef HAVE_DECL_BE64TOH

lib/Grid_simd.h

@ -13,28 +13,6 @@
typedef uint32_t Integer;
#ifdef SSE4
#include <pmmintrin.h>
#endif
#if defined(AVX1) || defined (AVX2)
#include <immintrin.h>
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
#endif
#endif
#ifdef AVX512
#include <immintrin.h>
#ifndef KNC_ONLY_STORES
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
#endif
#endif
namespace Grid {
typedef float RealF;
@ -67,42 +45,46 @@ namespace Grid {
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
// conjugate already supported for complex
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
// conjugate already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
//conjugate already supported for complex
//conjugate already supported for complex
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));}
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void vstream(ComplexF &l, const ComplexF &r){ l=r;}
inline void vstream(ComplexD &l, const ComplexD &r){ l=r;}
inline void vstream(RealF &l, const RealF &r){ l=r;}
inline void vstream(RealD &l, const RealD &r){ l=r;}
class Zero{};
@ -113,7 +95,6 @@ namespace Grid {
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
#if defined (SSE4)
typedef __m128 fvec;
typedef __m128d dvec;
@ -245,56 +226,12 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
default: assert(0); break;
}
};
};
#include <simd/Grid_vInteger.h>
#include <simd/Grid_vRealF.h>
#include <simd/Grid_vRealD.h>
#include <simd/Grid_vComplexF.h>
#include <simd/Grid_vComplexD.h>
#include <simd/Grid_vector_types.h>
namespace Grid {
// NB: Template the following on "type Complex" and then implement *,+,- for
// ComplexF, ComplexD, RealF, RealD above to
// get full generality of binops with scalars.
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef vRealD vReal;

lib/Makefile.am

@ -93,12 +93,10 @@ nobase_include_HEADERS = algorithms/approx/bigfloat.h \
qcd/Grid_qcd_2spinor.h \
qcd/Grid_qcd_dirac.h \
qcd/Grid_qcd_wilson_dop.h \
simd/Grid_vComplexD.h \
simd/Grid_vComplexF.h \
simd/Grid_vInteger.h \
simd/Grid_vRealD.h \
simd/Grid_vRealF.h \
simd/Grid_vector_types.h \
simd/Grid_sse4.h
simd/Grid_sse4.h \
simd/Grid_avx.h \
simd/Grid_knc.h


@ -154,26 +154,35 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
PARALLEL_NESTED_LOOP2
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
/*
int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
*/
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
}
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
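// Illustrative sketch (not part of the commit): for a plain float array the
// vstream() call above boils down to a non-temporal store -- Grid_avx.h maps
// Vstream to _mm256_stream_ps/_pd and Grid_knc.h to _mm512_storenrngo_ps/_pd --
// which writes the destination without pulling its cache lines in, a good fit
// for a write-only plane copy. Assumes 32-byte aligned pointers and n a
// multiple of 8; stream_copy is a hypothetical helper, not a Grid function.
#include <immintrin.h>
static inline void stream_copy(float *dst, const float *src, int n) {
  for (int i = 0; i < n; i += 8) {               // 8 floats per __m256
    __m256 v = _mm256_load_ps(src + i);          // ordinary cached load
    _mm256_stream_ps(dst + i, v);                // non-temporal store
  }
  _mm_sfence();                                  // order the streamed stores
}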


@ -212,6 +212,16 @@ public:
iMatrix(const Zero &z){ *this = zero; };
iMatrix() =default;
iMatrix& operator=(const iMatrix& rhs){
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
vstream(_internal[i][j],rhs._internal[i][j]);
return *this;
};
iMatrix(scalar_type s) { (*this) = s ;};// recurse down and hit the constructor for vector_type
/*
@ -220,6 +230,9 @@ public:
iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> &copyme) = default;
iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &&copyme) = default;
*/
iMatrix<vtype,N> & operator= (const Zero &hero){
zeroit(*this);
return *this;

lib/simd/Grid_avx.h (new file, 399 lines)

@ -0,0 +1,399 @@
//----------------------------------------------------------------------
/*! @file Grid_avx.h
@brief Optimization libraries for the AVX1/2 instruction sets
Using intrinsics
*/
// Time-stamp: <2015-05-22 18:58:27 neo>
//----------------------------------------------------------------------
#include <immintrin.h>
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
#endif
namespace Optimization {
struct Vsplat{
//Complex float
inline __m256 operator()(float a, float b){
return _mm256_set_ps(b,a,b,a,b,a,b,a);
}
// Real float
inline __m256 operator()(float a){
return _mm256_set_ps(a,a,a,a,a,a,a,a);
}
//Complex double
inline __m256d operator()(double a, double b){
return _mm256_set_pd(b,a,b,a);
}
//Real double
inline __m256d operator()(double a){
return _mm256_set_pd(a,a,a,a);
}
//Integer
inline __m256i operator()(Integer a){
return _mm256_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m256 a, float* F){
_mm256_store_ps(F,a);
}
//Double
inline void operator()(__m256d a, double* D){
_mm256_store_pd(D,a);
}
//Integer
inline void operator()(__m256i a, Integer* I){
_mm256_store_si256((__m256i*)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m256 b){
_mm256_stream_ps(a,b);
}
//Double
inline void operator()(double * a, __m256d b){
_mm256_stream_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m256 operator()(Grid::ComplexF *a){
return _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m256d operator()(Grid::ComplexD *a){
return _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m256 operator()(float *a){
return _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m256d operator()(double *a){
return _mm256_set_pd(a[3],a[2],a[1],a[0]);
}
// Integer
inline __m256i operator()(Integer *a){
return _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_add_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_add_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_add_epi32(a0,b0);
a1 = _mm_add_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_add_epi32(a,b);
#endif
}
};
struct Sub{
//Complex/Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_sub_ps(a,b);
}
//Complex/Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_sub_pd(a,b);
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_sub_epi32(a0,b0);
a1 = _mm_sub_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_sub_epi32(a,b);
#endif
}
};
struct MultComplex{
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
}
// Complex double
inline __m256d operator()(__m256d a, __m256d b){
//Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
//From intel optimisation reference guide
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
VSHUFPD (VEX.256 encoded version)
IF IMM0[0] = 0
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
IF IMM0[1] = 0
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
IF IMM0[2] = 0
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged
*/
__m256d ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm256_shuffle_pd(b,b,0x5); // ymm1 <- br,bi b'01,01
ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_pd(ymm0,ymm1);
}
};
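// Illustrative scalar reference (not part of the commit) for the
// shuffle/mul/addsub sequences above. Per complex lane they compute
//   re = ar*br - ai*bi,   im = ar*bi + ai*br,
// since _mm256_addsub_ps/_pd subtract in the even (real) slots and add in
// the odd (imaginary) slots -- exactly the sign pattern of a complex multiply.
inline void mult_complex_ref(float ar, float ai, float br, float bi,
                             float &re, float &im) {
  re = ar*br - ai*bi;   // even slot of the addsub
  im = ar*bi + ai*br;   // odd  slot of the addsub
}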
struct Mult{
// Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_mul_ps(a,b);
}
// Real double
inline __m256d operator()(__m256d a, __m256d b){
return _mm256_mul_pd(a,b);
}
// Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
b0 = _mm256_extractf128_si256(b,0);
a1 = _mm256_extractf128_si256(a,1);
b1 = _mm256_extractf128_si256(b,1);
a0 = _mm_mul_epi32(a0,b0);
a1 = _mm_mul_epi32(a1,b1);
return _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
return _mm256_mul_epi32(a,b);
#endif
}
};
struct Conj{
// Complex single
inline __m256 operator()(__m256 in){
return _mm256_xor_ps(_mm256_addsub_ps(_mm256_setzero_ps(),in), _mm256_set1_ps(-0.f));
}
// Complex double
inline __m256d operator()(__m256d in){
return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested
/*
// original
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5));
return _mm256_shuffle_pd(tmp,tmp,0x5);
*/
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
return _mm256_shuffle_pd(tmp,tmp,0x5);
}
};
struct TimesI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
__m256d tmp = _mm256_shuffle_pd(in,in,0x5);
return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
}
};
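// Illustrative scalar reference (not part of the commit) for TimesI and
// TimesMinusI above:  i*(re + i*im) = -im + i*re  and  -i*(re + i*im) = im - i*re,
// i.e. one real/imag swap plus one sign flip, which the intrinsics realise
// with a shuffle and an addsub against zero.
inline void times_i_ref      (float re, float im, float &o_re, float &o_im) { o_re = -im; o_im =  re; }
inline void times_minus_i_ref(float re, float im, float &o_re, float &o_im) { o_re =  im; o_im = -re; }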
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {   // result is written into a, so pass it by reference
union {
__m256 f;
vtype v;
} conv;
conv.v = b;
switch (perm){
// 8x32 bits=>3 permutes
case 2: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
default: assert(0); break;
}
a = conv.v;
}
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // sse 128; paired complex single
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1); // avx 256; quad complex single
v1 = _mm256_add_ps(v1,v2);
return Grid::ComplexF(v1[0],v1[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m256>::operator()(__m256 in){
__m256 v1,v2;
Optimization::permute(v1,in,0); // avx 256; octo-double
v1 = _mm256_add_ps(v1,in);
Optimization::permute(v2,v1,1);
v1 = _mm256_add_ps(v1,v2);
Optimization::permute(v2,v1,2);
v1 = _mm256_add_ps(v1,v2);
return v1[0];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m256d>::operator()(__m256d in){
__m256d v1;
Optimization::permute(v1,in,0); // sse 128; paired complex single
v1 = _mm256_add_pd(v1,in);
return Grid::ComplexD(v1[0],v1[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m256d>::operator()(__m256d in){
__m256d v1,v2;
Optimization::permute(v1,in,0); // avx 256; quad double
v1 = _mm256_add_pd(v1,in);
Optimization::permute(v2,v1,1);
v1 = _mm256_add_pd(v1,v2);
return v1[0];
}
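// Illustrative scalar reference (not part of the commit) for the reductions
// above: each permute/add step folds one half of the remaining elements onto
// the other, so eight floats are summed in three steps and four doubles in two.
inline float reduce_ref(const float *v, int n) {   // n = 8 for __m256 (power of two)
  float tmp[8];
  for (int i = 0; i < n; i++) tmp[i] = v[i];
  for (int half = n/2; half >= 1; half /= 2)       // log2(n) fold steps
    for (int i = 0; i < half; i++) tmp[i] += tmp[i + half];
  return tmp[0];
}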
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m256 SIMD_Ftype; // Single precision type
typedef __m256d SIMD_Dtype; // Double precision type
typedef __m256i SIMD_Itype; // Integer type
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

lib/simd/Grid_knc.h (new file, 323 lines)

@ -0,0 +1,323 @@
//----------------------------------------------------------------------
/*! @file Grid_knc.h
@brief Optimization libraries for the AVX512 instruction set for KNC
Using intrinsics
*/
// Time-stamp: <2015-05-22 17:12:44 neo>
//----------------------------------------------------------------------
#include <immintrin.h>
#ifndef KNC_ONLY_STORES
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
#endif
namespace Optimization {
struct Vsplat{
//Complex float
inline __m512 operator()(float a, float b){
return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
}
// Real float
inline __m512 operator()(float a){
return _mm512_set1_ps(a);
}
//Complex double
inline __m512d operator()(double a, double b){
return _mm512_set_pd(b,a,b,a,b,a,b,a);
}
//Real double
inline __m512d operator()(double a){
return _mm512_set1_pd(a);
}
//Integer
inline __m512i operator()(Integer a){
return _mm512_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m512 a, float* F){
_mm512_store_ps(F,a);
}
//Double
inline void operator()(__m512d a, double* D){
_mm512_store_pd(D,a);
}
//Integer
inline void operator()(__m512i a, Integer* I){
_mm512_store_si512((__m512i *)I,a);
}
};
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_storenrngo_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_storenrngo_pd(a,b);
}
};
struct Vset{
// Complex float
inline __m512 operator()(Grid::ComplexF *a){
return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m512d operator()(Grid::ComplexD *a){
return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
}
// Real float
inline __m512 operator()(float *a){
return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Real double
inline __m512d operator()(double *a){
return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
// Integer
inline __m512i operator()(Integer *a){
return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_add_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_add_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_sub_ps(a,b);
}
//Complex/Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_sub_pd(a,b);
}
//Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_sub_epi32(a,b);
}
};
struct MultComplex{
// Complex float
inline __m512 operator()(__m512 a, __m512 b){
__m512 vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); //
real = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
imag = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b);
ymm0 = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_ps(ymm0,imag,ymm1);
}
// Complex double
inline __m512d operator()(__m512d a, __m512d b){
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
__m512d vzero,ymm0,ymm1,real,imag;
vzero =_mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); //
real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
imag = _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b);
ymm0 = _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
return _mm512_fmadd_pd(ymm0,imag,ymm1);
}
};
struct Mult{
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);
}
// Real double
inline __m512d operator()(__m512d a, __m512d b){
return _mm512_mul_pd(a,b);
}
// Integer
inline __m512i operator()(__m512i a, __m512i b){
return _mm512_mullo_epi32(a,b);
}
};
struct Conj{
// Complex single
inline __m512 operator()(__m512 in){
return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag
}
// Complex double
inline __m512d operator()(__m512d in){
return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
}
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
return _mm512_reduce_add_ps(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
return _mm512_reduce_add_pd(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

lib/simd/Grid_qpx.h (new file, 272 lines)

@ -0,0 +1,272 @@
//----------------------------------------------------------------------
/*! @file Grid_qpx.h
@brief Optimization libraries for the QPX instruction set for BG/Q
Using intrinsics
*/
// Time-stamp: <2015-05-22 17:29:26 neo>
//----------------------------------------------------------------------
// lot of undefined functions
namespace Optimization {
struct Vsplat{
//Complex float
inline float operator()(float a, float b){
return {a,b,a,b};
}
// Real float
inline float operator()(float a){
return {a,a,a,a};
}
//Complex double
inline vector4double operator()(double a, double b){
return {a,b,a,b};
}
//Real double
inline vector4double operator()(double a){
return {a,a,a,a};
}
//Integer
inline int operator()(Integer a){
#error
}
};
struct Vstore{
//Float
inline void operator()(float a, float* F){
assert(0);
}
//Double
inline void operator()(vector4double a, double* D){
assert(0);
}
//Integer
inline void operator()(int a, Integer* I){
assert(0);
}
};
struct Vstream{
//Float
inline void operator()(float * a, float b){
assert(0);
}
//Double
inline void operator()(double * a, vector4double b){
assert(0);
}
};
struct Vset{
// Complex float
inline float operator()(Grid::ComplexF *a){
return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
}
// Complex double
inline vector4double operator()(Grid::ComplexD *a){
return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
}
// Real float
inline float operator()(float *a){
return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
}
// Real double
inline vector4double operator()(double *a){
return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
}
// Integer
inline int operator()(Integer *a){
#error
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline float operator()(float a, float b){
#error
}
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
return vec_add(a,b);
}
//Integer
inline int operator()(int a, int b){
#error
}
};
struct Sub{
//Complex/Real float
inline float operator()(float a, float b){
#error
}
//Complex/Real double
inline vector4double operator()(vector4double a, vector4double b){
#error
}
//Integer
inline floati operator()(int a, int b){
#error
}
};
struct MultComplex{
// Complex float
inline float operator()(float a, float b){
#error
}
// Complex double
inline vector4double operator()(vector4double a, vector4double b){
#error
}
};
struct Mult{
// Real float
inline float operator()(float a, float b){
#error
}
// Real double
inline vector4double operator()(vector4double a, vector4double b){
#error
}
// Integer
inline int operator()(int a, int b){
#error
}
};
struct Conj{
// Complex single
inline float operator()(float in){
assert(0);
}
// Complex double
inline vector4double operator()(vector4double in){
assert(0);
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float operator()(float in, float ret){
assert(0);
}
//Complex double
inline vector4double operator()(vector4double in, vector4double ret){
assert(0);
}
};
struct TimesI{
//Complex single
inline float operator()(float in, float ret){
assert(0);
}
//Complex double
inline vector4double operator()(vector4double in, vector4double ret){
assert(0);
}
};
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
assert(0);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
assert(0);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
assert(0);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
assert(0);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, int>::operator()(int in){
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef float SIMD_Ftype __attribute__ ((vector_size (16))); // Single precision type
typedef vector4double SIMD_Dtype; // Double precision type
typedef int SIMD_Itype; // Integer type
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}
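Since most of the double-precision operators in this file are left as #error placeholders, here is an editorial sketch of how they could be completed, assuming the IBM XL QPX built-ins vec_splats, vec_sub and vec_mul are available alongside the vec_add already used in Sum; the qpx_* helper names are hypothetical.

// Editorial sketch only, under the assumption that vec_splats/vec_sub/vec_mul
// exist in the XL QPX built-in set like the vec_add used above.
inline vector4double qpx_splat(double a)                        { return vec_splats(a); } // broadcast a to all four lanes
inline vector4double qpx_sub (vector4double a, vector4double b) { return vec_sub(a,b);  } // lane-wise a - b
inline vector4double qpx_mul (vector4double a, vector4double b) { return vec_mul(a,b);  } // lane-wise a * b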

View File

@ -4,7 +4,7 @@
Using intrinsics
*/
// Time-stamp: <2015-05-20 16:45:39 neo>
// Time-stamp: <2015-05-21 18:06:30 neo>
//----------------------------------------------------------------------
#include <pmmintrin.h>
@ -53,12 +53,12 @@ namespace Optimization {
struct Vstream{
//Float
inline void operator()(__m128 a, __m128 b){
_mm_stream_ps((float *)&a,b);
inline void operator()(float * a, __m128 b){
_mm_stream_ps(a,b);
}
//Double
inline void operator()(__m128d a, __m128d b){
_mm_stream_pd((double *)&a,b);
inline void operator()(double * a, __m128d b){
_mm_stream_pd(a,b);
}

View File

@ -2,13 +2,23 @@
/*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-05-20 17:31:55 neo>
// Time-stamp: <2015-05-26 13:44:54 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES
#ifdef SSE4
#include "Grid_sse4.h"
#endif
#if defined (AVX1)|| defined (AVX2)
#include "Grid_avx.h"
#endif
#if defined AVX512
#include "Grid_knc.h"
#endif
#if defined QPX
#include "Grid_qpx.h"
#endif
namespace Grid {
@ -25,8 +35,6 @@ namespace Grid {
template <typename Condition, typename ReturnType> using EnableIf = Invoke<std::enable_if<Condition::value, ReturnType>>;
template <typename Condition, typename ReturnType> using NotEnableIf= Invoke<std::enable_if<!Condition::value, ReturnType>>;
////////////////////////////////////////////////////////
// Check for complexity with type traits
template <typename T> struct is_complex : std::false_type {};
@ -36,18 +44,71 @@ namespace Grid {
// general forms to allow for vsplat syntax
// need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes
// use decltype?
template < class Out, class Input1, class Input2, class Operation >
Out binary(Input1 src_1, Input2 src_2, Operation op){
return op(src_1, src_2);
}
template < class SIMDout, class Input, class Operation >
SIMDout unary(Input src, Operation op){
template < class Out, class Input, class Operation >
Out unary(Input src, Operation op){
return op(src);
}
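Editorial illustration (not from the patch) of how these wrappers are meant to be called: the output type is spelled out explicitly because the Optimization functors overload operator() on several vector widths, so the compiler cannot always deduce it.

// Editorial sketch only: forwarding through the binary<> wrapper.
template<class V>
inline V example_sum(V a, V b){
return binary<V>(a, b, SumSIMD());   // dispatches to Optimization::Sum::operator()
}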
///////////////////////////////////////////////
//////////////////////////////////////////////////////////
// Permute
// Permute 0 every ABCDEFGH -> BA DC FE HG
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> EFGH ABCD
// Permute 3 possible on longer iVector lengths (512-bit = 8 doubles = 16 singles)
// Permute 4 possible on half-precision 512-bit vectors.
//////////////////////////////////////////////////////////
template<class vsimd>
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
union {
SIMD_Ftype f;
decltype(vsimd::v) v;
} conv;
conv.v = b.v;
switch (perm){
#if defined(AVX1)||defined(AVX2)
// 8x32 bits=>3 permutes
case 2:
conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1));
break;
case 1: conv.f = _mm256_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
case 0: conv.f = _mm256_permute2f128_ps(conv.f,conv.f,0x01); break;
#endif
#ifdef SSE4
case 1: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
case 0: conv.f = _mm_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2));break;
#endif
#ifdef AVX512
// 16 floats=> permutes
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef klij opmn
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
case 3: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
case 2: conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break;
case 1: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0: conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
#endif
#ifdef QPX
#error not implemented
#endif
default: assert(0); break;
}
y.v=conv.v;
};
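Reading the AVX1/AVX2 cases off the intrinsics gives the following lane reordering for a 256-bit vector of eight floats (editorial illustration; the SSE and AVX512 branches follow the same idea at their own widths).

// Editorial illustration only. For conv.f = {A,B,C,D,E,F,G,H}:
//   perm==2 : _mm256_shuffle_ps(.., _MM_SHUFFLE(2,3,0,1)) -> {B,A,D,C, F,E,H,G}
//   perm==1 : _mm256_shuffle_ps(.., _MM_SHUFFLE(1,0,3,2)) -> {C,D,A,B, G,H,E,F}
//   perm==0 : _mm256_permute2f128_ps(.., 0x01)            -> {E,F,G,H, A,B,C,D}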
///////////////////////////////////////
/*
@brief Grid_simd class for the SIMD vector type operations
*/
@ -56,6 +117,9 @@ namespace Grid {
public:
typedef typename RealPart < Scalar_type >::type Real;
typedef Vector_type vector_type;
typedef Scalar_type scalar_type;
Vector_type v;
@ -66,17 +130,21 @@ namespace Grid {
vzero(*this);
return (*this);
}
Grid_simd(){};
Grid_simd& operator=(const Grid_simd&& rhs){v=rhs.v;return *this;};
Grid_simd& operator=(const Grid_simd& rhs){v=rhs.v;return *this;}; //faster than not declaring it and leaving to the compiler
Grid_simd()=default;
Grid_simd(const Grid_simd& rhs):v(rhs.v){}; //compiles in movaps
Grid_simd(const Grid_simd&& rhs):v(rhs.v){};
//Enable if complex type
template < class S = Scalar_type >
Grid_simd(typename std::enable_if< is_complex < S >::value, S>::type a){
Grid_simd(const typename std::enable_if< is_complex < S >::value, S>::type a){
vsplat(*this,a);
};
Grid_simd(Real a){
Grid_simd(const Real a){
vsplat(*this,Scalar_type(a));
};
@ -88,18 +156,25 @@ namespace Grid {
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
//not for integer types... FIXME
friend inline void mac (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd *__restrict__ y,const Scalar_type *__restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
friend inline void mac (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ a,const Scalar_type *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd *__restrict__ y,const Grid_simd *__restrict__ l,const Scalar_type *__restrict__ r){ *y = (*l) + (*r); }
//not for integer types...
template < class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 >
friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// if not complex overload here
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
template < class S = Scalar_type, NotEnableIf<is_complex < S >,int> = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
// For complex types
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); }
@ -108,6 +183,14 @@ namespace Grid {
template < class S = Scalar_type, EnableIf<is_complex < S >, int> = 0 >
friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
// if not complex overload here
template < class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
template < class S = Scalar_type, EnableIf<std::is_floating_point < S >,int> = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
// For integral types
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1); }
@ -116,7 +199,7 @@ namespace Grid {
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
@ -192,8 +275,9 @@ namespace Grid {
///////////////////////
// Vstream
///////////////////////
template < class S = Scalar_type, NotEnableIf<std::is_integral < S >, int> = 0 >
friend inline void vstream(Grid_simd &out,const Grid_simd &in){
binary<void>(out.v, in.v, VstreamSIMD());
binary<void>((Real*)&out.v, in.v, VstreamSIMD());
}
template < class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 >
@ -291,7 +375,7 @@ namespace Grid {
// Unary negation
///////////////////////
friend inline Grid_simd operator -(const Grid_simd &r) {
vComplexF ret;
Grid_simd ret;
vzero(ret);
ret = ret - r;
return ret;
@ -336,7 +420,7 @@ namespace Grid {
}
template<class scalar_type, class vector_type >
inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
inline void zeroit(Grid_simd< scalar_type, vector_type> &z){ vzero(z);}
template<class scalar_type, class vector_type >
@ -354,33 +438,15 @@ namespace Grid {
// Define available types (now change names to avoid clashing with the rest of the code)
typedef Grid_simd< float , SIMD_Ftype > MyRealF;
typedef Grid_simd< double , SIMD_Dtype > MyRealD;
typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;
typedef Grid_simd< float , SIMD_Ftype > vRealF;
typedef Grid_simd< double , SIMD_Dtype > vRealD;
typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
typedef Grid_simd< Integer , SIMD_Itype > vInteger;
////////////////////////////////////////////////////////////////////
// Temporary hack to keep independent from the rest of the code
template<> struct isGridTensor<MyRealD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyRealF > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyComplexD > {
static const bool value = false;
static const bool notvalue = true;
};
template<> struct isGridTensor<MyComplexF > {
static const bool value = false;
static const bool notvalue = true;
};
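As a quick orientation on how the new aliases are meant to be used, here is an editorial sketch (not part of the patch) mirroring what Grid_main.cc exercises further below with MyComplexF/MyComplexD.

// Editorial sketch only.
vComplexF za, zb, zc;
vsplat(za, ComplexF(1.0, 2.0));      // broadcast one complex scalar to every lane
vsplat(zb, ComplexF(3.0,-1.0));
zc = za * zb;                        // operator* dispatches to MultComplexSIMD
ComplexF lanesum = Reduce(zc);       // horizontal reduction back to a scalar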

View File

@ -54,7 +54,7 @@ namespace Grid {
//////////////////////////////////
friend inline void vone(vComplexF &ret) { vsplat(ret,1.0,0.0); }
friend inline void vzero(vComplexF &ret) { vsplat(ret,0.0,0.0); }
friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0);}
friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0); }
////////////////////////////////////
// Arithmetic operator overloads +,-,*

View File

@ -0,0 +1,167 @@
# ============================================================================
# http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
# ============================================================================
#
# SYNOPSIS
#
# AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional])
#
# DESCRIPTION
#
# Check for baseline language coverage in the compiler for the C++11
# standard; if necessary, add switches to CXXFLAGS to enable support.
#
# The first argument, if specified, indicates whether you insist on an
# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
# -std=c++11). If neither is specified, you get whatever works, with
# preference for an extended mode.
#
# The second argument, if specified 'mandatory' or if left unspecified,
# indicates that baseline C++11 support is required and that the macro
# should error out if no mode with that support is found. If specified
# 'optional', then configuration proceeds regardless, after defining
# HAVE_CXX11 if and only if a supporting mode is found.
#
# LICENSE
#
# Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
# Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
# Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov@google.com>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 11
m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[
template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
struct Base {
virtual void f() {}
};
struct Child : public Base {
virtual void f() override {}
};
typedef check<check<bool>> right_angle_brackets;
int a;
decltype(a) b;
typedef check<int> check_type;
check_type c;
check_type&& cr = static_cast<check_type&&>(c);
auto d = a;
auto l = [](){};
// Prevent Clang error: unused variable 'l' [-Werror,-Wunused-variable]
struct use_l { use_l() { l(); } };
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function because of this
namespace test_template_alias_sfinae {
struct foo {};
template<typename T>
using member = typename T::member_type;
template<typename T>
void func(...) {}
template<typename T>
void func(member<T>*) {}
void test();
void test() {
func<foo>(0);
}
}
]])
AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
m4_if([$1], [], [],
[$1], [ext], [],
[$1], [noext], [],
[m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
[$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
[$2], [optional], [ax_cxx_compile_cxx11_required=false],
[m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])
AC_LANG_PUSH([C++])dnl
ac_success=no
AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
ax_cv_cxx_compile_cxx11,
[AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
[ax_cv_cxx_compile_cxx11=yes],
[ax_cv_cxx_compile_cxx11=no])])
if test x$ax_cv_cxx_compile_cxx11 = xyes; then
ac_success=yes
fi
m4_if([$1], [noext], [], [dnl
if test x$ac_success = xno; then
for switch in -std=gnu++11 -std=gnu++0x; do
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
$cachevar,
[ac_save_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $switch"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
[eval $cachevar=yes],
[eval $cachevar=no])
CXXFLAGS="$ac_save_CXXFLAGS"])
if eval test x\$$cachevar = xyes; then
CXXFLAGS="$CXXFLAGS $switch"
ac_success=yes
break
fi
done
fi])
m4_if([$1], [ext], [], [dnl
if test x$ac_success = xno; then
dnl HP's aCC needs +std=c++11 according to:
dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
for switch in -std=c++11 -std=c++0x +std=c++11; do
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
$cachevar,
[ac_save_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $switch"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
[eval $cachevar=yes],
[eval $cachevar=no])
CXXFLAGS="$ac_save_CXXFLAGS"])
if eval test x\$$cachevar = xyes; then
CXXFLAGS="$CXXFLAGS $switch"
ac_success=yes
break
fi
done
fi])
AC_LANG_POP([C++])
if test x$ax_cxx_compile_cxx11_required = xtrue; then
if test x$ac_success = xno; then
AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.])
fi
else
if test x$ac_success = xno; then
HAVE_CXX11=0
AC_MSG_NOTICE([No compiler with C++11 support was found])
else
HAVE_CXX11=1
AC_DEFINE(HAVE_CXX11,1,
[define if the compiler supports basic C++11 syntax])
fi
AC_SUBST(HAVE_CXX11)
fi
])

View File

@ -104,6 +104,9 @@ int main (int argc, char ** argv)
random(FineRNG,scVec);
fflush(stdout);
cVec = cMat * cVec; // LatticeColourVector = LatticeColourMatrix * LatticeColourVector
sVec = sMat * sVec; // LatticeSpinVector = LatticeSpinMatrix * LatticeSpinVector
scVec= scMat * scVec;// LatticeSpinColourVector = LatticeSpinColourMatrix * LatticeSpinColourVector
@ -113,12 +116,14 @@ int main (int argc, char ** argv)
cMat = outerProduct(cVec,cVec);
scalar = localInnerProduct(cVec,cVec);
scalar += scalar;
scalar -= scalar;
scalar *= scalar;
add(scalar,scalar,scalar);
sub(scalar,scalar,scalar);
mult(scalar,scalar,scalar);
mac(scalar,scalar,scalar);
scalar = scalar+scalar;
scalar = scalar-scalar;
@ -154,51 +159,11 @@ int main (int argc, char ** argv)
// localNorm2
// localInnerProduct
scMat = sMat*scMat; // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix
#ifdef SSE4
///////// Tests the new class Grid_simd
std::complex<double> ctest(3.0,2.0);
std::complex<float> ctestf(3.0,2.0);
MyComplexF TestMe1(1.0); // fills only real part
MyComplexD TestMe2(ctest);
MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
//MyRealF TestMe5(ctest);// Must generate compiler error
MyRealD TestRe1(2.0);
MyRealF TestRe2(3.0);
vone(TestRe2);
MyComplexF TestMe6(ctestf);
MyComplexF TestMe7(ctestf);
MyComplexD TheSum= TestMe2*TestMe3;
MyComplexF TheSumF= TestMe6*TestMe7;
double dsum[2];
_mm_store_pd(dsum, TheSum.v);
for (int i =0; i< 2; i++)
printf("%f\n", dsum[i]);
MyComplexD TheSumI = timesMinusI(TheSum);
MyComplexF TheSumIF = timesMinusI(TheSumF);
float fsum[4];
_mm_store_ps(fsum, TheSumF.v);
for (int i =0; i< 4; i++)
printf("%f\n", fsum[i]);
vstore(TheSumI, &ctest);
std::complex<float> sum = Reduce(TheSumF);
std::cout << ctest<< std::endl;
std::cout << sum<< std::endl;
#endif
///////////////////////
// Non-lattice (const objects) * Lattice
ColourMatrix cm;
SpinColourMatrix scm;
@ -241,6 +206,9 @@ int main (int argc, char ** argv)
scm=transpose(scm);
scm=transposeIndex<1>(scm);
// Foo = Foo+scalar; // LatticeColourMatrix+Scalar
// Foo = Foo*scalar; // LatticeColourMatrix*Scalar
// Foo = Foo-scalar; // LatticeColourMatrix-Scalar
@ -280,7 +248,6 @@ int main (int argc, char ** argv)
pokeIndex<1> (c_m,c,0,0);
}
FooBar = Bar;
/*
@ -340,7 +307,7 @@ int main (int argc, char ** argv)
// Benchmark some simple operations LatticeSU3 * Lattice SU3.
double t0,t1,flops;
double bytes;
int ncall=100;
int ncall=5000;
int Nc = Grid::QCD::Nc;
LatticeGaugeField U(&Fine);
@ -352,19 +319,21 @@ int main (int argc, char ** argv)
if ( Fine.IsBoss() ) {
printf("%f flop and %f bytes\n",flops,bytes/ncall);
}
FooBar = Foo * Bar;
FooBar = Foo * Bar;
Fine.Barrier();
t0=usecond();
for(int i=0;i<ncall;i++){
Fine.Barrier();
mult(FooBar,Foo,Bar); // this is better
}
t1=usecond();
Fine.Barrier();
if ( Fine.IsBoss() ) {
#ifdef OMP
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
#endif
printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
}
@ -526,5 +495,9 @@ int main (int argc, char ** argv)
} // loop for lat
} // loop for omp
std::cout << sizeof(vComplexF) << std::endl;
Grid_finalize();
}

0
tests/InvSqrt.gnu Normal file
View File

View File

@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
#
# Test code
#
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed Grid_simd_new
bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed
Grid_main_SOURCES = Grid_main.cc
Grid_main_LDADD = -lGrid
@ -34,5 +34,5 @@ Grid_stencil_LDADD = -lGrid
Grid_simd_SOURCES = Grid_simd.cc
Grid_simd_LDADD = -lGrid
Grid_simd_new_SOURCES = Grid_simd_new.cc
Grid_simd_new_LDADD = -lGrid
#Grid_simd_new_SOURCES = Grid_simd_new.cc
#Grid_simd_new_LDADD = -lGrid

2
tests/Sqrt.gnu Normal file
View File

@ -0,0 +1,2 @@
f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));
f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));
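Both definitions share the partial-fraction form consistent with output from the Remez tooling exercised elsewhere in the tests (note that in gnuplot the second definition of f(x) simply replaces the first). Written generically, with ten pole terms each:

f(x) = c_0 + \sum_{k=1}^{10} \frac{a_k}{x + b_k}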