Got unpreconditioned conjugate gradient to run and converge on a random (uniform random,

not even SU(3) for now) gauge field. Convergence history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks. Found a couple of simd bugs which required fixed and enhanced the Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur type stuff in the wilson dop.
2026-02-14 19:00:53 +00:00 · 2015-05-19 13:57:35 +01:00
parent 05d862782f
commit 4dba8522a1
33 changed files with 566 additions and 316 deletions
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# Makefile.in generated by automake 1.15 from Makefile.am.
 # @configure_input@

-# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.

 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -14,7 +14,17 @@

@SET_MAKE@
 VPATH = @srcdir@
-am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
 am__make_running_with_option = \
  case $${target_option-} in \
      ?) ;; \
@@ -79,14 +89,12 @@ build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 subdir = .
-DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
-	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
-	$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
-	compile config.guess config.sub depcomp install-sh missing
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
+	$(am__configure_deps) $(am__DIST_COMMON)
 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
 configure.lineno config.status.lineno
 mkinstalldirs = $(install_sh) -d
@@ -149,6 +157,9 @@ ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
 DIST_SUBDIRS = $(SUBDIRS)
+am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
+	INSTALL NEWS README TODO compile config.guess config.sub \
+	depcomp install-sh missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -314,7 +325,6 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
 	$(am__cd) $(top_srcdir) && \
 	  $(AUTOMAKE) --gnu Makefile
-.PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \
@@ -521,15 +531,15 @@ dist-xz: distdir
 	$(am__post_remove_distdir)

 dist-tarZ: distdir
-	@echo WARNING: "Support for shar distribution archives is" \
-	               "deprecated." >&2
+	@echo WARNING: "Support for distribution archives compressed with" \
+		       "legacy program 'compress' is deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
 	$(am__post_remove_distdir)

 dist-shar: distdir
-	@echo WARNING: "Support for distribution archives compressed with" \
-		       "legacy program 'compress' is deprecated." >&2
+	@echo WARNING: "Support for shar distribution archives is" \
+	               "deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
 	$(am__post_remove_distdir)
@@ -565,17 +575,17 @@ distcheck: dist
 	esac
 	chmod -R a-w $(distdir)
 	chmod u+w $(distdir)
-	mkdir $(distdir)/_build $(distdir)/_inst
+	mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
 	chmod a-w $(distdir)
 	test -d $(distdir)/_build || exit 0; \
 	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
 	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
 	  && am__cwd=`pwd` \
-	  && $(am__cd) $(distdir)/_build \
-	  && ../configure \
+	  && $(am__cd) $(distdir)/_build/sub \
+	  && ../../configure \
 	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
 	    $(DISTCHECK_CONFIGURE_FLAGS) \
-	    --srcdir=.. --prefix="$$dc_install_base" \
+	    --srcdir=../.. --prefix="$$dc_install_base" \
 	  && $(MAKE) $(AM_MAKEFLAGS) \
 	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
 	  && $(MAKE) $(AM_MAKEFLAGS) check \
@@ -749,6 +759,8 @@ uninstall-am:
 	maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
 	pdf-am ps ps-am tags tags-am uninstall uninstall-am

+.PRECIOUS: Makefile
+

 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.15 -*- Autoconf -*-

-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.

 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])

-# Copyright (C) 2002-2013 Free Software Foundation, Inc.
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
 # generated from the m4 files accompanying Automake X.Y.
 # (This private macro should not be called outside this file.)
 AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.14'
+[am__api_version='1.15'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.14.1], [],
+m4_if([$1], [1.15], [],
      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])

@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.14.1])dnl
+[AM_AUTOMAKE_VERSION([1.15])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])

 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 # configured tree to be moved without reconfiguration.

 AC_DEFUN([AM_AUX_DIR_EXPAND],
-[dnl Rely on autoconf to set up CDPATH properly.
-AC_PREREQ([2.50])dnl
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 ])

 # AM_CONDITIONAL                                            -*- Autoconf -*-

-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])

-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl

 # Generate code to set up dependency tracking.              -*- Autoconf -*-

-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],

 # Do all the work for Automake.                             -*- Autoconf -*-

-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 AC_REQUIRE([AC_PROG_AWK])dnl
 AC_REQUIRE([AC_PROG_MAKE_SET])dnl
 AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -573,7 +572,11 @@ to "yes", and re-run configure.
 END
    AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
  fi
-fi])
+fi
+dnl The trailing newline in this macro's definition is deliberate, for
+dnl backward compatibility and to allow trailing 'dnl'-style comments
+dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
+])

 dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@@ -602,7 +605,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
 # Define $install_sh.
 AC_DEFUN([AM_PROG_INSTALL_SH],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
  case $am_aux_dir in
  *\ * | *\	*)
    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then
 fi
 AC_SUBST([install_sh])])

-# Copyright (C) 2003-2013 Free Software Foundation, Inc.
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])])

 # Check to see how 'make' treats includes.	            -*- Autoconf -*-

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -694,7 +697,7 @@ rm -f confinc confmf

 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-

-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -733,7 +736,7 @@ fi

 # Helper functions for option handling.                     -*- Autoconf -*-

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -764,7 +767,7 @@ AC_DEFUN([_AM_IF_OPTION],

 # Check to make sure that the build environment is sane.    -*- Autoconf -*-

-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -845,7 +848,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])

-# Copyright (C) 2009-2013 Free Software Foundation, Inc.
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -905,7 +908,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -933,7 +936,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])

-# Copyright (C) 2006-2013 Free Software Foundation, Inc.
+# Copyright (C) 2006-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -952,7 +955,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])

 # Check how to create a tarball.                            -*- Autoconf -*-

-# Copyright (C) 2004-2013 Free Software Foundation, Inc.
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
--- a/benchmarks/Grid_wilson.cc
+++ b/benchmarks/Grid_wilson.cc
@@ -46,6 +46,13 @@ int main (int argc, char ** argv)
    volume=volume*latt_size[mu];
  }  

+  // Only one non-zero (y)
+  Umu=zero;
+  for(int nn=0;nn<Nd;nn++){
+    random(pRNG,U[nn]);
+    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
+  }
+
  for(int mu=0;mu<Nd;mu++){
    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
  }
@@ -53,7 +60,6 @@ int main (int argc, char ** argv)
  { // Naive wilson implementation
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
-
      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
@@ -75,7 +81,7 @@ int main (int argc, char ** argv)
  int ncall=1000;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
-    Dw.M(src,result);
+    Dw.Dhop(src,result,0);
  }
  double t1=usecond();
  double flops=1320*volume*ncall;
@@ -99,5 +105,30 @@ int main (int argc, char ** argv)
    }
  }

+  { // Naive wilson dag implementation
+    ref = zero;
+    for(int mu=0;mu<Nd;mu++){
+
+      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
+      tmp = U[mu]*Cshift(src,mu,1);
+      for(int i=0;i<ref._odata.size();i++){
+	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+      }
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu,-1);
+      for(int i=0;i<ref._odata.size();i++){
+	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+      }
+    }
+  }
+  Dw.Dhop(src,result,1);
+  std::cout << "Called DwDag"<<std::endl;
+  std::cout << "norm result "<< norm2(result)<<std::endl;
+  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  err = ref -result;
+  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+
+
  Grid_finalize();
 }
--- a/benchmarks/Grid_wilson_cg_unprec.cc
+++ b/benchmarks/Grid_wilson_cg_unprec.cc
@@ -0,0 +1,56 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeFermion src(&Grid); random(pRNG,src);
+  std::cout << "src norm" << norm2(src)<<std::endl;
+  LatticeFermion result(&Grid); result=zero;
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,&Grid);
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.5;
+  WilsonMatrix Dw(Umu,mass);
+  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  CG(HermOp,src,result);
+
+  
+  Grid_finalize();
+}
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -5,12 +5,14 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
-bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3
-
+bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec

 Grid_wilson_SOURCES = Grid_wilson.cc
 Grid_wilson_LDADD = -lGrid

+Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc
+Grid_wilson_cg_unprec_LDADD = -lGrid
+
 Grid_comms_SOURCES = Grid_comms.cc
 Grid_comms_LDADD = -lGrid

--- a/config.guess
+++ b/config.guess
@@ -1 +1 @@
-/usr/share/automake-1.14/config.guess
+/opt/local/share/automake-1.15/config.guess
--- a/config.sub
+++ b/config.sub
@@ -1 +1 @@
-/usr/share/automake-1.14/config.sub
+/opt/local/share/automake-1.15/config.sub
--- a/13
+++ b/13
@@ -2466,7 +2466,7 @@ test -n "$target_alias" &&
    NONENONEs,x,x, &&
  program_prefix=${target_alias}-

-am__api_version='1.14'
+am__api_version='1.15'

 # Find a good install program.  We prefer a C program (faster),
 # so one script is as good as another.  But avoid the broken or
@@ -2638,8 +2638,8 @@ test "$program_suffix" != NONE &&
 ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
 program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`

-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`

 if test x"${MISSING+set}" != xset; then
  case $am_aux_dir in
@@ -2658,7 +2658,7 @@ else
 $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
 fi

-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
  case $am_aux_dir in
  *\ * | *\	*)
    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -2986,8 +2986,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 mkdir_p='$(MKDIR_P)'

-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 # Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AMTAR='$${TAR-tar}'
@@ -3046,6 +3046,7 @@ END
 fi


+
 ac_config_headers="$ac_config_headers lib/Grid_config.h"

 # Check whether --enable-silent-rules was given.
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -50,15 +50,20 @@ namespace Grid {
  typedef std::complex<Real>  Complex;

  inline RealF adj(const RealF  & r){ return r; }
-  inline RealF conj(const RealF  & r){ return r; }
+  inline RealF conjugate(const RealF  & r){ return r; }
  inline RealF real(const RealF  & r){ return r; }

  inline RealD adj(const RealD  & r){ return r; }
-  inline RealD conj(const RealD  & r){ return r; }
+  inline RealD conjugate(const RealD  & r){ return r; }
  inline RealD real(const RealD  & r){ return r; }

-  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
-  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
+  inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
+  inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
+  inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
+  inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
+
+  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
+  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
  inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }

@@ -70,15 +75,14 @@ namespace Grid {
    inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
    inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
    inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
-    inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
-    // conj already supported for complex
+    // conjugate already supported for complex
    
    inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
    inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
    inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
    inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
-    inline ComplexF  adj(const ComplexF& r ){ return(conj(r)); }
-    //conj already supported for complex
+
+    //conjugate already supported for complex

    inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
    inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -7,8 +7,8 @@ namespace Grid {
  // LinearOperators Take a something and return a something.
  /////////////////////////////////////////////////////////////////////////////////////////////
  //
-  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
-  //
+  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
+  //SBase
  //   i)  F(a x + b y) = aF(x) + b F(y).
  //  ii)  <x|Op|y> = <y|AdjOp|x>^\ast
  //
@@ -25,12 +25,13 @@ namespace Grid {
  /////////////////////////////////////////////////////////////////////////////////////////////
    template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
    public:
-      virtual RealD OpAndNorm(const Field &in, Field &out);
+      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
      void AdjOp(const Field &in, Field &out) {
 	Op(in,out);
      };
      void Op(const Field &in, Field &out) {
-	OpAndNorm(in,out);
+	double n1,n2;
+	OpAndNorm(in,out,n1,n2);
      };
    };

@@ -80,8 +81,8 @@ namespace Grid {
      Matrix &_Mat;
    public:
      HermitianOperator(Matrix &Mat): _Mat(Mat) {};
-      RealD OpAndNorm(const Field &in, Field &out){
-	return _Mat.MdagM(in,out);
+      void OpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+	return _Mat.MdagM(in,out,n1,n2);
      }
    };

--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -18,6 +18,7 @@ public:
      std::cout << Tolerance<<std::endl;
    };

+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
    void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){

      RealD cp,c,a,d,b,ssq,qq,b_pred;
@@ -61,21 +62,27 @@ public:
 	
 	Linop.OpAndNorm(p,mmp,d,qq);
 	  
+	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<std::endl;
+      
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;
 	
-	cp = axpy_norm(r,mmp,r,-a);
+	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
+
+	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
+	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,b "<<cp<< " "<<b <<std::endl;
 	
 	// Fuse these loops ; should be really easy
 	psi= a*p+psi;
 	p  = p*b+r;
 	  
-	std::cout << "Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
 	// Stopping condition
 	if ( cp <= rsq ) { 
 	  
-	  Linop.Op(p,mmp);
+	  Linop.Op(psi,mmp);
 	  p=mmp-src;
 	  
 	  RealD mmpnorm = sqrt(norm2(mmp));
@@ -83,8 +90,11 @@ public:
 	  RealD srcnorm = sqrt(norm2(src));
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;
+
+	  std::cout<<"ConjugateGradient: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	  std::cout<<"ConjugateGradient: true   residual  is "<<true_residual<<" sol "<<psinorm<<" src "<<srcnorm<<std::endl;
 	  std::cout<<"ConjugateGradient: target residual was "<<Tolerance<<std::endl;
+	  return;
 	}
      }
      std::cout<<"ConjugateGradient did NOT converge"<<std::endl;
--- a/lib/lattice/Grid_lattice_ET.h
+++ b/lib/lattice/Grid_lattice_ET.h
@@ -102,7 +102,7 @@ template <class arg> struct name\

 GridUnopClass(UnarySub,-a);
 GridUnopClass(UnaryAdj,adj(a));
-GridUnopClass(UnaryConj,conj(a));
+GridUnopClass(UnaryConj,conjugate(a));
 GridUnopClass(UnaryTrace,trace(a));
 GridUnopClass(UnaryTranspose,transpose(a));

@@ -178,7 +178,7 @@ template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,con

 GRID_DEF_UNOP(operator -,UnarySub);
 GRID_DEF_UNOP(adj,UnaryAdj);
-GRID_DEF_UNOP(conj,UnaryConj);
+GRID_DEF_UNOP(conjugate,UnaryConj);
 GRID_DEF_UNOP(trace,UnaryTrace);
 GRID_DEF_UNOP(transpose,UnaryTranspose);

--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -144,14 +144,44 @@ PARALLEL_FOR_LOOP
  }
  
  template<class sobj,class vobj> strong_inline
-  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-    conformable(lhs,rhs);
+  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
 #pragma omp parallel for
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      vobj tmp = a*lhs._odata[ss];
-      vstream(ret._odata[ss],tmp+rhs._odata[ss]);
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+y._odata[ss];
+      vstream(ret._odata[ss],tmp);
    }
  }
+  template<class sobj,class vobj> strong_inline
+  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
+#pragma omp parallel for
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+      vstream(ret._odata[ss],tmp);
+    }
+  }
+
+  template<class sobj,class vobj> strong_inline
+  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
+#pragma omp parallel for
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+y._odata[ss];
+      vstream(ret._odata[ss],tmp);
+    }
+    return norm2(ret);
+  }
+  template<class sobj,class vobj> strong_inline
+  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
+#pragma omp parallel for
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+      vstream(ret._odata[ss],tmp);
+    }
+    return norm2(ret); // FIXME implement parallel norm in ss loop
+  }

 }
 #endif
--- a/lib/lattice/Grid_lattice_base.h
+++ b/lib/lattice/Grid_lattice_base.h
@@ -9,7 +9,7 @@ namespace Grid {
 // Functionality:
 //     -=,+=,*=,()
 //     add,+,sub,-,mult,mac,*
-//     adj,conj
+//     adj,conjugate
 //     real,imag
 //     transpose,transposeIndex  
 //     trace,traceIndex
--- a/lib/lattice/Grid_lattice_reality.h
+++ b/lib/lattice/Grid_lattice_reality.h
@@ -18,11 +18,11 @@ PARALLEL_FOR_LOOP
        return ret;
    };

-    template<class vobj> inline Lattice<vobj> conj(const Lattice<vobj> &lhs){
+    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
        Lattice<vobj> ret(lhs._grid);
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = conj(lhs._odata[ss]);
+            ret._odata[ss] = conjugate(lhs._odata[ss]);
        }
        return ret;
    };
--- a/lib/math/Grid_math_reality.h
+++ b/lib/math/Grid_math_reality.h
@@ -98,26 +98,26 @@ template<class vtype,int N> inline void  timesMinusI(iMatrix<vtype,N> &ret,const
 /////////////////////////////////////////////// 
 // Conj function for scalar, vector, matrix
 /////////////////////////////////////////////// 
-template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
+template<class vtype> inline iScalar<vtype> conjugate(const iScalar<vtype>&r)
 {
    iScalar<vtype> ret;
-    ret._internal = conj(r._internal);
+    ret._internal = conjugate(r._internal);
    return ret;
 }
-template<class vtype,int N> inline iVector<vtype,N> conj(const iVector<vtype,N>&r)
+template<class vtype,int N> inline iVector<vtype,N> conjugate(const iVector<vtype,N>&r)
 {
  iVector<vtype,N> ret;
  for(int i=0;i<N;i++){
-    ret._internal[i] = conj(r._internal[i]);
+    ret._internal[i] = conjugate(r._internal[i]);
  }
  return ret;
 }
-template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&r)
+template<class vtype,int N> inline iMatrix<vtype,N> conjugate(const iMatrix<vtype,N>&r)
 {
  iMatrix<vtype,N> ret;
  for(int i=0;i<N;i++){
  for(int j=0;j<N;j++){
-    ret._internal[i][j] = conj(r._internal[i][j]);
+    ret._internal[i][j] = conjugate(r._internal[i][j]);
  }}
  return ret;
 }
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -21,7 +21,13 @@ const int WilsonMatrix::Tm = 7;
  class WilsonCompressor {
  public:
    int mu;
-    
+    int dag;
+
+    WilsonCompressor(int _dag){
+      mu=0;
+      dag=_dag;
+      assert((dag==0)||(dag==1));
+    }
    void Point(int p) { 
      mu=p;
    };
@@ -29,7 +35,11 @@ const int WilsonMatrix::Tm = 7;
    vHalfSpinColourVector operator () (const vSpinColourVector &in)
    {
      vHalfSpinColourVector ret;
-      switch(mu) {
+      int mudag=mu;
+      if (dag) {
+	mudag=(mu+Nd)%(2*Nd);
+      }
+      switch(mudag) {
      case WilsonMatrix::Xp:
 	spProjXp(ret,in);
 	break;
@@ -87,43 +97,52 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF

 RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
-  return 0.0;
+  Dhop(in,out,0);
+  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
+  return norm2(out);
 }
 RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
-  return 0.0;
+  Dhop(in,out,1);
+  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
+  return norm2(out);
 }

 void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
+  Dhop(in,out,0);
+  out = 0.5*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
+  Dhop(in,out,1);
 }
 void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (4.0+mass)*in;
  return ;
 }
 void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (1.0/(4.0+mass))*in;
  return ;
 }
 void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (1.0/(4.0+mass))*in;
  return ;
 }
 void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (1.0/(4.0+mass))*in;
  return ;
 }

-void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
+void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
 {
-  WilsonCompressor compressor;
+  assert((dag==0) ||(dag==1));
+
+  WilsonCompressor compressor(dag);
  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);

 PARALLEL_FOR_LOOP
@@ -140,13 +159,13 @@ PARALLEL_FOR_LOOP
    int ssu= ss;

    // Xp
-    offset = Stencil._offsets [Xp][ss];
-    local  = Stencil._is_local[Xp][ss];
-    perm   = Stencil._permute[Xp][ss];
-    ptype  = Stencil._permute_type[Xp];
+    int idx = (Xp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];

-    if ( local && perm ) 
-    {
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
      spProjXp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
@@ -154,18 +173,17 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
-    //prefetch(Umu._odata[ssu](Yp));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    spReconXp(result,Uchi);

    // Yp
-    offset = Stencil._offsets [Yp][ss];
-    local  = Stencil._is_local[Yp][ss];
-    perm   = Stencil._permute[Yp][ss];
-    ptype  = Stencil._permute_type[Yp];
+    idx = (Yp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];

-    if ( local && perm ) 
-    {
+    if ( local && perm ) {
      spProjYp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
@@ -173,18 +191,16 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
-    //    prefetch(Umu._odata[ssu](Zp));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    accumReconYp(result,Uchi);

    // Zp
-    offset = Stencil._offsets [Zp][ss];
-    local  = Stencil._is_local[Zp][ss];
-    perm   = Stencil._permute[Zp][ss];
-    ptype  = Stencil._permute_type[Zp];
-
-    if ( local && perm ) 
-    {
+    idx = (Zp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
      spProjZp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
@@ -192,18 +208,16 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
-    //    prefetch(Umu._odata[ssu](Tp));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    accumReconZp(result,Uchi);

    // Tp
-    offset = Stencil._offsets [Tp][ss];
-    local  = Stencil._is_local[Tp][ss];
-    perm   = Stencil._permute[Tp][ss];
-    ptype  = Stencil._permute_type[Tp];
-
-    if ( local && perm ) 
-    {
+    idx = (Tp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
      spProjTp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
@@ -211,15 +225,15 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
-    //    prefetch(Umu._odata[ssu](Xm));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    accumReconTp(result,Uchi);

    // Xm
-    offset = Stencil._offsets [Xm][ss];
-    local  = Stencil._is_local[Xm][ss];
-    perm   = Stencil._permute[Xm][ss];
-    ptype  = Stencil._permute_type[Xm];
+    idx = (Xm+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];

    if ( local && perm ) 
    {
@@ -230,18 +244,18 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    accumReconXm(result,Uchi);


    // Ym
-    offset = Stencil._offsets [Ym][ss];
-    local  = Stencil._is_local[Ym][ss];
-    perm   = Stencil._permute[Ym][ss];
-    ptype  = Stencil._permute_type[Ym];
+    idx = (Ym+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];

-    if ( local && perm ) 
-    {
+    if ( local && perm ) {
      spProjYm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
@@ -249,17 +263,16 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    accumReconYm(result,Uchi);

    // Zm
-    offset = Stencil._offsets [Zm][ss];
-    local  = Stencil._is_local[Zm][ss];
-    perm   = Stencil._permute[Zm][ss];
-    ptype  = Stencil._permute_type[Zm];
-
-    if ( local && perm ) 
-    {
+    idx = (Zm+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
      spProjZm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
@@ -267,17 +280,16 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    accumReconZm(result,Uchi);

    // Tm
-    offset = Stencil._offsets [Tm][ss];
-    local  = Stencil._is_local[Tm][ss];
-    perm   = Stencil._permute[Tm][ss];
-    ptype  = Stencil._permute_type[Tm];
-
-    if ( local && perm ) 
-    {
+    idx = (Tm+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
      spProjTm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
    } else if ( local ) {
@@ -285,7 +297,7 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
    accumReconTm(result,Uchi);

    vstream(out._odata[ss],result);
@@ -294,10 +306,6 @@ PARALLEL_FOR_LOOP
 }


-void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out)
-{
-  return;
-}



--- a/lib/qcd/Grid_qcd_wilson_dop.h
+++ b/lib/qcd/Grid_qcd_wilson_dop.h
@@ -45,10 +45,7 @@ namespace Grid {
      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);

      // non-hermitian hopping term; half cb or both
-      void Dhop(const LatticeFermion &in, LatticeFermion &out);
-
-      // m+4r -1/2 Dhop; both cb's
-      void Dw(const LatticeFermion &in, LatticeFermion &out);
+      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);

      typedef iScalar<iMatrix<vComplex, Nc> > matrix;

--- a/lib/simd/Grid_vComplexD.h
+++ b/lib/simd/Grid_vComplexD.h
@@ -32,7 +32,7 @@ namespace Grid {
        friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
        friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
        friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
-        friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
+        friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); }

        //////////////////////////////////
        // Initialise to 1,0,i
@@ -166,11 +166,11 @@ namespace Grid {
 	// all subtypes; may not be a good assumption, but could
 	// add the vector width as a template param for BG/Q for example
 	////////////////////////////////////////////////////////////////////
-	/*
 	friend inline void permute(vComplexD &y,vComplexD b,int perm)
 	{
 	  Gpermute<vComplexD>(y,b,perm);
 	}
+	/*
 	friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
 	{
 	  Gmerge<vComplexD,ComplexD >(y,extracted);
@@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
        ////////////////////////
        // Conjugate
        ////////////////////////
-        friend inline vComplexD conj(const vComplexD &in){
+        friend inline vComplexD conjugate(const vComplexD &in){
            vComplexD ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
 	    //	    addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
@@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 // REDUCE FIXME must be a cleaner implementation
       friend inline ComplexD Reduce(const vComplexD & in)
       { 
-#if defined SSE4
-         return ComplexD(in.v[0], in.v[1]); // inefficient
+#ifdef SSE4
+	 return ComplexD(in.v[0],in.v[1]);
+#endif
+#if defined(AVX1) || defined (AVX2)
+	 vComplexD v1;
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 return ComplexD(v1.v[0],v1.v[1]);
 #endif
-#if defined (AVX1) || defined(AVX2)
-	 //            return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
-	 __attribute__ ((aligned(32))) double c_[4];
-         _mm256_store_pd(c_,in.v);
-	 return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
-#endif 
 #ifdef AVX512
-            return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
+	 return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
 #endif 
 #ifdef QPX
 #endif
@@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
    };


-    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
+    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; }


    typedef  vComplexD vDComplex;
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Grid_vComplexF.h
@@ -47,7 +47,7 @@ namespace Grid {
        friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
        friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
        friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
-        friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
+        friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); }
        
        //////////////////////////////////
        // Initialise to 1,0,i
@@ -228,42 +228,25 @@ namespace Grid {
            ret.v = {a,b,a,b};
 #endif
        }
-       friend inline ComplexF Reduce(const vComplexF & in)
-       {
+	friend inline void permute(vComplexF &y,vComplexF b,int perm)
+	{
+	  Gpermute<vComplexF>(y,b,perm);
+	}
+	friend inline ComplexF Reduce(const vComplexF & in)
+	{
 #ifdef SSE4
-	   union {
-	     cvec v1;    // SSE 4 x float vector
-	     float f[4];  // scalar array of 4 floats
-	   } u128;
-	   u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
-	   return ComplexF(u128.f[0], u128.f[1]);
+	 vComplexF v1;
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
-#ifdef AVX1
-	   //it would be better passing 2 arguments to saturate the vector lanes
-	   union {
-	     __m256 v1;    
-	     float f[8];  
-	   } u256;
-	   //SWAP lanes
-	   // FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large
-	   __m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1);
-	   __m256 t1 = _mm256_permute_ps(in.v  , 0b11011000);//real (0,2,1,3)
-	   __m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2)
-	   t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1)
-	   t1 = _mm256_hadd_ps(t0,t0);
-	   u256.v1 = _mm256_hadd_ps(t1, t1);
-	   return ComplexF(u256.f[0], u256.f[4]);
-#endif
-#ifdef AVX2
-	   union {
-	     __m256 v1;    
-	     float f[8];  
-	   } u256;
-	   const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0);
-	   __m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask);
-	   __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
-	   u256.v1 = _mm256_hadd_ps(tmp2, tmp2);
-	   return ComplexF(u256.f[0], u256.f[4]);
+#if defined(AVX1) || defined (AVX2)
+	 vComplexF v1,v2;
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 permute(v2,v1,1); // avx 256; quad complex single
+	 v1=v1+v2;
+	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
 #ifdef AVX512
            return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
@@ -345,13 +328,10 @@ namespace Grid {
        // Conjugate
        ///////////////////////
 								     
-        friend inline vComplexF conj(const vComplexF &in){
+        friend inline vComplexF conjugate(const vComplexF &in){
            vComplexF ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
-	    cvec tmp;
-	    tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
-	    ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
-
+	    ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f));
 #endif
 #ifdef SSE4
 	    ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f));
@@ -433,10 +413,6 @@ namespace Grid {
            return *this;
        }

-      friend inline void permute(vComplexF &y,vComplexF b,int perm)
-      {
-	Gpermute<vComplexF>(y,b,perm);
-      }
      /*
      friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
      {
@@ -460,7 +436,7 @@ namespace Grid {

    inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) 
    {
-      return conj(l)*r; 
+      return conjugate(l)*r; 
    }

    inline void zeroit(vComplexF &z){ vzero(z);}
--- a/lib/simd/Grid_vInteger.h
+++ b/lib/simd/Grid_vInteger.h
@@ -117,7 +117,7 @@ namespace Grid {
        };
        
        ///////////////////////////////////////////////
-        // mult, sub, add, adj,conj, mac functions
+        // mult, sub, add, adj,conjugate, mac functions
        ///////////////////////////////////////////////
        friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
        friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
--- a/lib/simd/Grid_vRealD.h
+++ b/lib/simd/Grid_vRealD.h
@@ -26,7 +26,7 @@ namespace Grid {
        friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
        friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
        friend inline vRealD adj(const vRealD &in) { return in; }
-        friend inline vRealD conj(const vRealD &in){ return in; }
+        friend inline vRealD conjugate(const vRealD &in){ return in; }
        
        friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
 #if defined (AVX1) || defined (SSE4)
@@ -112,11 +112,12 @@ namespace Grid {
 	// all subtypes; may not be a good assumption, but could
 	// add the vector width as a template param for BG/Q for example
 	////////////////////////////////////////////////////////////////////
-	/*
+
 	friend inline void permute(vRealD &y,vRealD b,int perm)
 	{
 	  Gpermute<vRealD>(y,b,perm);
 	}
+	/*
 	friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
 	{
 	  Gmerge<vRealD,RealD >(y,extracted);
@@ -209,48 +210,26 @@ namespace Grid {

       friend inline RealD Reduce(const vRealD & in)
       {
-#if defined (SSE4)
-	 // FIXME Hack
-	 const RealD * ptr =(const RealD *)  &in;
-	 RealD ret = 0; 
-	 for(int i=0;i<vRealD::Nsimd();i++){
-	   ret = ret+ptr[i];
-	 }
-	 return ret;
+#ifdef SSE4
+	 vRealD v1;
+	 permute(v1,in,0); // sse 128; paired real double
+	 v1=v1+in;
+	 return RealD(v1.v[0]);
 #endif
-#if defined (AVX1) || defined(AVX2)
-	 typedef union  {
-	   uint64_t l;
-	   double   d;
-	 } my_conv_t;
-	 my_conv_t converter;
-// more reduce_add
-/*
-            __attribute__ ((aligned(32))) double c_[16];
-	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
-            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
-  	             tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp  = 3+2,1+0,1+0,3+2
-                    hadd = _mm256_hadd_pd(tmp,tmp);               // tmp  = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
-                    _mm256_store_pd(c_,hadd);ô
-             return c[0]
-*/
-	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
-            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
-                    hadd = _mm256_hadd_pd(hadd,hadd);             // hadd = 1+0+3+2...
-		    converter.l = _mm256_extract_epi64((ivec)hadd,0);
-            return converter.d;
+#if defined(AVX1) || defined (AVX2)
+	 vRealD v1,v2;
+	 permute(v1,in,0); // avx 256; quad double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 return v1.v[0];
 #endif
 #ifdef AVX512
            return _mm512_reduce_add_pd(in.v);
-/*
-            __attribute__ ((aligned(32))) double c_[8];
-           _mm512_store_pd(c_,in.v);
-            return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
-*/
 #endif
 #ifdef QPX
 #endif
-        }
+       }

        // *=,+=,-= operators
        inline vRealD &operator *=(const vRealD &r) {
@@ -270,7 +249,7 @@ namespace Grid {
        static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
    };

-    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
+    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }
    inline void zeroit(vRealD &z){ vzero(z);}

    inline vRealD outerProduct(const vRealD &l, const vRealD& r)
--- a/lib/simd/Grid_vRealF.h
+++ b/lib/simd/Grid_vRealF.h
@@ -92,13 +92,13 @@ namespace Grid {
        };
        
        ///////////////////////////////////////////////
-        // mult, sub, add, adj,conj, mac functions
+        // mult, sub, add, adj,conjugate, mac functions
        ///////////////////////////////////////////////
        friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
        friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
        friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
        friend inline vRealF adj(const vRealF &in) { return in; }
-        friend inline vRealF conj(const vRealF &in){ return in; }
+        friend inline vRealF conjugate(const vRealF &in){ return in; }

        friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
 #if defined (AVX1) || defined (SSE4)
@@ -133,11 +133,12 @@ namespace Grid {
 	// all subtypes; may not be a good assumption, but could
 	// add the vector width as a template param for BG/Q for example
 	////////////////////////////////////////////////////////////////////
-	/*
+
 	friend inline void permute(vRealF &y,vRealF b,int perm)
 	{
 	  Gpermute<vRealF>(y,b,perm);
 	}
+	/*
 	friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
 	{
 	  Gmerge<vRealF,RealF >(y,extracted);
@@ -155,7 +156,6 @@ namespace Grid {
 	  Gextract<vRealF,RealF>(y,extracted);
 	}
 	*/
-
        
        /////////////////////////////////////////////////////
        // Broadcast a value across Nsimd copies.
@@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){
        }
       friend inline RealF Reduce(const vRealF & in)
       {
-#if defined (SSE4)
-	 // FIXME Hack
-	 const RealF * ptr = (const RealF *) &in;
-	 RealF ret = 0; 
-	 for(int i=0;i<vRealF::Nsimd();i++){
-	   ret = ret+ptr[i];
-	 }
-	 return ret;
+#ifdef SSE4
+	 vRealF v1,v2;
+	 permute(v1,in,0); // sse 128; quad single
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 return v1.v[0];
 #endif
-#if defined (AVX1) || defined(AVX2)
-            __attribute__ ((aligned(32))) float c_[16];
-            __m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
-            __m256 hadd = _mm256_hadd_ps(in.v,tmp);
-                   tmp = _mm256_permute2f128_ps(hadd,hadd,0x01);
-                   hadd = _mm256_hadd_ps(tmp,tmp);
-                  _mm256_store_ps(c_,hadd);
-         return (float)c_[0];
-
+#if defined(AVX1) || defined (AVX2)
+	 vRealF v1,v2;
+	 permute(v1,in,0); // avx 256; octo-double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
+	 return v1.v[0];
 #endif
 #ifdef AVX512
            return _mm512_reduce_add_ps(in.v);
-/*
-             __attribute__ ((aligned(64))) float c_[16];
-             _mm512_store_ps(c_,in.v);
-             return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
-                    +c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
-*/
 #endif
 #ifdef QPX
 #endif
@@ -291,7 +284,7 @@ friend inline void vstore(const vRealF &ret, float *a){
    public:
        static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
    };
-    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
+    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; }
    inline void  zeroit(vRealF &z){ vzero(z);}

    inline vRealF outerProduct(const vRealF &l, const vRealF& r)
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -79,7 +79,7 @@ namespace Grid {
    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
-    friend inline Grid_simd adj(const Grid_simd &in){ return conj(in); }
+    friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
        
    //////////////////////////////////
    // Initialise to 1,0,i
@@ -193,7 +193,7 @@ namespace Grid {
    // Conjugate
    ///////////////////////
 								     
-    friend inline Grid_simd  conj(const Grid_simd  &in){
+    friend inline Grid_simd  conjugate(const Grid_simd  &in){
      Grid_simd  ret ; vzero(ret);
      // FIXME add operator
      return ret;
@@ -265,7 +265,7 @@ namespace Grid {
  template<class scalar_type, class vector_type > 
    inline Grid_simd< scalar_type, vector_type>  innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r) 
  {
-    return conj(l)*r; 
+    return conjugate(l)*r; 
  }

  template<class scalar_type, class vector_type >
--- a/scripts/build-all
+++ b/scripts/build-all
@@ -1,6 +1,6 @@
-#!/bin/bash 
+#!/bin/bash  -e

-DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi"
+DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
 EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
 BLACK="\033[30m"
 RED="\033[31m"
--- a/scripts/configure-all
+++ b/scripts/configure-all
@@ -1,6 +1,6 @@
 #!/bin/bash

-DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx"
+DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse"

 for D in $DIRS
 do
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -34,6 +34,9 @@ icpc-avx512)
 icpc-mic)
  CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3  -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
+clang-sse)
+CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
+  ;;
 clang-avx)
 CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
@@ -47,16 +50,16 @@ clang-avx2-openmp)
 CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
 clang-avx-openmp-mpi)
-CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 clang-avx2-openmp-mpi)
-CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 clang-avx-mpi)
-CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx "  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
 ;;
 clang-avx2-mpi)
-CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx "  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
 ;;
 esac
 echo -e $NORMAL
--- a/tests/Grid_main.cc
+++ b/tests/Grid_main.cc
@@ -140,7 +140,7 @@ int main (int argc, char ** argv)

 //     -=,+=,*=,()
 //     add,+,sub,-,mult,mac,*
-//     adj,conj
+//     adj,conjugate
 //     real,imag
 //     transpose,transposeIndex  
 //     trace,traceIndex
@@ -437,7 +437,7 @@ int main (int argc, char ** argv)
        for(int r=0;r<3;r++){
        for(int c=0;c<3;c++){
            diff =shifted1()()(r,c)-shifted2()()(r,c);
-            nn=real(conj(diff)*diff);
+            nn=real(conjugate(diff)*diff);
            if ( nn > 0 )
                cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                    <<shifted1()()(r,c)<<" "<<shifted2()()(r,c)
@@ -451,7 +451,7 @@ int main (int argc, char ** argv)
        for(int r=0;r<3;r++){
        for(int c=0;c<3;c++){
            diff =shifted3()()(r,c)-shifted2()()(r,c);
-            nn=real(conj(diff)*diff);
+            nn=real(conjugate(diff)*diff);
            if ( nn > 0 )
                cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                <<shifted3()()(r,c)<<" "<<shifted2()()(r,c)
@@ -468,7 +468,7 @@ int main (int argc, char ** argv)
        for(int r=0;r<Nc;r++){
        for(int c=0;c<Nc;c++){
            diff =foobar2()()(r,c)-foobar1()()(r,c);
-            nrm = nrm + real(conj(diff)*diff);
+            nrm = nrm + real(conjugate(diff)*diff);
        }}
    }}}}
 	if( Fine.IsBoss() ){
--- a/tests/Grid_rng_fixed.cc
+++ b/tests/Grid_rng_fixed.cc
@@ -0,0 +1,52 @@
+#include <Grid.h>
+#include <parallelIO/GridNerscIO.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+     
+  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+
+  GridSerialRNG            fsRNG;  fsRNG.SeedFixedIntegers(seeds);
+  GridParallelRNG          fpRNG(&Grid);  fpRNG.SeedFixedIntegers(seeds);
+
+  vComplexF tmp; random(fsRNG,tmp);
+  std::cout<<"Random vComplexF (fixed seed)\n"<< tmp<<std::endl;
+  std::cout<<"conjugate(tmp)\n"<< conjugate(tmp)<<std::endl;
+  std::cout<<"conjugate(tmp)*tmp\n"<< conjugate(tmp)*tmp<<std::endl;
+  std::cout<<"innerProduct"<< innerProduct(tmp,tmp)<<std::endl;
+  std::cout<<"Reduce(innerProduct)"<< Reduce(innerProduct(tmp,tmp))<<std::endl;
+
+  SpinMatrix rnd  ; 
+  random(fsRNG,rnd);
+  std::cout<<"Random Spin Matrix (fixed seed)\n"<< rnd<<std::endl;
+
+  SpinVector rv; 
+  random(fsRNG,rv);
+  std::cout<<"Random Spin Vector (fixed seed)\n"<< rv<<std::endl;
+
+  gaussian(fsRNG,rv);
+  std::cout<<"Gaussian Spin Vector (fixed seed)\n"<< rv<<std::endl;
+
+  LatticeColourVector lcv(&Grid);
+
+  LatticeFermion src(&Grid); random(fpRNG,src);
+  std::cout << "src norm : " << norm2(src)<<std::endl;
+  std::cout << "src " << src<<std::endl;
+
+  random(fpRNG,lcv);
+  std::cout<<"Random Lattice Colour Vector (fixed seed)\n"<< lcv<<std::endl;
+
+
+  Grid_finalize();
+}
--- a/tests/Grid_simd.cc
+++ b/tests/Grid_simd.cc
@@ -26,7 +26,7 @@ public:
 class funcConj {
 public:
  funcConj() {};
-  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);}
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conjugate(i1);}
  std::string name(void) const { return std::string("Conj"); }
 };
 class funcAdj {
@@ -49,6 +49,34 @@ public:
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);}
  std::string name(void) const { return std::string("timesMinusI"); }
 };
+class funcInnerProduct {
+public:
+  funcInnerProduct() {};
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = innerProduct(i1,i2);}
+  std::string name(void) const { return std::string("innerProduct"); }
+};
+
+// FIXME still to test:
+//
+//  innerProduct,
+//  norm2, 
+//  Reduce,
+//
+//  mac,mult,sub,add, vone,vzero,vcomplex_i, =zero,
+//  vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar
+//  unary -,
+//  *= , -=, +=
+//  outerproduct, 
+//  zeroit
+//  permute
+
+class funcReduce {
+public:
+  funcReduce() {};
+template<class reduce,class vec>    void vfunc(reduce &rr,vec &i1,vec &i2)   const { rr = Reduce(i1);}
+template<class reduce,class scal>   void sfunc(reduce &rr,scal &i1,scal &i2) const { rr = i1;}
+  std::string name(void) const { return std::string("Reduce"); }
+};

 template<class scal, class vec,class functor > 
 void Tester(const functor &func)
@@ -96,8 +124,59 @@ void Tester(const functor &func)
      ok++;
    }
  }
-  if ( ok==0 ) std::cout << " OK!" <<std::endl;
+  if ( ok==0 ) {
+    std::cout << " OK!" <<std::endl;
+  }
+  assert(ok==0);
+}

+
+template<class reduced,class scal, class vec,class functor > 
+void ReductionTester(const functor &func)
+{
+  GridSerialRNG          sRNG;
+  sRNG.SeedRandomDevice();
+  
+  int Nsimd = vec::Nsimd();
+
+  std::vector<scal> input1(Nsimd);
+  std::vector<scal> input2(Nsimd);
+  reduced result(0);
+  reduced reference(0);
+  reduced tmp;
+
+  std::vector<vec,alignedAllocator<vec> > buf(3);
+  vec & v_input1 = buf[0];
+  vec & v_input2 = buf[1];
+
+
+  for(int i=0;i<Nsimd;i++){
+    random(sRNG,input1[i]);
+    random(sRNG,input2[i]);
+  }
+
+  merge<vec,scal>(v_input1,input1);
+  merge<vec,scal>(v_input2,input2);
+
+  func.template vfunc<reduced,vec>(result,v_input1,v_input2);
+
+  for(int i=0;i<Nsimd;i++) {
+    func.template sfunc<reduced,scal>(tmp,input1[i],input2[i]);
+    reference+=tmp;
+  }
+
+  std::cout << " " << func.name()<<std::endl;
+
+  int ok=0;
+  if ( abs(reference-result)/abs(reference) > 1.0e-6 ){ // rounding is possible for reduce order
+    std::cout<< "*****" << std::endl;
+    std::cout<< abs(reference-result) << " " <<reference<< " " << result<<std::endl;
+    ok++;
+  }
+  if ( ok==0 ) {
+    std::cout << " OK!" <<std::endl;
+  }
+  assert(ok==0);
 }


@@ -127,6 +206,8 @@ int main (int argc, char ** argv)
  Tester<ComplexF,vComplexF>(funcTimes());
  Tester<ComplexF,vComplexF>(funcConj());
  Tester<ComplexF,vComplexF>(funcAdj());
+  Tester<ComplexF,vComplexF>(funcInnerProduct());
+  ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());

  std::cout << "==================================="<<  std::endl;
  std::cout << "Testing vComplexD "<<std::endl;
@@ -140,6 +221,8 @@ int main (int argc, char ** argv)
  Tester<ComplexD,vComplexD>(funcTimes());
  Tester<ComplexD,vComplexD>(funcConj());
  Tester<ComplexD,vComplexD>(funcAdj());
+  Tester<ComplexD,vComplexD>(funcInnerProduct());
+  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());

  std::cout << "==================================="<<  std::endl;
  std::cout << "Testing vRealF "<<std::endl;
@@ -150,6 +233,9 @@ int main (int argc, char ** argv)
  Tester<RealF,vRealF>(funcMinus());
  Tester<RealF,vRealF>(funcTimes());
  Tester<RealF,vRealF>(funcAdj());
+  Tester<RealF,vRealF>(funcConj());
+  Tester<RealF,vRealF>(funcInnerProduct());
+  ReductionTester<RealF,RealF,vRealF>(funcReduce());

  std::cout << "==================================="<<  std::endl;
  std::cout << "Testing vRealD "<<std::endl;
@@ -159,6 +245,9 @@ int main (int argc, char ** argv)
  Tester<RealD,vRealD>(funcMinus());
  Tester<RealD,vRealD>(funcTimes());
  Tester<RealD,vRealD>(funcAdj());
+  Tester<RealD,vRealD>(funcConj());
+  Tester<RealD,vRealD>(funcInnerProduct());
+  ReductionTester<RealD,RealD,vRealD>(funcReduce());

  Grid_finalize();
 }
--- a/tests/Grid_stencil.cc
+++ b/tests/Grid_stencil.cc
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
 	  for(int r=0;r<3;r++){
 	  for(int c=0;c<3;c++){
            diff =check()()(r,c)-bar()()(r,c);
-            double nn=real(conj(diff)*diff);
+            double nn=real(conjugate(diff)*diff);
            if ( nn > 0){
 	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
 		     coor[0],coor[1],coor[2],coor[3],r,c,
@@ -103,8 +103,8 @@ int main (int argc, char ** argv)
 		     real(bar()()(r,c))
 		     );
 	    }
-	    snrmC=snrmC+real(conj(check()()(r,c))*check()()(r,c));
-	    snrmB=snrmB+real(conj(bar()()(r,c))*bar()()(r,c));
+	    snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c));
+	    snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c));
 	    snrm=snrm+nn;
 	  }}
 	 
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
-bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez
+bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez Grid_rng_fixed

 Grid_main_SOURCES = Grid_main.cc
 Grid_main_LDADD = -lGrid
@@ -13,6 +13,9 @@ Grid_main_LDADD = -lGrid
 Grid_rng_SOURCES = Grid_rng.cc
 Grid_rng_LDADD = -lGrid

+Grid_rng_fixed_SOURCES = Grid_rng_fixed.cc
+Grid_rng_fixed_LDADD = -lGrid
+
 Grid_remez_SOURCES = Grid_remez.cc
 Grid_remez_LDADD = -lGrid

--- a/tests/test_Grid_jacobi.cc
+++ b/tests/test_Grid_jacobi.cc
@@ -186,7 +186,7 @@ int main (int argc, char ** argv)
 	  for(int r=0;r<3;r++){
 	  for(int c=0;c<3;c++){
            diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
-            double nn=real(conj(diff)*diff);
+            double nn=real(conjugate(diff)*diff);
            if ( nn > 0 ){
 	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
 		     coor[0],coor[1],coor[2],coor[3],r,c,