From a6e1ea216d5d92e75a6f3c3b2ac4b291ae073a5d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 19 May 2015 13:57:35 +0100
Subject: [PATCH] Got unpreconditioned conjugate gradient to run and converge
 on a random (uniform random, not even SU(3) for now) gauge field. Convergence
 history is correctly indepdendent of decomposition on 1,2,4,8,16 mpi tasks.
 Found a couple of simd bugs which required fixed and enhanced the
 Grid_simd.cc test suite. Implemented the Mdag, M, MdagM, Meooe Mooee schur
 type stuff in the wilson dop.

---
 Makefile.in                                  |  44 +++--
 aclocal.m4                                   |  61 +++----
 benchmarks/Grid_wilson.cc                    |  35 +++-
 benchmarks/Grid_wilson_cg_unprec.cc          |  56 +++++++
 benchmarks/Makefile.am                       |   6 +-
 config.guess                                 |   2 +-
 config.sub                                   |   2 +-
 configure                                    |  13 +-
 lib/Grid_simd.h                              |  20 ++-
 lib/algorithms/LinearOperator.h              |  13 +-
 lib/algorithms/iterative/ConjugateGradient.h |  16 +-
 lib/lattice/Grid_lattice_ET.h                |   4 +-
 lib/lattice/Grid_lattice_arith.h             |  40 ++++-
 lib/lattice/Grid_lattice_base.h              |   2 +-
 lib/lattice/Grid_lattice_reality.h           |   4 +-
 lib/math/Grid_math_reality.h                 |  12 +-
 lib/qcd/Grid_qcd_wilson_dop.cc               | 160 ++++++++++---------
 lib/qcd/Grid_qcd_wilson_dop.h                |   5 +-
 lib/simd/Grid_vComplexD.h                    |  26 +--
 lib/simd/Grid_vComplexF.h                    |  66 +++-----
 lib/simd/Grid_vInteger.h                     |   2 +-
 lib/simd/Grid_vRealD.h                       |  55 ++-----
 lib/simd/Grid_vRealF.h                       |  49 +++---
 lib/simd/Grid_vector_types.h                 |   6 +-
 scripts/build-all                            |   4 +-
 scripts/configure-all                        |   2 +-
 scripts/configure-commands                   |  11 +-
 tests/Grid_main.cc                           |   8 +-
 tests/Grid_rng_fixed.cc                      |  52 ++++++
 tests/Grid_simd.cc                           |  93 ++++++++++-
 tests/Grid_stencil.cc                        |   6 +-
 tests/Makefile.am                            |   5 +-
 tests/test_Grid_jacobi.cc                    |   2 +-
 33 files changed, 566 insertions(+), 316 deletions(-)
 create mode 100644 benchmarks/Grid_wilson_cg_unprec.cc
 create mode 100644 tests/Grid_rng_fixed.cc

diff --git a/Makefile.in b/Makefile.in
index b6894ef6..b9fba168 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# Makefile.in generated by automake 1.15 from Makefile.am.
 # @configure_input@
 
-# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
 
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -14,7 +14,17 @@
 
 @SET_MAKE@
 VPATH = @srcdir@
-am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
 am__make_running_with_option = \
   case $${target_option-} in \
       ?) ;; \
@@ -79,14 +89,12 @@ build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 subdir = .
-DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
-	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
-	$(top_srcdir)/configure $(am__configure_deps) COPYING TODO \
-	compile config.guess config.sub depcomp install-sh missing
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \
+	$(am__configure_deps) $(am__DIST_COMMON)
 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
  configure.lineno config.status.lineno
 mkinstalldirs = $(install_sh) -d
@@ -149,6 +157,9 @@ ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
 DIST_SUBDIRS = $(SUBDIRS)
+am__DIST_COMMON = $(srcdir)/Makefile.in AUTHORS COPYING ChangeLog \
+	INSTALL NEWS README TODO compile config.guess config.sub \
+	depcomp install-sh missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -314,7 +325,6 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \
 	$(am__cd) $(top_srcdir) && \
 	  $(AUTOMAKE) --gnu Makefile
-.PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \
@@ -521,15 +531,15 @@ dist-xz: distdir
 	$(am__post_remove_distdir)
 
 dist-tarZ: distdir
-	@echo WARNING: "Support for shar distribution archives is" \
-	               "deprecated." >&2
+	@echo WARNING: "Support for distribution archives compressed with" \
+		       "legacy program 'compress' is deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
 	$(am__post_remove_distdir)
 
 dist-shar: distdir
-	@echo WARNING: "Support for distribution archives compressed with" \
-		       "legacy program 'compress' is deprecated." >&2
+	@echo WARNING: "Support for shar distribution archives is" \
+	               "deprecated." >&2
 	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
 	$(am__post_remove_distdir)
@@ -565,17 +575,17 @@ distcheck: dist
 	esac
 	chmod -R a-w $(distdir)
 	chmod u+w $(distdir)
-	mkdir $(distdir)/_build $(distdir)/_inst
+	mkdir $(distdir)/_build $(distdir)/_build/sub $(distdir)/_inst
 	chmod a-w $(distdir)
 	test -d $(distdir)/_build || exit 0; \
 	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
 	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
 	  && am__cwd=`pwd` \
-	  && $(am__cd) $(distdir)/_build \
-	  && ../configure \
+	  && $(am__cd) $(distdir)/_build/sub \
+	  && ../../configure \
 	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
 	    $(DISTCHECK_CONFIGURE_FLAGS) \
-	    --srcdir=.. --prefix="$$dc_install_base" \
+	    --srcdir=../.. --prefix="$$dc_install_base" \
 	  && $(MAKE) $(AM_MAKEFLAGS) \
 	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
 	  && $(MAKE) $(AM_MAKEFLAGS) check \
@@ -749,6 +759,8 @@ uninstall-am:
 	maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
 	pdf-am ps ps-am tags tags-am uninstall uninstall-am
 
+.PRECIOUS: Makefile
+
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/aclocal.m4 b/aclocal.m4
index 389763bf..bf79d078 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.15 -*- Autoconf -*-
 
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])
 
-# Copyright (C) 2002-2013 Free Software Foundation, Inc.
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
 # generated from the m4 files accompanying Automake X.Y.
 # (This private macro should not be called outside this file.)
 AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.14'
+[am__api_version='1.15'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.14.1], [],
+m4_if([$1], [1.15], [],
       [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])
 
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.14.1])dnl
+[AM_AUTOMAKE_VERSION([1.15])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -103,15 +103,14 @@ _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 # configured tree to be moved without reconfiguration.
 
 AC_DEFUN([AM_AUX_DIR_EXPAND],
-[dnl Rely on autoconf to set up CDPATH properly.
-AC_PREREQ([2.50])dnl
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 ])
 
 # AM_CONDITIONAL                                            -*- Autoconf -*-
 
-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -142,7 +141,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])
 
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -333,7 +332,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
 
 # Generate code to set up dependency tracking.              -*- Autoconf -*-
 
-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -409,7 +408,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 
 # Do all the work for Automake.                             -*- Autoconf -*-
 
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -499,8 +498,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 AC_REQUIRE([AC_PROG_AWK])dnl
 AC_REQUIRE([AC_PROG_MAKE_SET])dnl
 AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -573,7 +572,11 @@ to "yes", and re-run configure.
 END
     AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
   fi
-fi])
+fi
+dnl The trailing newline in this macro's definition is deliberate, for
+dnl backward compatibility and to allow trailing 'dnl'-style comments
+dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
+])
 
 dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@@ -602,7 +605,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -613,7 +616,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
 # Define $install_sh.
 AC_DEFUN([AM_PROG_INSTALL_SH],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
   case $am_aux_dir in
   *\ * | *\	*)
     install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -623,7 +626,7 @@ if test x"${install_sh}" != xset; then
 fi
 AC_SUBST([install_sh])])
 
-# Copyright (C) 2003-2013 Free Software Foundation, Inc.
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -644,7 +647,7 @@ AC_SUBST([am__leading_dot])])
 
 # Check to see how 'make' treats includes.	            -*- Autoconf -*-
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -694,7 +697,7 @@ rm -f confinc confmf
 
 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-
 
-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -733,7 +736,7 @@ fi
 
 # Helper functions for option handling.                     -*- Autoconf -*-
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -764,7 +767,7 @@ AC_DEFUN([_AM_IF_OPTION],
 
 # Check to make sure that the build environment is sane.    -*- Autoconf -*-
 
-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -845,7 +848,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])
 
-# Copyright (C) 2009-2013 Free Software Foundation, Inc.
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -905,7 +908,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])
 
-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -933,7 +936,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])
 
-# Copyright (C) 2006-2013 Free Software Foundation, Inc.
+# Copyright (C) 2006-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -952,7 +955,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 
 # Check how to create a tarball.                            -*- Autoconf -*-
 
-# Copyright (C) 2004-2013 Free Software Foundation, Inc.
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
diff --git a/benchmarks/Grid_wilson.cc b/benchmarks/Grid_wilson.cc
index e49d4515..69f5b48f 100644
--- a/benchmarks/Grid_wilson.cc
+++ b/benchmarks/Grid_wilson.cc
@@ -46,6 +46,13 @@ int main (int argc, char ** argv)
     volume=volume*latt_size[mu];
   }  
 
+  // Only one non-zero (y)
+  Umu=zero;
+  for(int nn=0;nn<Nd;nn++){
+    random(pRNG,U[nn]);
+    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
+  }
+
   for(int mu=0;mu<Nd;mu++){
     U[mu] = peekIndex<LorentzIndex>(Umu,mu);
   }
@@ -53,7 +60,6 @@ int main (int argc, char ** argv)
   { // Naive wilson implementation
     ref = zero;
     for(int mu=0;mu<Nd;mu++){
-
       //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
       tmp = U[mu]*Cshift(src,mu,1);
       for(int i=0;i<ref._odata.size();i++){
@@ -75,7 +81,7 @@ int main (int argc, char ** argv)
   int ncall=1000;
   double t0=usecond();
   for(int i=0;i<ncall;i++){
-    Dw.M(src,result);
+    Dw.Dhop(src,result,0);
   }
   double t1=usecond();
   double flops=1320*volume*ncall;
@@ -99,5 +105,30 @@ int main (int argc, char ** argv)
     }
   }
 
+  { // Naive wilson dag implementation
+    ref = zero;
+    for(int mu=0;mu<Nd;mu++){
+
+      //    ref =  src - Gamma(Gamma::GammaX)* src ; // 1+gamma_x
+      tmp = U[mu]*Cshift(src,mu,1);
+      for(int i=0;i<ref._odata.size();i++){
+	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
+      }
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu,-1);
+      for(int i=0;i<ref._odata.size();i++){
+	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
+      }
+    }
+  }
+  Dw.Dhop(src,result,1);
+  std::cout << "Called DwDag"<<std::endl;
+  std::cout << "norm result "<< norm2(result)<<std::endl;
+  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  err = ref -result;
+  std::cout << "norm diff   "<< norm2(err)<<std::endl;
+
+
   Grid_finalize();
 }
diff --git a/benchmarks/Grid_wilson_cg_unprec.cc b/benchmarks/Grid_wilson_cg_unprec.cc
new file mode 100644
index 00000000..2a91864b
--- /dev/null
+++ b/benchmarks/Grid_wilson_cg_unprec.cc
@@ -0,0 +1,56 @@
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeFermion src(&Grid); random(pRNG,src);
+  std::cout << "src norm" << norm2(src)<<std::endl;
+  LatticeFermion result(&Grid); result=zero;
+  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+
+  std::vector<LatticeColourMatrix> U(4,&Grid);
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
+  }
+  
+  RealD mass=0.5;
+  WilsonMatrix Dw(Umu,mass);
+  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  CG(HermOp,src,result);
+
+  
+  Grid_finalize();
+}
diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am
index 2fe40feb..77ce7663 100644
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -5,12 +5,14 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
-bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3
-
+bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec
 
 Grid_wilson_SOURCES = Grid_wilson.cc
 Grid_wilson_LDADD = -lGrid
 
+Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc
+Grid_wilson_cg_unprec_LDADD = -lGrid
+
 Grid_comms_SOURCES = Grid_comms.cc
 Grid_comms_LDADD = -lGrid
 
diff --git a/config.guess b/config.guess
index 5f6aa02d..a12faba2 120000
--- a/config.guess
+++ b/config.guess
@@ -1 +1 @@
-/usr/share/automake-1.14/config.guess
\ No newline at end of file
+/opt/local/share/automake-1.15/config.guess
\ No newline at end of file
diff --git a/config.sub b/config.sub
index 0abfe18c..e3c9b5ca 120000
--- a/config.sub
+++ b/config.sub
@@ -1 +1 @@
-/usr/share/automake-1.14/config.sub
\ No newline at end of file
+/opt/local/share/automake-1.15/config.sub
\ No newline at end of file
diff --git a/configure b/configure
index e7d9f32f..b42143e2 100755
--- a/configure
+++ b/configure
@@ -2466,7 +2466,7 @@ test -n "$target_alias" &&
     NONENONEs,x,x, &&
   program_prefix=${target_alias}-
 
-am__api_version='1.14'
+am__api_version='1.15'
 
 # Find a good install program.  We prefer a C program (faster),
 # so one script is as good as another.  But avoid the broken or
@@ -2638,8 +2638,8 @@ test "$program_suffix" != NONE &&
 ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
 program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
 
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 
 if test x"${MISSING+set}" != xset; then
   case $am_aux_dir in
@@ -2658,7 +2658,7 @@ else
 $as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
 fi
 
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
   case $am_aux_dir in
   *\ * | *\	*)
     install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -2986,8 +2986,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 mkdir_p='$(MKDIR_P)'
 
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 # Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AMTAR='$${TAR-tar}'
@@ -3046,6 +3046,7 @@ END
 fi
 
 
+
 ac_config_headers="$ac_config_headers lib/Grid_config.h"
 
 # Check whether --enable-silent-rules was given.
diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h
index af77591c..6d7411d3 100644
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -50,15 +50,20 @@ namespace Grid {
   typedef std::complex<Real>  Complex;
 
   inline RealF adj(const RealF  & r){ return r; }
-  inline RealF conj(const RealF  & r){ return r; }
+  inline RealF conjugate(const RealF  & r){ return r; }
   inline RealF real(const RealF  & r){ return r; }
 
   inline RealD adj(const RealD  & r){ return r; }
-  inline RealD conj(const RealD  & r){ return r; }
+  inline RealD conjugate(const RealD  & r){ return r; }
   inline RealD real(const RealD  & r){ return r; }
 
-  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
-  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
+  inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); }
+  inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
+  inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); }
+  inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
+
+  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; }
+  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
   inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
   inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
 
@@ -70,15 +75,14 @@ namespace Grid {
     inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
     inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
     inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
-    inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
-    // conj already supported for complex
+    // conjugate already supported for complex
     
     inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
     inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
     inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
     inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
-    inline ComplexF  adj(const ComplexF& r ){ return(conj(r)); }
-    //conj already supported for complex
+
+    //conjugate already supported for complex
 
     inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
     inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index bb948b95..bdb6c387 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -7,8 +7,8 @@ namespace Grid {
   // LinearOperators Take a something and return a something.
   /////////////////////////////////////////////////////////////////////////////////////////////
   //
-  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
-  //
+  // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
+  //SBase
   //   i)  F(a x + b y) = aF(x) + b F(y).
   //  ii)  <x|Op|y> = <y|AdjOp|x>^\ast
   //
@@ -25,12 +25,13 @@ namespace Grid {
   /////////////////////////////////////////////////////////////////////////////////////////////
     template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
     public:
-      virtual RealD OpAndNorm(const Field &in, Field &out);
+      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
       void AdjOp(const Field &in, Field &out) {
 	Op(in,out);
       };
       void Op(const Field &in, Field &out) {
-	OpAndNorm(in,out);
+	double n1,n2;
+	OpAndNorm(in,out,n1,n2);
       };
     };
 
@@ -80,8 +81,8 @@ namespace Grid {
       Matrix &_Mat;
     public:
       HermitianOperator(Matrix &Mat): _Mat(Mat) {};
-      RealD OpAndNorm(const Field &in, Field &out){
-	return _Mat.MdagM(in,out);
+      void OpAndNorm(const Field &in, Field &out,double &n1,double &n2){
+	return _Mat.MdagM(in,out,n1,n2);
       }
     };
 
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index bcb6ab60..f7f8559e 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -18,6 +18,7 @@ public:
       std::cout << Tolerance<<std::endl;
     };
 
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
     void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){
 
       RealD cp,c,a,d,b,ssq,qq,b_pred;
@@ -61,21 +62,27 @@ public:
 	
 	Linop.OpAndNorm(p,mmp,d,qq);
 	  
+	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<std::endl;
+      
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;
 	
-	cp = axpy_norm(r,mmp,r,-a);
+	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
+
+	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
+	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,b "<<cp<< " "<<b <<std::endl;
 	
 	// Fuse these loops ; should be really easy
 	psi= a*p+psi;
 	p  = p*b+r;
 	  
-	std::cout << "Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+	std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
+
 	// Stopping condition
 	if ( cp <= rsq ) { 
 	  
-	  Linop.Op(p,mmp);
+	  Linop.Op(psi,mmp);
 	  p=mmp-src;
 	  
 	  RealD mmpnorm = sqrt(norm2(mmp));
@@ -83,8 +90,11 @@ public:
 	  RealD srcnorm = sqrt(norm2(src));
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;
+
+	  std::cout<<"ConjugateGradient: Converged on iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 	  std::cout<<"ConjugateGradient: true   residual  is "<<true_residual<<" sol "<<psinorm<<" src "<<srcnorm<<std::endl;
 	  std::cout<<"ConjugateGradient: target residual was "<<Tolerance<<std::endl;
+	  return;
 	}
       }
       std::cout<<"ConjugateGradient did NOT converge"<<std::endl;
diff --git a/lib/lattice/Grid_lattice_ET.h b/lib/lattice/Grid_lattice_ET.h
index c0b5f97d..b0205f2f 100644
--- a/lib/lattice/Grid_lattice_ET.h
+++ b/lib/lattice/Grid_lattice_ET.h
@@ -102,7 +102,7 @@ template <class arg> struct name\
 
 GridUnopClass(UnarySub,-a);
 GridUnopClass(UnaryAdj,adj(a));
-GridUnopClass(UnaryConj,conj(a));
+GridUnopClass(UnaryConj,conjugate(a));
 GridUnopClass(UnaryTrace,trace(a));
 GridUnopClass(UnaryTranspose,transpose(a));
 
@@ -178,7 +178,7 @@ template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,con
 
 GRID_DEF_UNOP(operator -,UnarySub);
 GRID_DEF_UNOP(adj,UnaryAdj);
-GRID_DEF_UNOP(conj,UnaryConj);
+GRID_DEF_UNOP(conjugate,UnaryConj);
 GRID_DEF_UNOP(trace,UnaryTrace);
 GRID_DEF_UNOP(transpose,UnaryTranspose);
 
diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h
index c0bbb2b6..f1e566a2 100644
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -144,14 +144,44 @@ PARALLEL_FOR_LOOP
   }
   
   template<class sobj,class vobj> strong_inline
-  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-    conformable(lhs,rhs);
+  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
 #pragma omp parallel for
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      vobj tmp = a*lhs._odata[ss];
-      vstream(ret._odata[ss],tmp+rhs._odata[ss]);
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+y._odata[ss];
+      vstream(ret._odata[ss],tmp);
     }
   }
+  template<class sobj,class vobj> strong_inline
+  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
+#pragma omp parallel for
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+      vstream(ret._odata[ss],tmp);
+    }
+  }
+
+  template<class sobj,class vobj> strong_inline
+  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
+#pragma omp parallel for
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+y._odata[ss];
+      vstream(ret._odata[ss],tmp);
+    }
+    return norm2(ret);
+  }
+  template<class sobj,class vobj> strong_inline
+  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    conformable(x,y);
+#pragma omp parallel for
+    for(int ss=0;ss<x._grid->oSites();ss++){
+      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+      vstream(ret._odata[ss],tmp);
+    }
+    return norm2(ret); // FIXME implement parallel norm in ss loop
+  }
 
 }
 #endif
diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h
index ae606ae7..0016bafa 100644
--- a/lib/lattice/Grid_lattice_base.h
+++ b/lib/lattice/Grid_lattice_base.h
@@ -9,7 +9,7 @@ namespace Grid {
 // Functionality:
 //     -=,+=,*=,()
 //     add,+,sub,-,mult,mac,*
-//     adj,conj
+//     adj,conjugate
 //     real,imag
 //     transpose,transposeIndex  
 //     trace,traceIndex
diff --git a/lib/lattice/Grid_lattice_reality.h b/lib/lattice/Grid_lattice_reality.h
index cf86d700..2f52ed22 100644
--- a/lib/lattice/Grid_lattice_reality.h
+++ b/lib/lattice/Grid_lattice_reality.h
@@ -18,11 +18,11 @@ PARALLEL_FOR_LOOP
         return ret;
     };
 
-    template<class vobj> inline Lattice<vobj> conj(const Lattice<vobj> &lhs){
+    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
         Lattice<vobj> ret(lhs._grid);
 PARALLEL_FOR_LOOP
         for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = conj(lhs._odata[ss]);
+            ret._odata[ss] = conjugate(lhs._odata[ss]);
         }
         return ret;
     };
diff --git a/lib/math/Grid_math_reality.h b/lib/math/Grid_math_reality.h
index 18a2f89b..40d4c624 100644
--- a/lib/math/Grid_math_reality.h
+++ b/lib/math/Grid_math_reality.h
@@ -98,26 +98,26 @@ template<class vtype,int N> inline void  timesMinusI(iMatrix<vtype,N> &ret,const
 /////////////////////////////////////////////// 
 // Conj function for scalar, vector, matrix
 /////////////////////////////////////////////// 
-template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
+template<class vtype> inline iScalar<vtype> conjugate(const iScalar<vtype>&r)
 {
     iScalar<vtype> ret;
-    ret._internal = conj(r._internal);
+    ret._internal = conjugate(r._internal);
     return ret;
 }
-template<class vtype,int N> inline iVector<vtype,N> conj(const iVector<vtype,N>&r)
+template<class vtype,int N> inline iVector<vtype,N> conjugate(const iVector<vtype,N>&r)
 {
   iVector<vtype,N> ret;
   for(int i=0;i<N;i++){
-    ret._internal[i] = conj(r._internal[i]);
+    ret._internal[i] = conjugate(r._internal[i]);
   }
   return ret;
 }
-template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&r)
+template<class vtype,int N> inline iMatrix<vtype,N> conjugate(const iMatrix<vtype,N>&r)
 {
   iMatrix<vtype,N> ret;
   for(int i=0;i<N;i++){
   for(int j=0;j<N;j++){
-    ret._internal[i][j] = conj(r._internal[i][j]);
+    ret._internal[i][j] = conjugate(r._internal[i][j]);
   }}
   return ret;
 }
diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc
index 6a67efe3..5143daf3 100644
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -21,7 +21,13 @@ const int WilsonMatrix::Tm = 7;
   class WilsonCompressor {
   public:
     int mu;
-    
+    int dag;
+
+    WilsonCompressor(int _dag){
+      mu=0;
+      dag=_dag;
+      assert((dag==0)||(dag==1));
+    }
     void Point(int p) { 
       mu=p;
     };
@@ -29,7 +35,11 @@ const int WilsonMatrix::Tm = 7;
     vHalfSpinColourVector operator () (const vSpinColourVector &in)
     {
       vHalfSpinColourVector ret;
-      switch(mu) {
+      int mudag=mu;
+      if (dag) {
+	mudag=(mu+Nd)%(2*Nd);
+      }
+      switch(mudag) {
       case WilsonMatrix::Xp:
 	spProjXp(ret,in);
 	break;
@@ -87,43 +97,52 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
 
 RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
-  return 0.0;
+  Dhop(in,out,0);
+  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
+  return norm2(out);
 }
 RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
-  return 0.0;
+  Dhop(in,out,1);
+  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
+  return norm2(out);
 }
 
 void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
+  Dhop(in,out,0);
+  out = 0.5*out; // FIXME : scale factor in Dhop
 }
 void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 {
-  Dhop(in,out);
+  Dhop(in,out,1);
 }
 void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (4.0+mass)*in;
   return ;
 }
 void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (1.0/(4.0+mass))*in;
   return ;
 }
 void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (1.0/(4.0+mass))*in;
   return ;
 }
 void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 {
+  out = (1.0/(4.0+mass))*in;
   return ;
 }
 
-void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
+void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
 {
-  WilsonCompressor compressor;
+  assert((dag==0) ||(dag==1));
+
+  WilsonCompressor compressor(dag);
   Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 
 PARALLEL_FOR_LOOP
@@ -140,13 +159,13 @@ PARALLEL_FOR_LOOP
     int ssu= ss;
 
     // Xp
-    offset = Stencil._offsets [Xp][ss];
-    local  = Stencil._is_local[Xp][ss];
-    perm   = Stencil._permute[Xp][ss];
-    ptype  = Stencil._permute_type[Xp];
+    int idx = (Xp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
 
-    if ( local && perm ) 
-    {
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
       spProjXp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -154,18 +173,17 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
-    //prefetch(Umu._odata[ssu](Yp));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     spReconXp(result,Uchi);
 
     // Yp
-    offset = Stencil._offsets [Yp][ss];
-    local  = Stencil._is_local[Yp][ss];
-    perm   = Stencil._permute[Yp][ss];
-    ptype  = Stencil._permute_type[Yp];
+    idx = (Yp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
 
-    if ( local && perm ) 
-    {
+    if ( local && perm ) {
       spProjYp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -173,18 +191,16 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
-    //    prefetch(Umu._odata[ssu](Zp));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     accumReconYp(result,Uchi);
 
     // Zp
-    offset = Stencil._offsets [Zp][ss];
-    local  = Stencil._is_local[Zp][ss];
-    perm   = Stencil._permute[Zp][ss];
-    ptype  = Stencil._permute_type[Zp];
-
-    if ( local && perm ) 
-    {
+    idx = (Zp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
       spProjZp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -192,18 +208,16 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
-    //    prefetch(Umu._odata[ssu](Tp));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     accumReconZp(result,Uchi);
 
     // Tp
-    offset = Stencil._offsets [Tp][ss];
-    local  = Stencil._is_local[Tp][ss];
-    perm   = Stencil._permute[Tp][ss];
-    ptype  = Stencil._permute_type[Tp];
-
-    if ( local && perm ) 
-    {
+    idx = (Tp+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
       spProjTp(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -211,15 +225,15 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
-    //    prefetch(Umu._odata[ssu](Xm));
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     accumReconTp(result,Uchi);
 
     // Xm
-    offset = Stencil._offsets [Xm][ss];
-    local  = Stencil._is_local[Xm][ss];
-    perm   = Stencil._permute[Xm][ss];
-    ptype  = Stencil._permute_type[Xm];
+    idx = (Xm+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
 
     if ( local && perm ) 
     {
@@ -230,18 +244,18 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     accumReconXm(result,Uchi);
 
 
     // Ym
-    offset = Stencil._offsets [Ym][ss];
-    local  = Stencil._is_local[Ym][ss];
-    perm   = Stencil._permute[Ym][ss];
-    ptype  = Stencil._permute_type[Ym];
+    idx = (Ym+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
 
-    if ( local && perm ) 
-    {
+    if ( local && perm ) {
       spProjYm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -249,17 +263,16 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     accumReconYm(result,Uchi);
 
     // Zm
-    offset = Stencil._offsets [Zm][ss];
-    local  = Stencil._is_local[Zm][ss];
-    perm   = Stencil._permute[Zm][ss];
-    ptype  = Stencil._permute_type[Zm];
-
-    if ( local && perm ) 
-    {
+    idx = (Zm+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
       spProjZm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -267,17 +280,16 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     accumReconZm(result,Uchi);
 
     // Tm
-    offset = Stencil._offsets [Tm][ss];
-    local  = Stencil._is_local[Tm][ss];
-    perm   = Stencil._permute[Tm][ss];
-    ptype  = Stencil._permute_type[Tm];
-
-    if ( local && perm ) 
-    {
+    idx = (Tm+dag*4)%8;
+    offset = Stencil._offsets [idx][ss];
+    local  = Stencil._is_local[idx][ss];
+    perm   = Stencil._permute[idx][ss];
+    ptype  = Stencil._permute_type[idx];
+    if ( local && perm ) {
       spProjTm(tmp,in._odata[offset]);
       permute(chi,tmp,ptype);
     } else if ( local ) {
@@ -285,7 +297,7 @@ PARALLEL_FOR_LOOP
     } else { 
       chi=comm_buf[offset];
     }
-    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
     accumReconTm(result,Uchi);
 
     vstream(out._odata[ss],result);
@@ -294,10 +306,6 @@ PARALLEL_FOR_LOOP
 }
 
 
-void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out)
-{
-  return;
-}
 
 
 
diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h
index 1098d330..1db95b33 100644
--- a/lib/qcd/Grid_qcd_wilson_dop.h
+++ b/lib/qcd/Grid_qcd_wilson_dop.h
@@ -45,10 +45,7 @@ namespace Grid {
       virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
 
       // non-hermitian hopping term; half cb or both
-      void Dhop(const LatticeFermion &in, LatticeFermion &out);
-
-      // m+4r -1/2 Dhop; both cb's
-      void Dw(const LatticeFermion &in, LatticeFermion &out);
+      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
 
       typedef iScalar<iMatrix<vComplex, Nc> > matrix;
 
diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h
index 41c8509f..2b03c825 100644
--- a/lib/simd/Grid_vComplexD.h
+++ b/lib/simd/Grid_vComplexD.h
@@ -32,7 +32,7 @@ namespace Grid {
         friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
         friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
         friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
-        friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
+        friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); }
 
         //////////////////////////////////
         // Initialise to 1,0,i
@@ -166,11 +166,11 @@ namespace Grid {
 	// all subtypes; may not be a good assumption, but could
 	// add the vector width as a template param for BG/Q for example
 	////////////////////////////////////////////////////////////////////
-	/*
 	friend inline void permute(vComplexD &y,vComplexD b,int perm)
 	{
 	  Gpermute<vComplexD>(y,b,perm);
 	}
+	/*
 	friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
 	{
 	  Gmerge<vComplexD,ComplexD >(y,extracted);
@@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
         ////////////////////////
         // Conjugate
         ////////////////////////
-        friend inline vComplexD conj(const vComplexD &in){
+        friend inline vComplexD conjugate(const vComplexD &in){
             vComplexD ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
 	    //	    addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
@@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 // REDUCE FIXME must be a cleaner implementation
        friend inline ComplexD Reduce(const vComplexD & in)
        { 
-#if defined SSE4
-         return ComplexD(in.v[0], in.v[1]); // inefficient
+#ifdef SSE4
+	 return ComplexD(in.v[0],in.v[1]);
+#endif
+#if defined(AVX1) || defined (AVX2)
+	 vComplexD v1;
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 return ComplexD(v1.v[0],v1.v[1]);
 #endif
-#if defined (AVX1) || defined(AVX2)
-	 //            return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
-	 __attribute__ ((aligned(32))) double c_[4];
-         _mm256_store_pd(c_,in.v);
-	 return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
-#endif 
 #ifdef AVX512
-            return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
+	 return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
 #endif 
 #ifdef QPX
 #endif
@@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
     };
 
 
-    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
+    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; }
 
 
     typedef  vComplexD vDComplex;
diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h
index 8a892d24..713bbdd8 100644
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Grid_vComplexF.h
@@ -47,7 +47,7 @@ namespace Grid {
         friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
         friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
         friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
-        friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
+        friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); }
         
         //////////////////////////////////
         // Initialise to 1,0,i
@@ -228,42 +228,25 @@ namespace Grid {
             ret.v = {a,b,a,b};
 #endif
         }
-       friend inline ComplexF Reduce(const vComplexF & in)
-       {
+	friend inline void permute(vComplexF &y,vComplexF b,int perm)
+	{
+	  Gpermute<vComplexF>(y,b,perm);
+	}
+	friend inline ComplexF Reduce(const vComplexF & in)
+	{
 #ifdef SSE4
-	   union {
-	     cvec v1;    // SSE 4 x float vector
-	     float f[4];  // scalar array of 4 floats
-	   } u128;
-	   u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
-	   return ComplexF(u128.f[0], u128.f[1]);
+	 vComplexF v1;
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
-#ifdef AVX1
-	   //it would be better passing 2 arguments to saturate the vector lanes
-	   union {
-	     __m256 v1;    
-	     float f[8];  
-	   } u256;
-	   //SWAP lanes
-	   // FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large
-	   __m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1);
-	   __m256 t1 = _mm256_permute_ps(in.v  , 0b11011000);//real (0,2,1,3)
-	   __m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2)
-	   t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1)
-	   t1 = _mm256_hadd_ps(t0,t0);
-	   u256.v1 = _mm256_hadd_ps(t1, t1);
-	   return ComplexF(u256.f[0], u256.f[4]);
-#endif
-#ifdef AVX2
-	   union {
-	     __m256 v1;    
-	     float f[8];  
-	   } u256;
-	   const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0);
-	   __m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask);
-	   __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
-	   u256.v1 = _mm256_hadd_ps(tmp2, tmp2);
-	   return ComplexF(u256.f[0], u256.f[4]);
+#if defined(AVX1) || defined (AVX2)
+	 vComplexF v1,v2;
+	 permute(v1,in,0); // sse 128; paired complex single
+	 v1=v1+in;
+	 permute(v2,v1,1); // avx 256; quad complex single
+	 v1=v1+v2;
+	 return ComplexF(v1.v[0],v1.v[1]);
 #endif
 #ifdef AVX512
             return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
@@ -345,13 +328,10 @@ namespace Grid {
         // Conjugate
         ///////////////////////
 								     
-        friend inline vComplexF conj(const vComplexF &in){
+        friend inline vComplexF conjugate(const vComplexF &in){
             vComplexF ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
-	    cvec tmp;
-	    tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
-	    ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
-
+	    ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f));
 #endif
 #ifdef SSE4
 	    ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f));
@@ -433,10 +413,6 @@ namespace Grid {
             return *this;
         }
 
-      friend inline void permute(vComplexF &y,vComplexF b,int perm)
-      {
-	Gpermute<vComplexF>(y,b,perm);
-      }
       /*
       friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
       {
@@ -460,7 +436,7 @@ namespace Grid {
 
     inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) 
     {
-      return conj(l)*r; 
+      return conjugate(l)*r; 
     }
 
     inline void zeroit(vComplexF &z){ vzero(z);}
diff --git a/lib/simd/Grid_vInteger.h b/lib/simd/Grid_vInteger.h
index 63e8c720..ee4a3633 100644
--- a/lib/simd/Grid_vInteger.h
+++ b/lib/simd/Grid_vInteger.h
@@ -117,7 +117,7 @@ namespace Grid {
         };
         
         ///////////////////////////////////////////////
-        // mult, sub, add, adj,conj, mac functions
+        // mult, sub, add, adj,conjugate, mac functions
         ///////////////////////////////////////////////
         friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
         friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h
index dbfacb0c..83979d14 100644
--- a/lib/simd/Grid_vRealD.h
+++ b/lib/simd/Grid_vRealD.h
@@ -26,7 +26,7 @@ namespace Grid {
         friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
         friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
         friend inline vRealD adj(const vRealD &in) { return in; }
-        friend inline vRealD conj(const vRealD &in){ return in; }
+        friend inline vRealD conjugate(const vRealD &in){ return in; }
         
         friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
 #if defined (AVX1) || defined (SSE4)
@@ -112,11 +112,12 @@ namespace Grid {
 	// all subtypes; may not be a good assumption, but could
 	// add the vector width as a template param for BG/Q for example
 	////////////////////////////////////////////////////////////////////
-	/*
+
 	friend inline void permute(vRealD &y,vRealD b,int perm)
 	{
 	  Gpermute<vRealD>(y,b,perm);
 	}
+	/*
 	friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
 	{
 	  Gmerge<vRealD,RealD >(y,extracted);
@@ -209,48 +210,26 @@ namespace Grid {
 
        friend inline RealD Reduce(const vRealD & in)
        {
-#if defined (SSE4)
-	 // FIXME Hack
-	 const RealD * ptr =(const RealD *)  &in;
-	 RealD ret = 0; 
-	 for(int i=0;i<vRealD::Nsimd();i++){
-	   ret = ret+ptr[i];
-	 }
-	 return ret;
+#ifdef SSE4
+	 vRealD v1;
+	 permute(v1,in,0); // sse 128; paired real double
+	 v1=v1+in;
+	 return RealD(v1.v[0]);
 #endif
-#if defined (AVX1) || defined(AVX2)
-	 typedef union  {
-	   uint64_t l;
-	   double   d;
-	 } my_conv_t;
-	 my_conv_t converter;
-// more reduce_add
-/*
-            __attribute__ ((aligned(32))) double c_[16];
-	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
-            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
-  	             tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp  = 3+2,1+0,1+0,3+2
-                    hadd = _mm256_hadd_pd(tmp,tmp);               // tmp  = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
-                    _mm256_store_pd(c_,hadd);�
-             return c[0]
-*/
-	    __m256d tmp  = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
-            __m256d hadd = _mm256_hadd_pd(in.v,tmp);              // hadd = 1+0,3+2,3+2,1+0
-                    hadd = _mm256_hadd_pd(hadd,hadd);             // hadd = 1+0+3+2...
-		    converter.l = _mm256_extract_epi64((ivec)hadd,0);
-            return converter.d;
+#if defined(AVX1) || defined (AVX2)
+	 vRealD v1,v2;
+	 permute(v1,in,0); // avx 256; quad double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 return v1.v[0];
 #endif
 #ifdef AVX512
             return _mm512_reduce_add_pd(in.v);
-/*
-            __attribute__ ((aligned(32))) double c_[8];
-           _mm512_store_pd(c_,in.v);
-            return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
-*/
 #endif
 #ifdef QPX
 #endif
-        }
+       }
 
         // *=,+=,-= operators
         inline vRealD &operator *=(const vRealD &r) {
@@ -270,7 +249,7 @@ namespace Grid {
         static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
     };
 
-    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
+    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }
     inline void zeroit(vRealD &z){ vzero(z);}
 
     inline vRealD outerProduct(const vRealD &l, const vRealD& r)
diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Grid_vRealF.h
index 559a9eda..754c5121 100644
--- a/lib/simd/Grid_vRealF.h
+++ b/lib/simd/Grid_vRealF.h
@@ -92,13 +92,13 @@ namespace Grid {
         };
         
         ///////////////////////////////////////////////
-        // mult, sub, add, adj,conj, mac functions
+        // mult, sub, add, adj,conjugate, mac functions
         ///////////////////////////////////////////////
         friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
         friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
         friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
         friend inline vRealF adj(const vRealF &in) { return in; }
-        friend inline vRealF conj(const vRealF &in){ return in; }
+        friend inline vRealF conjugate(const vRealF &in){ return in; }
 
         friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
 #if defined (AVX1) || defined (SSE4)
@@ -133,11 +133,12 @@ namespace Grid {
 	// all subtypes; may not be a good assumption, but could
 	// add the vector width as a template param for BG/Q for example
 	////////////////////////////////////////////////////////////////////
-	/*
+
 	friend inline void permute(vRealF &y,vRealF b,int perm)
 	{
 	  Gpermute<vRealF>(y,b,perm);
 	}
+	/*
 	friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
 	{
 	  Gmerge<vRealF,RealF >(y,extracted);
@@ -155,7 +156,6 @@ namespace Grid {
 	  Gextract<vRealF,RealF>(y,extracted);
 	}
 	*/
-
         
         /////////////////////////////////////////////////////
         // Broadcast a value across Nsimd copies.
@@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){
         }
        friend inline RealF Reduce(const vRealF & in)
        {
-#if defined (SSE4)
-	 // FIXME Hack
-	 const RealF * ptr = (const RealF *) &in;
-	 RealF ret = 0; 
-	 for(int i=0;i<vRealF::Nsimd();i++){
-	   ret = ret+ptr[i];
-	 }
-	 return ret;
+#ifdef SSE4
+	 vRealF v1,v2;
+	 permute(v1,in,0); // sse 128; quad single
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 return v1.v[0];
 #endif
-#if defined (AVX1) || defined(AVX2)
-            __attribute__ ((aligned(32))) float c_[16];
-            __m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
-            __m256 hadd = _mm256_hadd_ps(in.v,tmp);
-                   tmp = _mm256_permute2f128_ps(hadd,hadd,0x01);
-                   hadd = _mm256_hadd_ps(tmp,tmp);
-                  _mm256_store_ps(c_,hadd);
-         return (float)c_[0];
-
+#if defined(AVX1) || defined (AVX2)
+	 vRealF v1,v2;
+	 permute(v1,in,0); // avx 256; octo-double
+	 v1=v1+in;
+	 permute(v2,v1,1); 
+	 v1=v1+v2;
+	 permute(v2,v1,2); 
+	 v1=v1+v2;
+	 return v1.v[0];
 #endif
 #ifdef AVX512
             return _mm512_reduce_add_ps(in.v);
-/*
-             __attribute__ ((aligned(64))) float c_[16];
-             _mm512_store_ps(c_,in.v);
-             return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
-                    +c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
-*/
 #endif
 #ifdef QPX
 #endif
@@ -291,7 +284,7 @@ friend inline void vstore(const vRealF &ret, float *a){
     public:
         static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
     };
-    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
+    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; }
     inline void  zeroit(vRealF &z){ vzero(z);}
 
     inline vRealF outerProduct(const vRealF &l, const vRealF& r)
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 14b2ddf0..4576abe3 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -79,7 +79,7 @@ namespace Grid {
     friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
     friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
     friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
-    friend inline Grid_simd adj(const Grid_simd &in){ return conj(in); }
+    friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
         
     //////////////////////////////////
     // Initialise to 1,0,i
@@ -193,7 +193,7 @@ namespace Grid {
     // Conjugate
     ///////////////////////
 								     
-    friend inline Grid_simd  conj(const Grid_simd  &in){
+    friend inline Grid_simd  conjugate(const Grid_simd  &in){
       Grid_simd  ret ; vzero(ret);
       // FIXME add operator
       return ret;
@@ -265,7 +265,7 @@ namespace Grid {
   template<class scalar_type, class vector_type > 
     inline Grid_simd< scalar_type, vector_type>  innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r) 
   {
-    return conj(l)*r; 
+    return conjugate(l)*r; 
   }
 
   template<class scalar_type, class vector_type >
diff --git a/scripts/build-all b/scripts/build-all
index d3a978b7..b97dca19 100755
--- a/scripts/build-all
+++ b/scripts/build-all
@@ -1,6 +1,6 @@
-#!/bin/bash 
+#!/bin/bash  -e
 
-DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi"
+DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
 EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
 BLACK="\033[30m"
 RED="\033[31m"
diff --git a/scripts/configure-all b/scripts/configure-all
index 2f80b60a..f42b1960 100755
--- a/scripts/configure-all
+++ b/scripts/configure-all
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx"
+DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse"
 
 for D in $DIRS
 do
diff --git a/scripts/configure-commands b/scripts/configure-commands
index 89b72294..c7c35e1c 100755
--- a/scripts/configure-commands
+++ b/scripts/configure-commands
@@ -34,6 +34,9 @@ icpc-avx512)
 icpc-mic)
   CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3  -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
   ;;
+clang-sse)
+CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
+  ;;
 clang-avx)
 CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
   ;;
@@ -47,16 +50,16 @@ clang-avx2-openmp)
 CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
   ;;
 clang-avx-openmp-mpi)
-CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 clang-avx2-openmp-mpi)
-CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 ;;
 clang-avx-mpi)
-CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx "  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
 ;;
 clang-avx2-mpi)
-CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx "  LIBS="-lgmp -lmpfr" --enable-comms=mpi
+CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
 ;;
 esac
 echo -e $NORMAL
diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc
index 45778922..1aa4fc43 100644
--- a/tests/Grid_main.cc
+++ b/tests/Grid_main.cc
@@ -140,7 +140,7 @@ int main (int argc, char ** argv)
 
 //     -=,+=,*=,()
 //     add,+,sub,-,mult,mac,*
-//     adj,conj
+//     adj,conjugate
 //     real,imag
 //     transpose,transposeIndex  
 //     trace,traceIndex
@@ -437,7 +437,7 @@ int main (int argc, char ** argv)
         for(int r=0;r<3;r++){
         for(int c=0;c<3;c++){
             diff =shifted1()()(r,c)-shifted2()()(r,c);
-            nn=real(conj(diff)*diff);
+            nn=real(conjugate(diff)*diff);
             if ( nn > 0 )
                 cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                     <<shifted1()()(r,c)<<" "<<shifted2()()(r,c)
@@ -451,7 +451,7 @@ int main (int argc, char ** argv)
         for(int r=0;r<3;r++){
         for(int c=0;c<3;c++){
             diff =shifted3()()(r,c)-shifted2()()(r,c);
-            nn=real(conj(diff)*diff);
+            nn=real(conjugate(diff)*diff);
             if ( nn > 0 )
                 cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                 <<shifted3()()(r,c)<<" "<<shifted2()()(r,c)
@@ -468,7 +468,7 @@ int main (int argc, char ** argv)
         for(int r=0;r<Nc;r++){
         for(int c=0;c<Nc;c++){
             diff =foobar2()()(r,c)-foobar1()()(r,c);
-            nrm = nrm + real(conj(diff)*diff);
+            nrm = nrm + real(conjugate(diff)*diff);
         }}
     }}}}
 	if( Fine.IsBoss() ){
diff --git a/tests/Grid_rng_fixed.cc b/tests/Grid_rng_fixed.cc
new file mode 100644
index 00000000..01e3315b
--- /dev/null
+++ b/tests/Grid_rng_fixed.cc
@@ -0,0 +1,52 @@
+#include <Grid.h>
+#include <parallelIO/GridNerscIO.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+     
+  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+
+  std::vector<int> seeds({1,2,3,4});
+
+  GridSerialRNG            fsRNG;  fsRNG.SeedFixedIntegers(seeds);
+  GridParallelRNG          fpRNG(&Grid);  fpRNG.SeedFixedIntegers(seeds);
+
+  vComplexF tmp; random(fsRNG,tmp);
+  std::cout<<"Random vComplexF (fixed seed)\n"<< tmp<<std::endl;
+  std::cout<<"conjugate(tmp)\n"<< conjugate(tmp)<<std::endl;
+  std::cout<<"conjugate(tmp)*tmp\n"<< conjugate(tmp)*tmp<<std::endl;
+  std::cout<<"innerProduct"<< innerProduct(tmp,tmp)<<std::endl;
+  std::cout<<"Reduce(innerProduct)"<< Reduce(innerProduct(tmp,tmp))<<std::endl;
+
+  SpinMatrix rnd  ; 
+  random(fsRNG,rnd);
+  std::cout<<"Random Spin Matrix (fixed seed)\n"<< rnd<<std::endl;
+
+  SpinVector rv; 
+  random(fsRNG,rv);
+  std::cout<<"Random Spin Vector (fixed seed)\n"<< rv<<std::endl;
+
+  gaussian(fsRNG,rv);
+  std::cout<<"Gaussian Spin Vector (fixed seed)\n"<< rv<<std::endl;
+
+  LatticeColourVector lcv(&Grid);
+
+  LatticeFermion src(&Grid); random(fpRNG,src);
+  std::cout << "src norm : " << norm2(src)<<std::endl;
+  std::cout << "src " << src<<std::endl;
+
+  random(fpRNG,lcv);
+  std::cout<<"Random Lattice Colour Vector (fixed seed)\n"<< lcv<<std::endl;
+
+
+  Grid_finalize();
+}
diff --git a/tests/Grid_simd.cc b/tests/Grid_simd.cc
index 32f4cdfb..db600fe4 100644
--- a/tests/Grid_simd.cc
+++ b/tests/Grid_simd.cc
@@ -26,7 +26,7 @@ public:
 class funcConj {
 public:
   funcConj() {};
-  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);}
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = conjugate(i1);}
   std::string name(void) const { return std::string("Conj"); }
 };
 class funcAdj {
@@ -49,6 +49,34 @@ public:
   template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);}
   std::string name(void) const { return std::string("timesMinusI"); }
 };
+class funcInnerProduct {
+public:
+  funcInnerProduct() {};
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = innerProduct(i1,i2);}
+  std::string name(void) const { return std::string("innerProduct"); }
+};
+
+// FIXME still to test:
+//
+//  innerProduct,
+//  norm2, 
+//  Reduce,
+//
+//  mac,mult,sub,add, vone,vzero,vcomplex_i, =zero,
+//  vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar
+//  unary -,
+//  *= , -=, +=
+//  outerproduct, 
+//  zeroit
+//  permute
+
+class funcReduce {
+public:
+  funcReduce() {};
+template<class reduce,class vec>    void vfunc(reduce &rr,vec &i1,vec &i2)   const { rr = Reduce(i1);}
+template<class reduce,class scal>   void sfunc(reduce &rr,scal &i1,scal &i2) const { rr = i1;}
+  std::string name(void) const { return std::string("Reduce"); }
+};
 
 template<class scal, class vec,class functor > 
 void Tester(const functor &func)
@@ -96,8 +124,59 @@ void Tester(const functor &func)
       ok++;
     }
   }
-  if ( ok==0 ) std::cout << " OK!" <<std::endl;
+  if ( ok==0 ) {
+    std::cout << " OK!" <<std::endl;
+  }
+  assert(ok==0);
+}
 
+
+template<class reduced,class scal, class vec,class functor > 
+void ReductionTester(const functor &func)
+{
+  GridSerialRNG          sRNG;
+  sRNG.SeedRandomDevice();
+  
+  int Nsimd = vec::Nsimd();
+
+  std::vector<scal> input1(Nsimd);
+  std::vector<scal> input2(Nsimd);
+  reduced result(0);
+  reduced reference(0);
+  reduced tmp;
+
+  std::vector<vec,alignedAllocator<vec> > buf(3);
+  vec & v_input1 = buf[0];
+  vec & v_input2 = buf[1];
+
+
+  for(int i=0;i<Nsimd;i++){
+    random(sRNG,input1[i]);
+    random(sRNG,input2[i]);
+  }
+
+  merge<vec,scal>(v_input1,input1);
+  merge<vec,scal>(v_input2,input2);
+
+  func.template vfunc<reduced,vec>(result,v_input1,v_input2);
+
+  for(int i=0;i<Nsimd;i++) {
+    func.template sfunc<reduced,scal>(tmp,input1[i],input2[i]);
+    reference+=tmp;
+  }
+
+  std::cout << " " << func.name()<<std::endl;
+
+  int ok=0;
+  if ( abs(reference-result)/abs(reference) > 1.0e-6 ){ // rounding is possible for reduce order
+    std::cout<< "*****" << std::endl;
+    std::cout<< abs(reference-result) << " " <<reference<< " " << result<<std::endl;
+    ok++;
+  }
+  if ( ok==0 ) {
+    std::cout << " OK!" <<std::endl;
+  }
+  assert(ok==0);
 }
 
 
@@ -127,6 +206,8 @@ int main (int argc, char ** argv)
   Tester<ComplexF,vComplexF>(funcTimes());
   Tester<ComplexF,vComplexF>(funcConj());
   Tester<ComplexF,vComplexF>(funcAdj());
+  Tester<ComplexF,vComplexF>(funcInnerProduct());
+  ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());
 
   std::cout << "==================================="<<  std::endl;
   std::cout << "Testing vComplexD "<<std::endl;
@@ -140,6 +221,8 @@ int main (int argc, char ** argv)
   Tester<ComplexD,vComplexD>(funcTimes());
   Tester<ComplexD,vComplexD>(funcConj());
   Tester<ComplexD,vComplexD>(funcAdj());
+  Tester<ComplexD,vComplexD>(funcInnerProduct());
+  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());
 
   std::cout << "==================================="<<  std::endl;
   std::cout << "Testing vRealF "<<std::endl;
@@ -150,6 +233,9 @@ int main (int argc, char ** argv)
   Tester<RealF,vRealF>(funcMinus());
   Tester<RealF,vRealF>(funcTimes());
   Tester<RealF,vRealF>(funcAdj());
+  Tester<RealF,vRealF>(funcConj());
+  Tester<RealF,vRealF>(funcInnerProduct());
+  ReductionTester<RealF,RealF,vRealF>(funcReduce());
 
   std::cout << "==================================="<<  std::endl;
   std::cout << "Testing vRealD "<<std::endl;
@@ -159,6 +245,9 @@ int main (int argc, char ** argv)
   Tester<RealD,vRealD>(funcMinus());
   Tester<RealD,vRealD>(funcTimes());
   Tester<RealD,vRealD>(funcAdj());
+  Tester<RealD,vRealD>(funcConj());
+  Tester<RealD,vRealD>(funcInnerProduct());
+  ReductionTester<RealD,RealD,vRealD>(funcReduce());
 
   Grid_finalize();
 }
diff --git a/tests/Grid_stencil.cc b/tests/Grid_stencil.cc
index 1fdb7265..9770ed7c 100644
--- a/tests/Grid_stencil.cc
+++ b/tests/Grid_stencil.cc
@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
 	  for(int r=0;r<3;r++){
 	  for(int c=0;c<3;c++){
             diff =check()()(r,c)-bar()()(r,c);
-            double nn=real(conj(diff)*diff);
+            double nn=real(conjugate(diff)*diff);
             if ( nn > 0){
 	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
 		     coor[0],coor[1],coor[2],coor[3],r,c,
@@ -103,8 +103,8 @@ int main (int argc, char ** argv)
 		     real(bar()()(r,c))
 		     );
 	    }
-	    snrmC=snrmC+real(conj(check()()(r,c))*check()()(r,c));
-	    snrmB=snrmB+real(conj(bar()()(r,c))*bar()()(r,c));
+	    snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c));
+	    snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c));
 	    snrm=snrm+nn;
 	  }}
 	 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 82d1b3be..80f3a34c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 #
 # Test code
 #
-bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez
+bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma  Grid_simd Grid_rng Grid_remez Grid_rng_fixed
 
 Grid_main_SOURCES = Grid_main.cc
 Grid_main_LDADD = -lGrid
@@ -13,6 +13,9 @@ Grid_main_LDADD = -lGrid
 Grid_rng_SOURCES = Grid_rng.cc
 Grid_rng_LDADD = -lGrid
 
+Grid_rng_fixed_SOURCES = Grid_rng_fixed.cc
+Grid_rng_fixed_LDADD = -lGrid
+
 Grid_remez_SOURCES = Grid_remez.cc
 Grid_remez_LDADD = -lGrid
 
diff --git a/tests/test_Grid_jacobi.cc b/tests/test_Grid_jacobi.cc
index 95e808f1..0cf15a6f 100644
--- a/tests/test_Grid_jacobi.cc
+++ b/tests/test_Grid_jacobi.cc
@@ -186,7 +186,7 @@ int main (int argc, char ** argv)
 	  for(int r=0;r<3;r++){
 	  for(int c=0;c<3;c++){
             diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
-            double nn=real(conj(diff)*diff);
+            double nn=real(conjugate(diff)*diff);
             if ( nn > 0 ){
 	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
 		     coor[0],coor[1],coor[2],coor[3],r,c,