Merge branch 'release/v0.6.0'

Merge branch 'hotfix/v0.5.1'
Double precision compile fix
2025-06-15 14:27:06 +01:00 · 2016-11-09 12:43:14 +00:00 · 2016-07-01 16:33:59 +01:00
47 changed files with 3138 additions and 2057 deletions
--- a/.gitignore
+++ b/.gitignore
@ -47,9 +47,7 @@ Config.h.in
 config.log
 config.status
 .deps
-Make.inc
+*.inc
 eigen.inc
 Eigen.inc
 # http://www.gnu.org/software/autoconf #
 ########################################
--- a/Makefile.am
+++ b/Makefile.am
@ -1,12 +1,10 @@
 # additional include paths necessary to compile the C++ library
 SUBDIRS = lib benchmarks tests
-include $(top_srcdir)/doxygen.inc
+.PHONY: tests
 tests: all
 	$(MAKE) -C tests tests
 .PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
--- a/README.md
+++ b/README.md
@ -116,15 +116,13 @@ If you want to build all the tests at once just use `make tests`.
 - `--with-fftw=<path>`: look for FFTW in the UNIX prefix `<path>`
 - `--enable-lapack[=<path>]`: enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
 - `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
- `--enable-numa`: enable NUMA first touch optimisation
+- `--enable-numa`: ???
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
 - `--enable-precision={single|double}`: set the default precision (default: `double`).
 - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
 - `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
 ### Possible communication interfaces
@ -138,7 +136,7 @@ The following options can be use with the `--enable-comms=` option to target dif
 | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem `       | Cray SHMEM communications                                     |
-For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names).
 ### Possible SIMD types
@ -167,7 +165,6 @@ Alternatively, some CPU codenames can be directly used:
 - We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.
 ### Build setup for Intel Knights Landing platform
--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@ -1,183 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_dwf.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::cout << GridLogMessage << "Making Vec5d innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  std::cout << GridLogMessage << "Seeded"<<std::endl;
  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
  std::cout << GridLogMessage << "made random gauge fields"<<std::endl;
  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;
  if (1)
  {
    const int ncall=1000;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    GridParallelRNG RNG5(FGrid);
    LatticeFermion src(FGrid); random(RNG5,src);
    LatticeFermion result(FGrid);
    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
    double t0,t1;
    LatticeFermion r_eo(FGrid);
    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    pickCheckerboard(Even,src_e,src);
    pickCheckerboard(Odd,src_o,src);
    setCheckerboard(r_eo,src_o);
    setCheckerboard(r_eo,src_e);
    r_e = zero;
    r_o = zero;
 #define BENCH_DW(A,in,out)			\
    Dw.CayleyZeroCounters();			\
    Dw. A (in,out);				\
    FGrid->Barrier();				\
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      Dw. A (in,out);				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;
 #define BENCH_DW_MEO(A,in,out)			\
    Dw.CayleyZeroCounters();			\
    Dw. A (in,out,0);				\
    FGrid->Barrier();				\
    t0=usecond();				\
    for(int i=0;i<ncall;i++){			\
      Dw. A (in,out,0);				\
    }						\
    t1=usecond();				\
    FGrid->Barrier();				\
    Dw.CayleyReport();					\
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;
    BENCH_DW_MEO(Dhop    ,src,result);
    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
    BENCH_DW(Meooe   ,src_o,r_e);
    BENCH_DW(Mooee   ,src_o,r_o);
    BENCH_DW(MooeeInv,src_o,r_o);
  }
  if (1)
  {
    const int ncall=1000;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    GridParallelRNG RNG5(sFGrid);
    LatticeFermion src(sFGrid); random(RNG5,src);
    LatticeFermion sref(sFGrid);
    LatticeFermion result(sFGrid);
    std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
    DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
    std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
    FGrid->Barrier();
    double t0,t1;
    LatticeFermion r_eo(sFGrid);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);
    pickCheckerboard(Even,src_e,src);
    pickCheckerboard(Odd,src_o,src);
    setCheckerboard(r_eo,src_o);
    setCheckerboard(r_eo,src_e);
    r_e = zero;
    r_o = zero;
    BENCH_DW_MEO(Dhop    ,src,result);
    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
    BENCH_DW(Meooe   ,src_o,r_e);
    BENCH_DW(Mooee   ,src_o,r_o);
    BENCH_DW(MooeeInv,src_o,r_o);
  }
  Grid_finalize();
 }
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@ -1,11 +1 @@
 include Make.inc
 simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o
 EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a
 libsimple_su3_test_a_SOURCES = simple_su3_test.cc
 libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc
 libsimple_simd_test_a_SOURCES = simple_simd_test.cc
--- a/benchmarks/simple_simd_test.cc
+++ b/benchmarks/simple_simd_test.cc
@ -1,11 +0,0 @@
 #include <Grid/Grid.h>
 Grid::vRealD add(const Grid::vRealD &x, const Grid::vRealD &y)
 {
 	return x+y;
 }
 Grid::vRealD sub(const Grid::vRealD &x, const Grid::vRealD &y)
 {
 	return x-y;
 }
--- a/benchmarks/simple_su3_expr.cc
+++ b/benchmarks/simple_su3_expr.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 using namespace std;
 using namespace Grid;
--- a/benchmarks/simple_su3_test.cc
+++ b/benchmarks/simple_su3_test.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>
 using namespace std;
 using namespace Grid;
--- a/configure.ac
+++ b/configure.ac
@ -149,15 +149,9 @@ CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY
 ############### SIMD instruction selection
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=<code>],
 	            [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
 AC_ARG_ENABLE([gen-simd-width],
            [AS_HELP_STRING([--enable-gen-simd-width=size],
            [size (in bytes) of the generic SIMD vectors (default: 32)])],
            [ac_gen_simd_width=$enable_gen_simd_width],
            [ac_gen_simd_width=32])
 case ${ax_cv_cxx_compiler_vendor} in
  clang|gnu)
    case ${ac_SIMD} in
@ -186,10 +180,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-march=knl';;
      GEN)
-        AC_DEFINE([GEN],[1],[generic vector code])
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
                           [generic SIMD vector width (in bytes)])
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
        SIMD_FLAGS='';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
@ -206,8 +197,8 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
        SIMD_FLAGS='-mavx -xavx';;
      AVXFMA)
-        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
+        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
-        SIMD_FLAGS='-mavx -fma';;
+        SIMD_FLAGS='-mavx -mfma';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
@ -221,10 +212,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
        SIMD_FLAGS='-xmic-avx512';;
      GEN)
-        AC_DEFINE([GEN],[1],[generic vector code])
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
                           [generic SIMD vector width (in bytes)])
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
@ -290,7 +278,7 @@ esac
 case ${ac_COMMS} in
    *-auto)
        LX_FIND_MPI
-        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi
+        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
        AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
        AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
        AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
@ -354,17 +342,12 @@ esac
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
 ############### Doxygen
-DX_DOXYGEN_FEATURE([OFF])
+AC_PROG_DOXYGEN
-DX_DOT_FEATURE([OFF])
+
-DX_HTML_FEATURE([ON])
+if test -n "$DOXYGEN"
-DX_CHM_FEATURE([OFF])
+then
-DX_CHI_FEATURE([OFF])
+AC_CONFIG_FILES([docs/doxy.cfg])
-DX_MAN_FEATURE([OFF])
+fi
 DX_RTF_FEATURE([OFF])
 DX_XML_FEATURE([OFF])
 DX_PDF_FEATURE([OFF])
 DX_PS_FEATURE([OFF])
 DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
 ############### Ouput
 cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
@ -399,7 +382,7 @@ os (target)                 : $target_os
 compiler vendor             : ${ax_cv_cxx_compiler_vendor}
 compiler version            : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
-SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
+SIMD                        : ${ac_SIMD}
 Threading                   : ${ac_openmp} 
 Communications type         : ${comms_type}
 Default precision           : ${ac_PRECISION}
@ -407,7 +390,8 @@ RNG choice                  : ${ac_RNG}
 GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 LAPACK                      : ${ac_LAPACK}
 FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
-build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
+build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 graphs and diagrams         : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
 CXXFLAGS:
 `echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'`
--- a/docs/doxy.cfg
+++ b/docs/doxy.cfg
--- a/docs/doxy.cfg.in
+++ b/docs/doxy.cfg.in
--- a/doxygen.inc
+++ b/doxygen.inc
@ -1,184 +0,0 @@
 # Copyright (C) 2004 Oren Ben-Kiki
 # This file is distributed under the same terms as the Automake macro files.
 # Generate automatic documentation using Doxygen. Goals and variables values
 # are controlled by the various DX_COND_??? conditionals set by autoconf.
 #
 # The provided goals are:
 # doxygen-doc: Generate all doxygen documentation.
 # doxygen-run: Run doxygen, which will generate some of the documentation
 #              (HTML, CHM, CHI, MAN, RTF, XML) but will not do the post
 #              processing required for the rest of it (PS, PDF, and some MAN).
 # doxygen-man: Rename some doxygen generated man pages.
 # doxygen-ps: Generate doxygen PostScript documentation.
 # doxygen-pdf: Generate doxygen PDF documentation.
 #
 # Note that by default these are not integrated into the automake goals. If
 # doxygen is used to generate man pages, you can achieve this integration by
 # setting man3_MANS to the list of man pages generated and then adding the
 # dependency:
 #
 #   $(man3_MANS): doxygen-doc
 #
 # This will cause make to run doxygen and generate all the documentation.
 #
 # The following variable is intended for use in Makefile.am:
 #
 # DX_CLEANFILES = everything to clean.
 #
 # This is usually added to MOSTLYCLEANFILES.
 ## --------------------------------- ##
 ## Format-independent Doxygen rules. ##
 ## --------------------------------- ##
 if DX_COND_doc
 ## ------------------------------- ##
 ## Rules specific for HTML output. ##
 ## ------------------------------- ##
 if DX_COND_html
 DX_CLEAN_HTML = @DX_DOCDIR@/html
 endif DX_COND_html
 ## ------------------------------ ##
 ## Rules specific for CHM output. ##
 ## ------------------------------ ##
 if DX_COND_chm
 DX_CLEAN_CHM = @DX_DOCDIR@/chm
 if DX_COND_chi
 DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi
 endif DX_COND_chi
 endif DX_COND_chm
 ## ------------------------------ ##
 ## Rules specific for MAN output. ##
 ## ------------------------------ ##
 if DX_COND_man
 DX_CLEAN_MAN = @DX_DOCDIR@/man
 endif DX_COND_man
 ## ------------------------------ ##
 ## Rules specific for RTF output. ##
 ## ------------------------------ ##
 if DX_COND_rtf
 DX_CLEAN_RTF = @DX_DOCDIR@/rtf
 endif DX_COND_rtf
 ## ------------------------------ ##
 ## Rules specific for XML output. ##
 ## ------------------------------ ##
 if DX_COND_xml
 DX_CLEAN_XML = @DX_DOCDIR@/xml
 endif DX_COND_xml
 ## ----------------------------- ##
 ## Rules specific for PS output. ##
 ## ----------------------------- ##
 if DX_COND_ps
 DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps
 DX_PS_GOAL = doxygen-ps
 doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps
@DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag
 	cd @DX_DOCDIR@/latex; \
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
 	$(DX_LATEX) refman.tex; \
 	$(MAKEINDEX_PATH) refman.idx; \
 	$(DX_LATEX) refman.tex; \
 	countdown=5; \
 	while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
 	                  refman.log > /dev/null 2>&1 \
 	   && test $$countdown -gt 0; do \
 	    $(DX_LATEX) refman.tex; \
 	    countdown=`expr $$countdown - 1`; \
 	done; \
 	$(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi
 endif DX_COND_ps
 ## ------------------------------ ##
 ## Rules specific for PDF output. ##
 ## ------------------------------ ##
 if DX_COND_pdf
 DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf
 DX_PDF_GOAL = doxygen-pdf
 doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf
@DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag
 	cd @DX_DOCDIR@/latex; \
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
 	$(DX_PDFLATEX) refman.tex; \
 	$(DX_MAKEINDEX) refman.idx; \
 	$(DX_PDFLATEX) refman.tex; \
 	countdown=5; \
 	while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
 	                  refman.log > /dev/null 2>&1 \
 	   && test $$countdown -gt 0; do \
 	    $(DX_PDFLATEX) refman.tex; \
 	    countdown=`expr $$countdown - 1`; \
 	done; \
 	mv refman.pdf ../@PACKAGE@.pdf
 endif DX_COND_pdf
 ## ------------------------------------------------- ##
 ## Rules specific for LaTeX (shared for PS and PDF). ##
 ## ------------------------------------------------- ##
 if DX_COND_latex
 DX_CLEAN_LATEX = @DX_DOCDIR@/latex
 endif DX_COND_latex
 .INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
 doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag
 doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
 	rm -rf @DX_DOCDIR@
 	$(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
 DX_CLEANFILES = \
    @DX_DOCDIR@/@PACKAGE@.tag \
    -r \
    $(DX_CLEAN_HTML) \
    $(DX_CLEAN_CHM) \
    $(DX_CLEAN_CHI) \
    $(DX_CLEAN_MAN) \
    $(DX_CLEAN_RTF) \
    $(DX_CLEAN_XML) \
    $(DX_CLEAN_PS) \
    $(DX_CLEAN_PDF) \
    $(DX_CLEAN_LATEX)
 endif DX_COND_doc
--- a/lib/Bitwise.cc
+++ b/lib/Bitwise.cc
@ -1,49 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/Bitwise.cc
 Copyright (C) 2016
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <iostream>
 #include <Bitwise.h>
 #include <bitset>
 #include <climits>
 namespace Grid {
 void show_binaryrep(const unsigned char* a, size_t size) {
  const unsigned char* beg = a;
  const unsigned char* end = a + size;
  unsigned int ctr = 0;
  while (beg != end) {
    std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
    ctr++;
    if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
  }
  std::cout << '\n';
 }
 } // namespace 
--- a/lib/Bitwise.h
+++ b/lib/Bitwise.h
@ -1,76 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/Bitwise.h
 Copyright (C) 2016
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_BITWISE_H
 #define GRID_BITWISE_H
 #include <cassert>
 #include <cfloat>
 #include <bitset>
 #include <climits>
 #include <Config.h>
 #ifdef GRID_DEFAULT_PRECISION_SINGLE
 #define GRID_REAL_BYTES 4
 #endif
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define GRID_REAL_BYTES 8
 #endif
 namespace Grid {
 void show_binaryrep(const unsigned char* a, size_t size);
 template <typename T>
 void show_binaryrep(const T& a) {
  const char* beg = reinterpret_cast<const char*>(&a);
  const char* end = beg + sizeof(a);
  unsigned int ctr = 0;
  while (beg != end) {
  	std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
  	ctr++;
  	if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
  }
  std::cout << '\n';
 }
 template <typename T>
 void bitwise_xor(T& l, T& r, unsigned char* xors) {
  assert(sizeof(l) == sizeof(r));
  unsigned char* org = reinterpret_cast<unsigned char*>(&l);
  unsigned char* cur = reinterpret_cast<unsigned char*>(&r);
  int words = sizeof(l) / sizeof(*org);
  unsigned char result = 0;
  for (int w = 0; w < words; w++) xors[w] = (org[w] ^ cur[w]);
 }
 }; // namespace 
 #endif
--- a/lib/FFT.h
+++ b/lib/FFT.h
@ -244,11 +244,8 @@ namespace Grid {
            pokeLocalSite(s,pgbuf,cbuf);
          }
        }
        if (p != processors[dim] - 1)
        {
        result = Cshift(result,dim,L);
      }
      }
      // Loop over orthog coords
      int NN=pencil_g.lSites();
@ -290,10 +287,10 @@ namespace Grid {
          cgbuf = clbuf;
          cgbuf[dim] = clbuf[dim]+L*pc;
          peekLocalSite(s,pgbuf,cgbuf);
          s = s * div;
          pokeLocalSite(s,result,clbuf);
        }
      }
      result = result*div;
      // destroying plan
      FFTW<scalar>::fftw_destroy_plan(p);
--- a/lib/Grid.h
+++ b/lib/Grid.h
@ -62,7 +62,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/serialisation/Serialisation.h>
 #include "Config.h"
 #include <Grid/Timer.h>
 #include <Grid/Bitwise.h>
 #include <Grid/PerfCount.h>
 #include <Grid/Log.h>
 #include <Grid/AlignedAllocator.h>
--- a/lib/Log.cc
+++ b/lib/Log.cc
@ -100,7 +100,7 @@ void Grid_quiesce_nodes(void) {
  me = shmem_my_pe();
 #endif
  if (me) {
-    std::cout.setstate(std::ios::badbit);// mute all nodes except 0
+    std::cout.setstate(std::ios::badbit);
  }
 }
--- a/lib/Threads.h
+++ b/lib/Threads.h
@ -38,21 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_OMP
 #include <omp.h>
 #ifdef GRID_NUMA
-  #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
+#define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
-  #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
+#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
-  #else
+#else
-  #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(runtime)")
+#define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(runtime)")
-  #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
+#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
 #endif
 #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
 #define PARALLEL_REGION       _Pragma("omp parallel")
 #define PARALLEL_FOR_LOOP_STATIC _Pragma("omp parallel for schedule(static)")
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_REGION
 #define PARALLEL_FOR_LOOP_STATIC
 #endif
 namespace Grid {
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@ -9,7 +9,6 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -34,21 +33,6 @@ directory
 namespace Grid {
 struct CG_state {
  bool do_repro;
  std::vector<RealD> residuals;
  CG_state() {reset();}
  void reset(){
    do_repro = false;
    residuals.clear();
  }
 };
 enum CGexec_mode{ Default, ReproducibilityTest };
 /////////////////////////////////////////////////////////////
 // Base classes for iterative processes based on operators
 // single input vec, single output vec.
@ -61,30 +45,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
-
+  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
-  // Reproducibility controls
+      : Tolerance(tol),
-  bool ReproTest;
+        MaxIterations(maxit),
-  CG_state CGState; //to check reproducibility by repeating the CG
+        ErrorOnNoConverge(err_on_no_conv){};
  ReproducibilityState<typename Field::vector_object> ReprTest; // for the inner proucts
  // Constructor
  ConjugateGradient(RealD tol, Integer maxit, CGexec_mode Mode = Default)
    : Tolerance(tol),MaxIterations(maxit){
    switch(Mode)
    {
      case Default  : 
      ErrorOnNoConverge = true;
      ReproTest = false;
      case ReproducibilityTest :
      ErrorOnNoConverge = false;
      ReproTest = true; 
    }
  };
  void set_reproducibility_interval(unsigned int interval){
    ReprTest.interval = interval;
  }
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
                  Field &psi) {
@ -96,37 +60,34 @@ class ConjugateGradient : public OperatorFunction<Field> {
    Field p(src);
    Field mmp(src);
    Field r(src);
    Field psi_start(psi);// save for the repro test
    if (CGState.do_repro && ReproTest)
        std::cout << GridLogMessage << "Starting reproducibility test, full check every "
                  << ReprTest.interval << " calls" << std::endl;
    if(!ReprTest.do_check)
        ReprTest.reset();
    ReprTest.enable_reprocheck=ReproTest;
    // Initial residual computation & set up
-    RealD guess = norm2(psi, ReprTest);
+    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);
-    Linop.HermOpAndNorm(psi, mmp, d, b);// eventually split this for the norm check
+    
    Linop.HermOpAndNorm(psi, mmp, d, b);
    r = src - mmp;
    p = r;
-    a = norm2(p, ReprTest);
+    a = norm2(p);
    cp = a;
-    ssq = norm2(src, ReprTest);
+    ssq = norm2(src);
-    std::cout << GridLogIterative << "ConjugateGradient: guess " << guess << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
-    std::cout << GridLogIterative << "ConjugateGradient:   src " << ssq << std::endl;
+              << "ConjugateGradient: guess " << guess << std::endl;
-    std::cout << GridLogIterative << "ConjugateGradient:    mp " << d << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
-    std::cout << GridLogIterative << "ConjugateGradient:   mmp " << b << std::endl;
+              << "ConjugateGradient:   src " << ssq << std::endl;
-    std::cout << GridLogIterative << "ConjugateGradient:  cp,r " << cp << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
-    std::cout << GridLogIterative << "ConjugateGradient:     p " << a << std::endl;
+              << "ConjugateGradient:    mp " << d << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:   mmp " << b << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:  cp,r " << cp << std::endl;
    std::cout << GridLogIterative << std::setprecision(4)
              << "ConjugateGradient:     p " << a << std::endl;
    RealD rsq = Tolerance * Tolerance * ssq;
@ -146,10 +107,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
-      c = cp;// old residual
+      c = cp;
      MatrixTimer.Start();
-      Linop.HermOpAndNorm(p, mmp, d, qq);// mmp = Ap, d=pAp
+      Linop.HermOpAndNorm(p, mmp, d, qq);
      MatrixTimer.Stop();
      LinalgTimer.Start();
@ -157,31 +118,14 @@ class ConjugateGradient : public OperatorFunction<Field> {
      //  ComplexD dck  = innerProduct(p,mmp);
      a = c / d;
-      b_pred = a * (a * qq - d) / c;// a check
+      b_pred = a * (a * qq - d) / c;
-
+      cp = axpy_norm(r, -a, mmp, r);
      axpy(r, -a, mmp, r);// new residual r = r_old - a * Ap
      cp = norm2(r, ReprTest); // bookkeeping this norm
      if (ReproTest && !CGState.do_repro) {
        CGState.residuals.push_back(cp);  // save residuals state
                std::cout << GridLogIterative << "ReproTest: Saving state" << std::endl;
        }
      if (ReproTest && CGState.do_repro){
        // check that the residual agrees with the previous run
        std::cout << GridLogIterative << "ReproTest: Checking state k=" << k << std::endl;
        if (cp != CGState.residuals[k-1]){
                std::cout << GridLogMessage << "Failing reproducibility test";
                std::cout << GridLogMessage << " at k=" << k << std::endl;
                std::cout << GridLogMessage << "saved residual = " << CGState.residuals[k-1] 
                        << " cp = " << cp << std::endl;
                exit(1);  // exit after the first failure
        }
      }
      b = cp / c;
      // Fuse these loops ; should be really easy
-      psi = a * p + psi; // update solution
+      psi = a * p + psi;
-      p = p * b + r;  // update search direction
+      p = p * b + r;
      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
@ -212,22 +156,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
        if (! (CGState.do_repro && ReproTest)){
                CGState.do_repro = true;
                ReprTest.do_check = true;
                ReprTest.reset_counter();
                this->operator()(Linop, src, psi_start);// run the repro test
                if (ReprTest.success)
                	std::cout << GridLogMessage << "Reproducibility test passed" << std::endl;
                else{
                	std::cout << GridLogMessage << "Reproducibility test failed" << std::endl;
                	exit(1);
                }
        }
        // Clear state
        CGState.reset();
        ReprTest.reset();
        return;
      }
    }
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -59,7 +59,7 @@ public:
    int Nstop;   // Number of evecs checked for convergence
    int Nk;      // Number of converged sought
-    int Np;      // Np -- Number of spare vecs in krylov space
+    int Np;      // Np -- Number of spare vecs in kryloc space
    int Nm;      // Nm -- total number of vectors
    RealD eresid;
@ -1080,10 +1080,10 @@ say con = 2
 **/
 template<class T>
-static void Lock(DenseMatrix<T> &H, 	// Hess mtx	
+static void Lock(DenseMatrix<T> &H, 	///Hess mtx	
-		 DenseMatrix<T> &Q, 	// Lock Transform
+		 DenseMatrix<T> &Q, 	///Lock Transform
-		 T val, 		// value to be locked
+		 T val, 		///value to be locked
-		 int con, 	// number already locked
+		 int con, 	///number already locked
 		 RealD small,
 		 int dfg,
 		 bool herm)
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@ -65,7 +65,6 @@ const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return
 const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)     { return _processors; };
 int                      CartesianCommunicator::ProcessorCount(void)    { return _Nprocessors; };
 ////////////////////////////////////////////////////////////////////////////////
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@ -68,8 +68,6 @@ class CartesianCommunicator {
  static MPI_Comm communicator_world;
         MPI_Comm communicator;
  typedef MPI_Request CommsRequest_t;
  static char name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
  static int length; 
 #else 
  typedef int CommsRequest_t;
 #endif
@ -151,7 +149,6 @@ class CartesianCommunicator {
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;
  void        		 PrintRankInfo(void)     ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@ -35,8 +35,6 @@ namespace Grid {
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
 char CartesianCommunicator::name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
 int CartesianCommunicator::length; 
 // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
@ -46,8 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
    MPI_Init(argc,argv);
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  MPI_Get_processor_name(name, &length);
  ShmInitGeneric();
 }
@ -210,10 +206,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
  assert(ierr==0);
 }
 void CartesianCommunicator::PrintRankInfo(){
    std::cout << "Grid: Rank "<< _processor << "  -  Physical node name: " << name << std::endl;
 }
 }// end of namespace
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@ -576,10 +576,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
  assert(ierr==0);
 }
  void CartesianCommunicator::PrintRankInfo(){
    std::cout << "Grid: Rank "<< _processor << "  -  Physical node name: " << name << std::endl;
  }
 }
--- a/lib/communicator/Communicator_mpi3_leader.cc
+++ b/lib/communicator/Communicator_mpi3_leader.cc
@ -869,9 +869,6 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
 }
  void CartesianCommunicator::PrintRankInfo(){
    std::cout << "Grid: Rank "<< _processor << "  -  Physical node name: " << name << std::endl;
  }
 };
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@ -37,11 +37,6 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
  ShmInitGeneric();
 }
 void CartesianCommunicator::PrintRankInfo(){
    std::cout << GridLogMessage << "No Rank Info available" << std::endl;
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _processors = processors;
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@ -333,9 +333,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
  }
 }
  void CartesianCommunicator::PrintRankInfo(){
    std::cout << GridLogMessage << "SHMEM: Rank Info not implemented yet" << std::endl;
  }
 }
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@ -31,87 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define GRID_LATTICE_REDUCTION_H
 namespace Grid {
 template <class T>
 class ReproducibilityState {
 public:
  typedef typename T::vector_type vector_type;
  typedef std::vector<vector_type, alignedAllocator<vector_type> >  sum_type;
  unsigned int  n_call;
  bool          do_check;
  bool          enable_reprocheck;
  bool          success;
  unsigned int  interval; 
  std::vector<sum_type> th_states;
  void reset_counter() { n_call = 0; }
  void reset() {
    th_states.clear();
    do_check          = false;
    enable_reprocheck = false;
    success           = true;
    n_call            = 0;
  };
  ReproducibilityState():interval(1) { 
    reset(); 
  }
  void check(GridBase* grid, sum_type &sumarray){
    ///////////////////////  Reproducibility section, not threaded on purpouse
    if (enable_reprocheck) {
      if (do_check && (n_call % interval) == 0) {
        for (int thread = 0; thread < sumarray.size(); thread++) {
          int words = sizeof(sumarray[thread])/sizeof(unsigned char);
          unsigned char xors[words];
          bitwise_xor(sumarray[thread], th_states[n_call][thread],xors);
          // OR all words
          unsigned char res = 0;
          for (int w = 0; w < words; w++) res = res | xors[w];
          Grid_unquiesce_nodes();
          int rank = 0;
          while (rank < grid->_Nprocessors){
            if (rank == grid->ThisRank() ){
              if ( res ) {
                std::cout << "Reproducibility failure report" << std::endl;
                grid->PrintRankInfo();
                std::cout << "Call: "<< n_call << " Thread: " << thread << std::endl;
                std::cout << "Size of states: " << th_states.size() << std::endl;
                std::cout << std::setprecision(GRID_REAL_DIGITS+1) << std::scientific;
                std::cout << "Saved partial sum  : " << th_states[n_call][thread] << std::endl;
                std::cout << "Current partial sum: " << sumarray[thread] << std::endl;
                std::cout << "Saved state "  << std::endl; show_binaryrep(th_states[n_call][thread]);
                std::cout << "Current state" << std::endl; show_binaryrep(sumarray[thread]);
                std::cout << "XOR result"    << std::endl; show_binaryrep(xors, words);
              		//std::cout << std::defaultfloat;  //not supported by some compilers
                std::cout << std::setprecision(6);
                success = false; 
              }
            }
            rank++;
            grid->Barrier();
          }
          Grid_quiesce_nodes();                         
        }
      } else if (!do_check)
      {
        std::cout << GridLogDebug << "Saving thread state for inner product. Call n. " 
        << n_call << std::endl;
        th_states.resize(n_call+1);
        th_states[n_call].resize(grid->SumArraySize());
          th_states[n_call] = sumarray;  // save threads state
          //n_call++;
        }
        n_call++;
      }
    }
 };
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
 #endif     
@ -120,25 +39,12 @@ class ReproducibilityState {
    // Deterministic Reduction operations
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-        ReproducibilityState<vobj> repr;
+    ComplexD nrm = innerProduct(arg,arg);
        ComplexD nrm = innerProduct(arg, arg, repr);
        return std::real(nrm); 
    }
    template <class vobj>
    inline RealD norm2(const Lattice<vobj> &arg, ReproducibilityState<vobj>& rep) {
      ComplexD nrm = innerProduct(arg, arg, rep);
    return std::real(nrm); 
  }
    template<class vobj>
-    inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right){
+    inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
      ReproducibilityState<vobj> repr;
      return innerProduct(left, right, repr);
    } 
    template<class vobj>
    inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right, ReproducibilityState<vobj>& repr) 
    {
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
@ -151,22 +57,18 @@ class ReproducibilityState {
 	sumarray[i]=zero;
      }
-      PARALLEL_FOR_LOOP_STATIC //request statically scheduled threads for reproducibility
+PARALLEL_FOR_LOOP
      for(int thr=0;thr<grid->SumArraySize();thr++){
 	int nwork, mywork, myoff;
 	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
-        decltype(innerProduct(left._odata[0],right._odata[0])) vnrm = zero; // private to thread; sub summation
+	decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
        for(int ss=myoff;ss<mywork+myoff; ss++){
 	  vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
 	}
 	sumarray[thr]=TensorRemove(vnrm) ;
      }
      /////////// Reproducibility
      repr.check(grid, sumarray);
      ///////////////////////////
      vector_type vvnrm; vvnrm=zero;  // sum across threads
      for(int i=0;i<grid->SumArraySize();i++){
 	vvnrm = vvnrm+sumarray[i];
@ -212,7 +114,7 @@ class ReproducibilityState {
 	sumarray[i]=zero;
      }
-      PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
      for(int thr=0;thr<grid->SumArraySize();thr++){
 	int nwork, mywork, myoff;
 	GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
@ -244,7 +146,7 @@ class ReproducibilityState {
 template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
-    {
+{
  typedef typename vobj::scalar_object sobj;
  GridBase  *grid = Data._grid;
  assert(grid!=NULL);
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@ -195,7 +195,6 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@ -204,20 +203,6 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 // Ls vectorised 
 typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
 typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
 typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
 typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
 typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
 typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
 typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
 typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
 typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@ -269,7 +254,6 @@ typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -62,50 +62,6 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
  }
 }
 template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 {
  this->Report();
  std::vector<int> latt = GridDefaultLatt();          
  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP     = this->_FourDimGrid->_Nprocessors;
  if ( M5Dcalls > 0 ) {
    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
    // Flops = 6.0*(Nc*Ns) *Ls*vol
    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
  if ( MooeeInvCalls > 0 ) {
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
    // Flops = 9*12*Ls*vol/2
    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
 }
 template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
 {
  this->ZeroCounters();
  M5Dflops=0;
  M5Dcalls=0;
  M5Dtime=0;
  MooeeInvFlops=0;
  MooeeInvCalls=0;
  MooeeInvTime=0;
 }
 template<class Impl>  
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@ -121,18 +121,6 @@ namespace Grid {
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
     void CayleyReport(void);
     void CayleyZeroCounters(void);
     double M5Dflops;
     double M5Dcalls;
     double M5Dtime;
     double MooeeInvFlops;
     double MooeeInvCalls;
     double MooeeInvTime;
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@ -51,9 +51,6 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  M5Dcalls++;
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    for(int s=0;s<Ls;s++){
@ -79,7 +76,6 @@ PARALLEL_FOR_LOOP
      }
    }
  }
  M5Dtime+=usecond();
 }
 template<class Impl>  
@ -95,9 +91,6 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  // Flops = 6.0*(Nc*Ns) *Ls*vol
  M5Dcalls++;
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
@ -123,7 +116,6 @@ PARALLEL_FOR_LOOP
      }
    }
  }
  M5Dtime+=usecond();
 }
 template<class Impl>
@ -134,14 +126,10 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &
  chi.checkerboard=psi.checkerboard;
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
    // Apply (L^{\prime})^{-1}
    chi[ss]=psi[ss]; // chi[0]=psi[0]
    for(int s=1;s<Ls;s++){
@ -167,9 +155,6 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
    }
  }
  MooeeInvTime+=usecond();
 }
 template<class Impl>
@ -181,8 +166,6 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
  assert(psi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@ -214,9 +197,6 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
  MooeeInvTime+=usecond();
 }
 #ifdef CAYLEY_DPERP_CACHE
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
-  const int nsimd= Simd::Nsimd();
+  int nsimd= Simd::Nsimd();
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
@ -86,15 +86,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
    d_p[ss] = diag[s];
  }}
  M5Dcalls++;
  M5Dtime-=usecond();
  assert(Nc==3);
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-#if 0
+
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
@ -111,113 +105,16 @@ PARALLEL_FOR_LOOP
      if ( vp<=v ) rotate(hp,hp,1);
      if ( vm>=v ) rotate(hm,hm,nsimd-1);
-	hp=0.5*hp;
+      hp=hp*0.5;
-        hm=0.5*hm;
+      hm=hm*0.5;
      spRecon5m(fp,hp);
      spRecon5p(fm,hm);
-	chi[ss+v] = d[v]*phi[ss+v];
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
 	chi[ss+v] = chi[ss+v]     +u[v]*fp;
      chi[ss+v] = chi[ss+v]     +l[v]*fm;
    }
 #else
      for(int v=0;v<LLs;v++){
 	vprefetch(psi[ss+v+LLs]);
 	//	vprefetch(phi[ss+v+LLs]);
 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
 	Simd hp_00 = psi[ss+vp]()(2)(0); 
 	Simd hp_01 = psi[ss+vp]()(2)(1); 
 	Simd hp_02 = psi[ss+vp]()(2)(2); 
 	Simd hp_10 = psi[ss+vp]()(3)(0); 
 	Simd hp_11 = psi[ss+vp]()(3)(1); 
 	Simd hp_12 = psi[ss+vp]()(3)(2); 
 	Simd hm_00 = psi[ss+vm]()(0)(0); 
 	Simd hm_01 = psi[ss+vm]()(0)(1); 
 	Simd hm_02 = psi[ss+vm]()(0)(2); 
 	Simd hm_10 = psi[ss+vm]()(1)(0); 
 	Simd hm_11 = psi[ss+vm]()(1)(1); 
 	Simd hm_12 = psi[ss+vm]()(1)(2); 
 	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
 	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
 	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
 	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
 	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
 	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
  }
 	if ( vm>=v ) {
 	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
 	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
 	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
 	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
 	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}
 	/*
 	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
 	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
 	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
 	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
 	*/	
 	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
 	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
 	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
 	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
 	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
 	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
 	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
 	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
 	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
 	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
 	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
 	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
 	//	if ( ss==0){
 	/*
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
 	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
 	}
 	*/
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
 	vstream(chi[ss+v]()(1)(0),p_10);
 	vstream(chi[ss+v]()(1)(1),p_11);
 	vstream(chi[ss+v]()(1)(2),p_12);
 	vstream(chi[ss+v]()(2)(0),p_20);
 	vstream(chi[ss+v]()(2)(1),p_21);
 	vstream(chi[ss+v]()(2)(2),p_22);
 	vstream(chi[ss+v]()(3)(0),p_30);
 	vstream(chi[ss+v]()(3)(1),p_31);
 	vstream(chi[ss+v]()(3)(2),p_32);
      }
 #endif
  }
  M5Dtime+=usecond();
 }
 template<class Impl>  
@ -257,8 +154,6 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
    d_p[ss] = diag[s];
  }}
  M5Dcalls++;
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
@ -288,8 +183,8 @@ PARALLEL_FOR_LOOP
    }
  }
  M5Dtime+=usecond();
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@ -355,11 +250,13 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
    }
  }
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
  // Dynamic allocate on stack to get per thread without serialised heap acces
-#pragma omp parallel  
+PARALLEL_FOR_LOOP
-  {
+  for(auto site=0;site<vol;site++){
    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
@ -370,9 +267,6 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
 #pragma omp for 
  for(auto site=0;site<vol;site++){
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
@ -400,8 +294,6 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
      chi[lex] = SiteChi[s]*0.5;
    }
  }
  }
  MooeeInvTime+=usecond();
 }
 INSTANTIATE_DPERP(DomainWallVec5dImplD);
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@ -194,11 +194,6 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
    RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
   }
  if ( DerivCalls > 0 ) {
@ -214,15 +209,12 @@ void WilsonFermion5D<Impl>::Report(void)
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
-
+  }
    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
  }
 }
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@ -66,9 +66,6 @@ namespace Optimization {
    double f[4];
  };
  struct Vsplat{
    //Complex float
    inline __m256 operator()(float a, float b){
@ -170,7 +167,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@ -198,7 +195,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@ -236,7 +233,7 @@ namespace Optimization {
      a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)  || defined (AVXFMA)
+#if defined (AVX2)
      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -282,7 +279,7 @@ namespace Optimization {
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2) || defined (AVXFMA)
+#if defined (AVX2)
      __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -323,7 +320,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
      a= _mm256_macc_ps(b,c,a);
 #endif
-#if defined (AVX2) || defined (AVXFMA)
+#if defined (AVX2)
      a= _mm256_fmadd_ps( b, c, a);
 #endif
    }
@ -335,7 +332,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
      a= _mm256_macc_pd(b,c,a);
 #endif
-#if defined (AVX2) || defined (AVXFMA)
+#if defined (AVX2)
      a= _mm256_fmadd_pd( b, c, a);
 #endif
    }
@ -350,7 +347,7 @@ namespace Optimization {
    }
    // Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA)
+#if defined (AVX1) 
      __m128i a0,a1;
      __m128i b0,b1;
      a0 = _mm256_extractf128_si256(a,0);
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@ -27,6 +27,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 //----------------------------------------------------------------------
 /*! @file Grid_knc.h
  @brief Optimization libraries for AVX512 instructions set for KNC
  Using intrinsics
 */
 // Time-stamp: <2015-06-09 14:27:28 neo>
 //----------------------------------------------------------------------
 #include <immintrin.h>
@ -86,13 +95,13 @@ namespace Optimization {
  struct Vstream{
    //Float
    inline void operator()(float * a, __m512 b){
-      _mm512_stream_ps(a,b);
+      //_mm512_stream_ps(a,b);
-      //      _mm512_store_ps(a,b);
+      _mm512_store_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m512d b){
-      _mm512_stream_pd(a,b);
+      //_mm512_stream_pd(a,b);
-      //      _mm512_store_pd(a,b);
+      _mm512_store_pd(a,b);
    }
  };
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@ -6,7 +6,8 @@
    Copyright (C) 2015
-Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -26,352 +27,133 @@ Author: Antonin Portelli <antonin.portelli@me.com>
    *************************************************************************************/
    /*  END LEGAL */
 static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
 //#define VECTOR_LOOPS
 // playing with compiler pragmas
 #ifdef VECTOR_LOOPS
 #ifdef __clang__
 #define VECTOR_FOR(i, w, inc)\
 _Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
 for (unsigned int i = 0; i < w; i += inc)
 #elif defined __INTEL_COMPILER
 #define VECTOR_FOR(i, w, inc)\
 _Pragma("simd vectorlength(w*8)")\
 for (unsigned int i = 0; i < w; i += inc)
 #else
 #define VECTOR_FOR(i, w, inc)\
 for (unsigned int i = 0; i < w; i += inc)
 #endif
 #else
 #define VECTOR_FOR(i, w, inc)\
 for (unsigned int i = 0; i < w; i += inc)
 #endif
 namespace Grid {
 namespace Optimization {
-  // type traits giving the number of elements for each vector type
+  template<class vtype>
-  template <typename T> struct W;
+  union uconv {
-  template <> struct W<double> {
+    float f;
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
+    vtype v;
    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
  };
  template <> struct W<float> {
    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
  };
-  // SIMD vector types
+  union u128f {
-  template <typename T>
+    float v;
-  struct vec {
+    float f[4];
-    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  };
  union u128d {
    double v;
    double f[2];
  };
  typedef vec<float>   vecf;
  typedef vec<double>  vecd;
  struct Vsplat{
-    // Complex
+    //Complex float
-    template <typename T>
+    inline u128f operator()(float a, float b){
-    inline vec<T> operator()(T a, T b){
+      u128f out; 
-      vec<T> out;
+      out.f[0] = a;
-      
+      out.f[1] = b;
-      VECTOR_FOR(i, W<T>::r, 2)
+      out.f[2] = a;
-      {
+      out.f[3] = b;
        out.v[i]   = a;
        out.v[i+1] = b;
      }
      return out;
    }
-    
+    // Real float
-    // Real
+    inline u128f operator()(float a){
-    template <typename T>
+      u128f out; 
-    inline vec<T> operator()(T a){
+      out.f[0] = a;
-      vec<T> out;
+      out.f[1] = a;
-      
+      out.f[2] = a;
-      VECTOR_FOR(i, W<T>::r, 1)
+      out.f[3] = a;
      {
        out.v[i] = a;
      }
      return out;
    }
-    
+    //Complex double
-    // Integer
+    inline u128d operator()(double a, double b){
      u128d out; 
      out.f[0] = a;
      out.f[1] = b;
      return out;
    }
    //Real double
    inline u128d operator()(double a){
      u128d out; 
      out.f[0] = a;
      out.f[1] = a;
      return out;
    }
    //Integer
    inline int operator()(Integer a){
      return a;
    }
  };
  struct Vstore{
-    // Real
+    //Float 
-    template <typename T>
+    inline void operator()(u128f a, float* F){
-    inline void operator()(vec<T> a, T *D){
+      memcpy(F,a.f,4*sizeof(float));
-      *((vec<T> *)D) = a;
+    }
    //Double
    inline void operator()(u128d a, double* D){
      memcpy(D,a.f,2*sizeof(double));
    }
    //Integer
-    inline void operator()(int a, Integer *I){
+    inline void operator()(int a, Integer* I){
-      *I = a;
+      I[0] = a;
    }
  };
  struct Vstream{
-    // Real
+    //Float
-    template <typename T>
+    inline void operator()(float * a, u128f b){
-    inline void operator()(T * a, vec<T> b){
+      memcpy(a,b.f,4*sizeof(float));
      *((vec<T> *)a) = b;
    }
    //Double
    inline void operator()(double * a, u128d b){
      memcpy(a,b.f,2*sizeof(double));
    }
  };
  struct Vset{
-    // Complex
+    // Complex float 
-    template <typename T>
+    inline u128f operator()(Grid::ComplexF *a){
-    inline vec<T> operator()(std::complex<T> *a){
+      u128f out; 
-      vec<T> out;
+      out.f[0] = a[0].real();
-      
+      out.f[1] = a[0].imag();
-      VECTOR_FOR(i, W<T>::c, 1)
+      out.f[2] = a[1].real();
-      {
+      out.f[3] = a[1].imag();
        out.v[2*i]   = a[i].real();
        out.v[2*i+1] = a[i].imag();
      }
      return out;
    }
-    
+    // Complex double 
-    // Real
+    inline u128d operator()(Grid::ComplexD *a){
-    template <typename T>
+      u128d out; 
-    inline vec<T> operator()(T *a){
+      out.f[0] = a[0].real();
-      vec<T> out;
+      out.f[1] = a[0].imag();
-      
+      return out;
-      out = *((vec<T> *)a);
+    }
-      
+    // Real float 
    inline u128f operator()(float *a){
      u128f out; 
      out.f[0] = a[0];
      out.f[1] = a[1];
      out.f[2] = a[2];
      out.f[3] = a[3];
      return out;
    }
    // Real double
    inline u128d operator()(double *a){
      u128d out; 
      out.f[0] = a[0];
      out.f[1] = a[1];
      return out;
    }
    // Integer
    inline int operator()(Integer *a){
-      return *a;
+      return a[0];
    }
  };
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  struct Sum{
    // Complex/Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i] + b.v[i];
      }
      return out;
    }
    //I nteger
    inline int operator()(int a, int b){
      return a + b;
    }
  };
  struct Sub{
    // Complex/Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i] - b.v[i];
      }
      return out;
    }
    //Integer
    inline int operator()(int a, int b){
      return a-b;
    }
  };
  struct Mult{
    // Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i]*b.v[i];
      }
      return out;
    }
    // Integer
    inline int operator()(int a, int b){
      return a*b;
    }
  };
  #define cmul(a, b, c, i)\
  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
  struct MultComplex{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::c, 1)
      {
        cmul(a.v, b.v, out.v, 2*i);
      }      
      return out;
    }
  };
  #undef cmul
  struct Div{
    // Real
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::r, 1)
      {
        out.v[i] = a.v[i]/b.v[i];
      }
      return out;
    }
  };
  #define conj(a, b, i)\
  b[i]   = a[i];\
  b[i+1] = -a[i+1];
  struct Conj{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a){
      vec<T> out;
      VECTOR_FOR(i, W<T>::c, 1)
      {
        conj(a.v, out.v, 2*i);
      }
      return out;
    }
  };
  #undef conj
  #define timesmi(a, b, i)\
  b[i]   = a[i+1];\
  b[i+1] = -a[i];
  struct TimesMinusI{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::c, 1)
      {
        timesmi(a.v, out.v, 2*i);
      }
      return out;
    }
  };
  #undef timesmi
  #define timesi(a, b, i)\
  b[i]   = -a[i+1];\
  b[i+1] = a[i];
  struct TimesI{
    // Complex
    template <typename T>
    inline vec<T> operator()(vec<T> a, vec<T> b){
      vec<T> out;
      VECTOR_FOR(i, W<T>::c, 1)
      {
        timesi(a.v, out.v, 2*i);
      }
      return out;
    }
  };
  #undef timesi
  //////////////////////////////////////////////
  // Some Template specialization
  #define perm(a, b, n, w)\
  unsigned int _mask = w >> (n + 1);\
  VECTOR_FOR(i, w, 1)\
  {\
    b[i] = a[i^_mask];\
  }
  #define DECL_PERMUTE_N(n)\
  template <typename T>\
  static inline vec<T> Permute##n(vec<T> in) {\
    vec<T> out;\
    perm(in.v, out.v, n, W<T>::r);\
    return out;\
  }
  struct Permute{
    DECL_PERMUTE_N(0);
    DECL_PERMUTE_N(1);
    DECL_PERMUTE_N(2);
    DECL_PERMUTE_N(3);
  };
  #undef perm
  #undef DECL_PERMUTE_N
  #define rot(a, b, n, w)\
  VECTOR_FOR(i, w, 1)\
  {\
    b[i] = a[(i + n)%w];\
  }
  struct Rotate{
    template <typename T>
    static inline vec<T> rotate(vec<T> in, int n){
      vec<T> out;
      rot(in.v, out.v, n, W<T>::r);
      return out;
    }
  };
  #undef rot
  #define acc(v, a, off, step, n)\
  for (unsigned int i = off; i < n; i += step)\
  {\
    a += v[i];\
  }
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
@ -383,66 +165,315 @@ namespace Optimization {
    }
  };
-  //Complex float Reduce
+  /////////////////////////////////////////////////////
-  template <>
+  // Arithmetic operations
-  inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
+  /////////////////////////////////////////////////////
-    float a = 0.f, b = 0.f;
+  struct Sum{
-    
+    //Complex/Real float
-    acc(in.v, a, 0, 2, W<float>::r);
+    inline u128f operator()(u128f a, u128f b){
-    acc(in.v, b, 1, 2, W<float>::r);
+      u128f out;
-    
+      out.f[0] = a.f[0] + b.f[0];
-    return Grid::ComplexF(a, b);
+      out.f[1] = a.f[1] + b.f[1];
      out.f[2] = a.f[2] + b.f[2];
      out.f[3] = a.f[3] + b.f[3];
      return out;
    }
    //Complex/Real double
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0] + b.f[0];
      out.f[1] = a.f[1] + b.f[1];
      return out;
    }
    //Integer
    inline int operator()(int a, int b){
      return a + b;
    }
  };
  struct Sub{
    //Complex/Real float
    inline u128f operator()(u128f a, u128f b){
      u128f out;
      out.f[0] = a.f[0] - b.f[0];
      out.f[1] = a.f[1] - b.f[1];
      out.f[2] = a.f[2] - b.f[2];
      out.f[3] = a.f[3] - b.f[3];
      return out;
    }
    //Complex/Real double
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0] - b.f[0];
      out.f[1] = a.f[1] - b.f[1];
      return out;
    }
    //Integer
    inline int operator()(int a, int b){
      return a-b;
    }
  };
  struct MultComplex{
    // Complex float
    inline u128f operator()(u128f a, u128f b){
      u128f out;
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
      return out;
    }
    // Complex double
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
      return out;
    }
  };
  struct Mult{
    //CK: Appear unneeded
    // inline float  mac(float a, float b,double c){
    //   return 0;
    // }
    // inline double mac(double a, double b,double c){
    //   return 0;
    // }
    // Real float
    inline u128f operator()(u128f a, u128f b){
      u128f out;
      out.f[0] = a.f[0]*b.f[0];
      out.f[1] = a.f[1]*b.f[1];
      out.f[2] = a.f[2]*b.f[2];
      out.f[3] = a.f[3]*b.f[3];
      return out;
    }
    // Real double
    inline u128d operator()(u128d a, u128d b){
      u128d out;
      out.f[0] = a.f[0]*b.f[0];
      out.f[1] = a.f[1]*b.f[1];
      return out;
    }
    // Integer
    inline int operator()(int a, int b){
      return a*b;
    }
  };
  struct Conj{
    // Complex single
    inline u128f operator()(u128f in){
      u128f out;
      out.f[0] = in.f[0];
      out.f[1] = -in.f[1];
      out.f[2] = in.f[2];
      out.f[3] = -in.f[3];
      return out;
    }
    // Complex double
    inline u128d operator()(u128d in){
      u128d out;
      out.f[0] = in.f[0];
      out.f[1] = -in.f[1];
      return out;
    }
    // do not define for integer input
  };
  struct TimesMinusI{
    //Complex single
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
      u128f out;
      out.f[0] = in.f[1];
      out.f[1] = -in.f[0];
      out.f[2] = in.f[3];
      out.f[3] = -in.f[2];
      return out;
    }
    //Complex double
    inline u128d operator()(u128d in, u128d ret){
      u128d out;
      out.f[0] = in.f[1];
      out.f[1] = -in.f[0];
      return out;
    }
  };
  struct TimesI{
    //Complex single
    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
      u128f out;
      out.f[0] = -in.f[1];
      out.f[1] = in.f[0];
      out.f[2] = -in.f[3];
      out.f[3] = in.f[2];
      return out;
    }
    //Complex double
    inline u128d operator()(u128d in, u128d ret){
      u128d out;
      out.f[0] = -in.f[1];
      out.f[1] = in.f[0];
      return out;
    }
  };
  //////////////////////////////////////////////
  // Some Template specialization
  struct Permute{
    //We just have to mirror the permutes of Grid_sse4.h
    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
      u128f out;
      out.f[0] = in.f[2];
      out.f[1] = in.f[3];
      out.f[2] = in.f[0];
      out.f[3] = in.f[1];
      return out;
    };
    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
      u128f out;
      out.f[0] = in.f[1];
      out.f[1] = in.f[0];
      out.f[2] = in.f[3];
      out.f[3] = in.f[2];
      return out;
    };
    static inline u128f Permute2(u128f in){
      return in;
    };
    static inline u128f Permute3(u128f in){
      return in;
    };
    static inline u128d Permute0(u128d in){ //AB -> BA
      u128d out;
      out.f[0] = in.f[1];
      out.f[1] = in.f[0];
      return out;      
    };
    static inline u128d Permute1(u128d in){
      return in;
    };
    static inline u128d Permute2(u128d in){
      return in;
    };
    static inline u128d Permute3(u128d in){
      return in;
    };
  };
  template < typename vtype > 
    void permute(vtype &a, vtype b, int perm) {
   };
  struct Rotate{
    static inline u128f rotate(u128f in,int n){
      u128f out;
      switch(n){
      case 0:
        out.f[0] = in.f[0];
        out.f[1] = in.f[1];
        out.f[2] = in.f[2];
        out.f[3] = in.f[3];
        break;
      case 1:
        out.f[0] = in.f[1];
        out.f[1] = in.f[2];
        out.f[2] = in.f[3];
        out.f[3] = in.f[0];
        break;
      case 2:
        out.f[0] = in.f[2];
        out.f[1] = in.f[3];
        out.f[2] = in.f[0];
        out.f[3] = in.f[1];
        break;
      case 3:
        out.f[0] = in.f[3];
        out.f[1] = in.f[0];
        out.f[2] = in.f[1];
        out.f[3] = in.f[2];
        break;
      default: assert(0);
      }
      return out;
    }
    static inline u128d rotate(u128d in,int n){
      u128d out;
      switch(n){
      case 0:
        out.f[0] = in.f[0];
        out.f[1] = in.f[1];
        break;
      case 1:
        out.f[0] = in.f[1];
        out.f[1] = in.f[0];
        break;
      default: assert(0);
      }
      return out;
    }
  };
  //Complex float Reduce
  template<>
  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
  }
  //Real float Reduce
  template<>
-  inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
+  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
-    float a = 0.;
+    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
    acc(in.v, a, 0, 1, W<float>::r);
    return a;
  }
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
+  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
-    double a = 0., b = 0.;
+    return Grid::ComplexD(in.f[0],in.f[1]);
    acc(in.v, a, 0, 2, W<double>::r);
    acc(in.v, b, 1, 2, W<double>::r);
    return Grid::ComplexD(a, b);
  }
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
+  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
-    double a = 0.f;
+    return in.f[0] + in.f[1];
    acc(in.v, a, 0, 1, W<double>::r);
    return a;
  }
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, int>::operator()(int in){
-    return in;
+    // FIXME unimplemented
   printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
 }
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
-  typedef Optimization::vecf SIMD_Ftype; // Single precision type
+  typedef Optimization::u128f SIMD_Ftype;  // Single precision type
-  typedef Optimization::vecd SIMD_Dtype; // Double precision type
+  typedef Optimization::u128d SIMD_Dtype; // Double precision type
  typedef int SIMD_Itype; // Integer type
  // prefetch utilities
  inline void v_prefetch0(int size, const char *ptr){};
  inline void prefetch_HINT_T0(const char *ptr){};
  // Gpermute function
  template < typename VectorSIMD > 
    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
    Optimization::permute(y.v,b.v,perm);
  }
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
@ -450,13 +481,16 @@ namespace Optimization {
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
 }
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@ -26,6 +26,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 //----------------------------------------------------------------------
 /*! @file Grid_knc.h
  @brief Optimization libraries for AVX512 instructions set for KNC
  Using intrinsics
 */
 // Time-stamp: <2015-06-09 14:27:28 neo>
 //----------------------------------------------------------------------
 #include <immintrin.h>
 #include <zmmintrin.h>
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@ -245,21 +245,6 @@ namespace Optimization {
    }
  };
  struct Div{
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
      return vec_swdiv(a, b);
    }
    // Real float
    FLOAT_WRAP_2(operator(), inline)
    // Integer
    inline int operator()(int a, int b){
      return a/b;
    }
  };
  struct Conj{
    // Complex double
    inline vector4double operator()(vector4double v){
@ -428,7 +413,6 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
 typedef Optimization::Div         DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@ -38,13 +38,13 @@ directory
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
-#ifdef GEN
+#ifdef GENERIC_VEC
 #include "Grid_generic.h"
 #endif
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
+#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
@ -60,13 +60,6 @@ directory
 #include "Grid_neon.h"
 #endif
 #ifdef GRID_DEFAULT_PRECISION_SINGLE
 #define GRID_REAL_DIGITS FLT_DIG
 #endif
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define GRID_REAL_DIGITS DBL_DIG
 #endif
 namespace Grid {
 //////////////////////////////////////
@ -137,7 +130,7 @@ class Grid_simd {
  Vector_type v;
-  static inline constexpr int Nsimd(void) {
+  static inline int Nsimd(void) {
    return sizeof(Vector_type) / sizeof(Scalar_type);
  }
@ -153,23 +146,6 @@ class Grid_simd {
  Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
  Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
  /////////////////////////////
  // Comparisons
  /////////////////////////////
  inline bool operator==(const Grid_simd& lhs){ 
    Grid_simd::conv_t conv1;
    Grid_simd::conv_t conv2;
    conv1.v = v;
    conv2.v = lhs.v;
    return std::equal(std::begin(conv1.s), std::end(conv1.s), std::begin(conv2.s));
  }
  inline bool operator!=(const Grid_simd& lhs){ 
    return !(*this == lhs);
  }
  /////////////////////////////
  // Constructors
  /////////////////////////////
--- a/m4/ac_prog_doxygen.m4
+++ b/m4/ac_prog_doxygen.m4
@ -0,0 +1,54 @@
 dnl Check for doxygen to create API docs
 dnl
 AC_DEFUN([AC_PROG_DOXYGEN],
 [
 AC_ARG_ENABLE(doxygen,
 AS_HELP_STRING([--enable-doxygen],[enable documentation generation with doxygen (auto)]))
 AC_ARG_ENABLE(dot,
 AS_HELP_STRING([--enable-dot],[use 'dot' to generate graphs in doxygen (auto)]))              
 AC_ARG_ENABLE(html-docs,
 AS_HELP_STRING([--enable-html-docs],[enable HTML generation with doxygen (yes)]),
 [],[ enable_html_docs=yes])              
 AC_ARG_ENABLE(latex-docs,
 AS_HELP_STRING([--enable-latex-docs],
 [enable LaTeX documentation generation with doxygen (no)]),[],[enable_latex_docs=no])              
 if test "x$enable_doxygen" = xno; then
 enable_doc=no	
 else 
 AC_CHECK_PROG(DOXYGEN, doxygen, doxygen)
 if test x$DOXYGEN = x; then
  if test "x$enable_doxygen" = xyes; then
   AC_MSG_ERROR([could not find doxygen])
  fi
  enable_doc=no
 else
  doxy_ver=`doxygen --version`
  doxy_major=`expr "$doxy_ver" : '\(@<:@0-9@:>@\)\..*'`
  doxy_minor=`expr "$doxy_ver" : '@<:@0-9@:>@\.\(@<:@0-9@:>@\).*'`
  if test $doxy_major -eq "1" -a $doxy_minor -ge "3" ; then
   enable_doc=yes
   AC_CHECK_PROG(DOT, dot, dot)
  else
   AC_MSG_WARN([doxygen version $doxy_ver too old, doxygen will not be used.])
   enable_doc=no
  fi
 fi
 fi
 AM_CONDITIONAL(DOXYGEN_DOC, test x$enable_doc = xyes)
 if test x$DOT = x; then
 if test "x$enable_dot" = xyes; then
  AC_MSG_ERROR([could not find dot])
 fi
 enable_dot=no
 else
 enable_dot=yes
 fi
 AC_SUBST(enable_dot)
 AC_SUBST(enable_html_docs)
 AC_SUBST(enable_latex_docs)
 ])
--- a/m4/m4-ax_prog_doxygen.m4
+++ b/m4/m4-ax_prog_doxygen.m4
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@ -50,12 +50,6 @@ public:
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
  std::string name(void) const { return std::string("Times"); }
 };
 class funcDivide {
 public:
  funcDivide() {};
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;}
  std::string name(void) const { return std::string("Divide"); }
 };
 class funcConj {
 public:
  funcConj() {};
@ -347,7 +341,6 @@ int main (int argc, char ** argv)
  Tester<RealF,vRealF>(funcPlus());
  Tester<RealF,vRealF>(funcMinus());
  Tester<RealF,vRealF>(funcTimes());
  Tester<RealF,vRealF>(funcDivide());
  Tester<RealF,vRealF>(funcAdj());
  Tester<RealF,vRealF>(funcConj());
  Tester<RealF,vRealF>(funcInnerProduct());
@ -378,7 +371,6 @@ int main (int argc, char ** argv)
  Tester<RealD,vRealD>(funcPlus());
  Tester<RealD,vRealD>(funcMinus());
  Tester<RealD,vRealD>(funcTimes());
  Tester<RealD,vRealD>(funcDivide());
  Tester<RealD,vRealD>(funcAdj());
  Tester<RealD,vRealD>(funcConj());
  Tester<RealD,vRealD>(funcInnerProduct());
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@ -68,7 +68,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<4;mu++){
    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
    LatticeCoordinate(coor,mu);
-    C = C + (TwoPiL * p[mu]) * coor;
+    C = C - (TwoPiL * p[mu]) * coor;
  }
  C = exp(C*ci);
@ -78,11 +78,10 @@ int main (int argc, char ** argv)
  FFT theFFT(&Fine);
-  Ctilde = C;
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
  theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
  //  C=zero;
  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@ -94,11 +93,10 @@ int main (int argc, char ** argv)
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;
-  Stilde = S;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
  SpinMatrixF Sp; 
  Sp = zero; Sp = Sp+cVol;
--- a/tests/debug/Test_cayley_cg_reproducibility.cc
+++ b/tests/debug/Test_cayley_cg_reproducibility.cc
@ -1,215 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_cayley_cg_reproducibility.cc
    Copyright (C) 2015
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 #define REPRODUCIBILITY_INTERVAL 1
 template<class d>
 struct scal {
  d internal;
 };
  Gamma::GammaMatrix Gmu [] = {
    Gamma::GammaX,
    Gamma::GammaY,
    Gamma::GammaZ,
    Gamma::GammaT
  };
 template<class What> 
 void  TestCGinversions(What & Ddwf, 
                       GridCartesian         * FGrid,          GridRedBlackCartesian * FrbGrid,
                       GridCartesian         * UGrid,          GridRedBlackCartesian * UrbGrid,
                       RealD mass, RealD M5,
                       GridParallelRNG *RNG4,
                       GridParallelRNG *RNG5);
 template<class What> 
 void  TestCGschur(What & Ddwf, 
                  GridCartesian         * FGrid,               GridRedBlackCartesian * FrbGrid,
                  GridCartesian         * UGrid,               GridRedBlackCartesian * UrbGrid,
                  RealD mass, RealD M5,
                  GridParallelRNG *RNG4,
                  GridParallelRNG *RNG5);
 template<class What> 
 void  TestCGunprec(What & Ddwf, 
                   GridCartesian         * FGrid,              GridRedBlackCartesian * FrbGrid,
                   GridCartesian         * UGrid,              GridRedBlackCartesian * UrbGrid,
                   RealD mass, RealD M5,
                   GridParallelRNG *RNG4,
                   GridParallelRNG *RNG5);
 template<class What> 
 void  TestCGprec(What & Ddwf, 
                 GridCartesian         * FGrid,        GridRedBlackCartesian * FrbGrid,
                 GridCartesian         * UGrid,        GridRedBlackCartesian * UrbGrid,
                 RealD mass, RealD M5,
                 GridParallelRNG *RNG4,
                 GridParallelRNG *RNG5);
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout << GridLogMessage << "###########################################" << std::endl;
  std::cout << GridLogMessage << "This test disables the convergence alert" << std::endl;
  std::cout << GridLogMessage << "and checks whether in consecutive runs of the CG" << std::endl;
  std::cout << GridLogMessage << "the internal reductions are bit reproducible." << std::endl;
  std::cout << GridLogMessage << "A warning and a report is produced for every failure" << std::endl;
  std::cout << GridLogMessage << "###########################################" << std::endl;
  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  LatticeGaugeField Umu(UGrid);
  SU3::HotConfiguration(RNG4,Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);
  RealD mass=0.1;
  RealD M5  =1.8;
  std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  TestCGinversions<DomainWallFermionR>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
  RealD b=1.5;// Scale factor b+c=2, b-c=1
  RealD c=0.5;
  std::cout<<GridLogMessage <<"MobiusFermion test"<<std::endl;
  MobiusFermionR Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
  TestCGinversions<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
  std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
  MobiusZolotarevFermionR Dzolo(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,0.1,2.0);
  TestCGinversions<MobiusZolotarevFermionR>(Dzolo,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
  std::cout<<GridLogMessage <<"ScaledShamirFermion test"<<std::endl;
  ScaledShamirFermionR Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0);
  TestCGinversions<ScaledShamirFermionR>(Dsham,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
  std::cout<<GridLogMessage <<"ShamirZolotarevFermion test"<<std::endl;
  ShamirZolotarevFermionR Dshamz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
  TestCGinversions<ShamirZolotarevFermionR>(Dshamz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
  std::cout<<GridLogMessage <<"OverlapWilsonCayleyTanhFermion test"<<std::endl;
  OverlapWilsonCayleyTanhFermionR Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
  TestCGinversions<OverlapWilsonCayleyTanhFermionR>(Dov,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
  std::cout<<GridLogMessage <<"OverlapWilsonCayleyZolotarevFermion test"<<std::endl;
  OverlapWilsonCayleyZolotarevFermionR Dovz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
  TestCGinversions<OverlapWilsonCayleyZolotarevFermionR>(Dovz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
  Grid_finalize();
 }
 template<class What> 
 void  TestCGinversions(What & Ddwf, 
                       GridCartesian         * FGrid,          GridRedBlackCartesian * FrbGrid,
                       GridCartesian         * UGrid,          GridRedBlackCartesian * UrbGrid,
                       RealD mass, RealD M5,
                       GridParallelRNG *RNG4,
                       GridParallelRNG *RNG5)
 {
  std::cout<<GridLogMessage << "Testing unpreconditioned inverter"<<std::endl;
  TestCGunprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
  std::cout<<GridLogMessage << "Testing red black preconditioned inverter"<<std::endl;
  TestCGprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
  std::cout<<GridLogMessage << "Testing red black Schur inverter"<<std::endl;
  TestCGschur<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
 }
 template<class What> 
 void  TestCGunprec(What & Ddwf, 
                   GridCartesian         * FGrid,              GridRedBlackCartesian * FrbGrid,
                   GridCartesian         * UGrid,              GridRedBlackCartesian * UrbGrid,
                   RealD mass, RealD M5,
                   GridParallelRNG *RNG4,
                   GridParallelRNG *RNG5)
 {
  LatticeFermion src   (FGrid); random(*RNG5,src);
  LatticeFermion result(FGrid); result=zero;
  MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
  CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
  CG(HermOp,src,result);
 }
 template<class What> 
 void  TestCGprec(What & Ddwf, 
                 GridCartesian         * FGrid,        GridRedBlackCartesian * FrbGrid,
                 GridCartesian         * UGrid,        GridRedBlackCartesian * UrbGrid,
                 RealD mass, RealD M5,
                 GridParallelRNG *RNG4,
                 GridParallelRNG *RNG5)
 {
  LatticeFermion src   (FGrid); random(*RNG5,src);
  LatticeFermion    src_o(FrbGrid);
  LatticeFermion result_o(FrbGrid);
  pickCheckerboard(Odd,src_o,src);
  result_o=zero;
  SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
  CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
  CG(HermOpEO,src_o,result_o);
 }
 template<class What> 
 void  TestCGschur(What & Ddwf, 
                   GridCartesian         * FGrid,              GridRedBlackCartesian * FrbGrid,
                   GridCartesian         * UGrid,              GridRedBlackCartesian * UrbGrid,
                   RealD mass, RealD M5,
                   GridParallelRNG *RNG4,
                   GridParallelRNG *RNG5)
 {
  LatticeFermion src   (FGrid); random(*RNG5,src);
  LatticeFermion result(FGrid); result=zero;
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
  CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
  SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
  SchurSolver(Ddwf,src,result);
 }
--- a/tests/hmc/Test_hmc_WilsonFermionGauge.cc
+++ b/tests/hmc/Test_hmc_WilsonFermionGauge.cc
@ -63,9 +63,6 @@ class HmcRunner : public NerscHmcRunner {
    Real mass = -0.77;
    FermionAction FermOp(U, *FGrid, *FrbGrid, mass);
    // To enable the CG reproducibility tests use 
    //ConjugateGradient<FermionField> CG(1.0e-8, 10000, ReproducibilityTest);
    // This is the plain version
    ConjugateGradient<FermionField> CG(1.0e-8, 10000);
    TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);
--- a/tests/solver/Test_wilson_cg_unprec.cc
+++ b/tests/solver/Test_wilson_cg_unprec.cc
@ -71,7 +71,7 @@ int main (int argc, char ** argv)
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
  MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000,true, true);
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
  CG(HermOp,src,result);
  Grid_finalize();
Author	SHA1	Message	Date
paboyle	c363bdd784	Merge branch 'release/v0.6.0'	2016-11-09 12:43:14 +00:00
paboyle	446c768cd3	Merge branch 'hotfix/v0.5.1' Double precision compile fix	2016-07-01 16:33:59 +01:00