Merge branch 'release/v0.6.0'

Merge branch 'hotfix/v0.5.1'
Double precision compile fix
2025-06-14 05:07:05 +01:00 · 2016-11-09 12:43:14 +00:00 · 2016-07-01 16:33:59 +01:00
47 changed files with 3138 additions and 2057 deletions
--- a/.gitignore
+++ b/.gitignore
@ -47,9 +47,7 @@ Config.h.in
 config.log
 config.status
 .deps
-Make.inc
-eigen.inc
-Eigen.inc
+*.inc

 # http://www.gnu.org/software/autoconf #
 ########################################
--- a/Makefile.am
+++ b/Makefile.am
@ -1,12 +1,10 @@
 # additional include paths necessary to compile the C++ library
 SUBDIRS = lib benchmarks tests

-include $(top_srcdir)/doxygen.inc
+.PHONY: tests

 tests: all
 	$(MAKE) -C tests tests

-.PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
-
 AM_CXXFLAGS += -I$(top_builddir)/include
 ACLOCAL_AMFLAGS = -I m4
--- a/README.md
+++ b/README.md
@ -116,15 +116,13 @@ If you want to build all the tests at once just use `make tests`.
 - `--with-fftw=<path>`: look for FFTW in the UNIX prefix `<path>`
 - `--enable-lapack[=<path>]`: enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional).
 - `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
- `--enable-numa`: enable NUMA first touch optimisation
+- `--enable-numa`: ???
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
 - `--enable-precision={single|double}`: set the default precision (default: `double`).
 - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
 - `--disable-timers`: disable system dependent high-resolution timers.
 - `--enable-chroma`: enable Chroma regression tests.
- `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)

 ### Possible communication interfaces

@ -138,7 +136,7 @@ The following options can be use with the `--enable-comms=` option to target dif
 | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem `       | Cray SHMEM communications                                     |

-For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names).

 ### Possible SIMD types

@ -167,7 +165,6 @@ Alternatively, some CPU codenames can be directly used:
 - We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
- The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.

 ### Build setup for Intel Knights Landing platform

--- a/benchmarks/Benchmark_mooee.cc
+++ b/benchmarks/Benchmark_mooee.cc
@ -1,183 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./benchmarks/Benchmark_dwf.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
-  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::cout << GridLogMessage << "Making Vec5d innermost grids"<<std::endl;
-  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
-  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-  std::cout << GridLogMessage << "Seeded"<<std::endl;
-
-  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
-
-  std::cout << GridLogMessage << "made random gauge fields"<<std::endl;
-
-  RealD mass=0.1;
-  RealD M5  =1.8;
-  RealD NP = UGrid->_Nprocessors;
-
-
-  if (1)
-  {
-    const int ncall=1000;
-
-    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
-    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-
-    GridParallelRNG RNG5(FGrid);
-    LatticeFermion src(FGrid); random(RNG5,src);
-    LatticeFermion result(FGrid);
-
-    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-    double t0,t1;
-
-    LatticeFermion r_eo(FGrid);
-    LatticeFermion src_e (FrbGrid);
-    LatticeFermion src_o (FrbGrid);
-    LatticeFermion r_e   (FrbGrid);
-    LatticeFermion r_o   (FrbGrid);
-    
-    pickCheckerboard(Even,src_e,src);
-    pickCheckerboard(Odd,src_o,src);
-    
-    setCheckerboard(r_eo,src_o);
-    setCheckerboard(r_eo,src_e);
-    
-    r_e = zero;
-    r_o = zero;
-
-
-#define BENCH_DW(A,in,out)			\
-    Dw.CayleyZeroCounters();			\
-    Dw. A (in,out);				\
-    FGrid->Barrier();				\
-    t0=usecond();				\
-    for(int i=0;i<ncall;i++){			\
-      Dw. A (in,out);				\
-    }						\
-    t1=usecond();				\
-    FGrid->Barrier();				\
-    Dw.CayleyReport();					\
-    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
-    std::cout<<GridLogMessage << "******************"<<std::endl;
-
-#define BENCH_DW_MEO(A,in,out)			\
-    Dw.CayleyZeroCounters();			\
-    Dw. A (in,out,0);				\
-    FGrid->Barrier();				\
-    t0=usecond();				\
-    for(int i=0;i<ncall;i++){			\
-      Dw. A (in,out,0);				\
-    }						\
-    t1=usecond();				\
-    FGrid->Barrier();				\
-    Dw.CayleyReport();					\
-    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
-    std::cout<<GridLogMessage << "******************"<<std::endl;
-
-    BENCH_DW_MEO(Dhop    ,src,result);
-    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
-    BENCH_DW(Meooe   ,src_o,r_e);
-    BENCH_DW(Mooee   ,src_o,r_o);
-    BENCH_DW(MooeeInv,src_o,r_o);
-
-  }
-
-  if (1)
-  {
-    const int ncall=1000;
-
-    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
-    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-
-    GridParallelRNG RNG5(sFGrid);
-    LatticeFermion src(sFGrid); random(RNG5,src);
-    LatticeFermion sref(sFGrid);
-    LatticeFermion result(sFGrid);
-
-    std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
-    DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);
-
-    std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
-    FGrid->Barrier();
-
-    double t0,t1;
-
-    LatticeFermion r_eo(sFGrid);
-    LatticeFermion src_e (sFrbGrid);
-    LatticeFermion src_o (sFrbGrid);
-    LatticeFermion r_e   (sFrbGrid);
-    LatticeFermion r_o   (sFrbGrid);
-    
-    pickCheckerboard(Even,src_e,src);
-    pickCheckerboard(Odd,src_o,src);
-    
-    setCheckerboard(r_eo,src_o);
-    setCheckerboard(r_eo,src_e);
-    
-    r_e = zero;
-    r_o = zero;
-
-    BENCH_DW_MEO(Dhop    ,src,result);
-    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
-    BENCH_DW(Meooe   ,src_o,r_e);
-    BENCH_DW(Mooee   ,src_o,r_o);
-    BENCH_DW(MooeeInv,src_o,r_o);
-
-  }
-
-  Grid_finalize();
-}
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@ -1,11 +1 @@
 include Make.inc
-
-simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o
-
-EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a
-
-libsimple_su3_test_a_SOURCES = simple_su3_test.cc
-
-libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc
-
-libsimple_simd_test_a_SOURCES = simple_simd_test.cc
--- a/benchmarks/simple_simd_test.cc
+++ b/benchmarks/simple_simd_test.cc
@ -1,11 +0,0 @@
-#include <Grid/Grid.h>
-
-Grid::vRealD add(const Grid::vRealD &x, const Grid::vRealD &y)
-{
-	return x+y;
-}
-
-Grid::vRealD sub(const Grid::vRealD &x, const Grid::vRealD &y)
-{
-	return x-y;
-}
--- a/benchmarks/simple_su3_expr.cc
+++ b/benchmarks/simple_su3_expr.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>

 using namespace std;
 using namespace Grid;
--- a/benchmarks/simple_su3_test.cc
+++ b/benchmarks/simple_su3_test.cc
@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid/Grid.h>
+#include <Grid.h>

 using namespace std;
 using namespace Grid;
--- a/configure.ac
+++ b/configure.ac
@ -149,14 +149,8 @@ CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY

 ############### SIMD instruction selection
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=code],
-              [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])
-
-AC_ARG_ENABLE([gen-simd-width],
-            [AS_HELP_STRING([--enable-gen-simd-width=size],
-            [size (in bytes) of the generic SIMD vectors (default: 32)])],
-            [ac_gen_simd_width=$enable_gen_simd_width],
-            [ac_gen_simd_width=32])
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=<code>],
+	            [select SIMD target (cf. README.md)])], [ac_SIMD=${enable_simd}], [ac_SIMD=GEN])

 case ${ax_cv_cxx_compiler_vendor} in
  clang|gnu)
@ -186,10 +180,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-march=knl';;
      GEN)
-        AC_DEFINE([GEN],[1],[generic vector code])
-        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
-                           [generic SIMD vector width (in bytes)])
-        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
@ -206,8 +197,8 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX1],[1],[AVX intrinsics])
        SIMD_FLAGS='-mavx -xavx';;
      AVXFMA)
-        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
-        SIMD_FLAGS='-mavx -fma';;
+        AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
+        SIMD_FLAGS='-mavx -mfma';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
@ -221,10 +212,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
        SIMD_FLAGS='-xmic-avx512';;
      GEN)
-        AC_DEFINE([GEN],[1],[generic vector code])
-        AC_DEFINE_UNQUOTED([GEN_SIMD_WIDTH],[$ac_gen_simd_width],
-                           [generic SIMD vector width (in bytes)])
-        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
@ -290,7 +278,7 @@ esac
 case ${ac_COMMS} in
    *-auto)
        LX_FIND_MPI
-        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi
+        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
        AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
        AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
        AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
@ -354,17 +342,12 @@ esac
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])

 ############### Doxygen
-DX_DOXYGEN_FEATURE([OFF])
-DX_DOT_FEATURE([OFF])
-DX_HTML_FEATURE([ON])
-DX_CHM_FEATURE([OFF])
-DX_CHI_FEATURE([OFF])
-DX_MAN_FEATURE([OFF])
-DX_RTF_FEATURE([OFF])
-DX_XML_FEATURE([OFF])
-DX_PDF_FEATURE([OFF])
-DX_PS_FEATURE([OFF])
-DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
+AC_PROG_DOXYGEN
+
+if test -n "$DOXYGEN"
+then
+AC_CONFIG_FILES([docs/doxy.cfg])
+fi

 ############### Ouput
 cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
@ -399,7 +382,7 @@ os (target)                 : $target_os
 compiler vendor             : ${ax_cv_cxx_compiler_vendor}
 compiler version            : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
-SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
+SIMD                        : ${ac_SIMD}
 Threading                   : ${ac_openmp} 
 Communications type         : ${comms_type}
 Default precision           : ${ac_PRECISION}
@ -407,7 +390,8 @@ RNG choice                  : ${ac_RNG}
 GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 LAPACK                      : ${ac_LAPACK}
 FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
-build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
+build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
+graphs and diagrams         : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
 CXXFLAGS:
 `echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/    -/g'`
--- a/docs/doxy.cfg
+++ b/docs/doxy.cfg
--- a/docs/doxy.cfg.in
+++ b/docs/doxy.cfg.in
--- a/doxygen.inc
+++ b/doxygen.inc
@ -1,184 +0,0 @@
-# Copyright (C) 2004 Oren Ben-Kiki
-# This file is distributed under the same terms as the Automake macro files.
-
-# Generate automatic documentation using Doxygen. Goals and variables values
-# are controlled by the various DX_COND_??? conditionals set by autoconf.
-#
-# The provided goals are:
-# doxygen-doc: Generate all doxygen documentation.
-# doxygen-run: Run doxygen, which will generate some of the documentation
-#              (HTML, CHM, CHI, MAN, RTF, XML) but will not do the post
-#              processing required for the rest of it (PS, PDF, and some MAN).
-# doxygen-man: Rename some doxygen generated man pages.
-# doxygen-ps: Generate doxygen PostScript documentation.
-# doxygen-pdf: Generate doxygen PDF documentation.
-#
-# Note that by default these are not integrated into the automake goals. If
-# doxygen is used to generate man pages, you can achieve this integration by
-# setting man3_MANS to the list of man pages generated and then adding the
-# dependency:
-#
-#   $(man3_MANS): doxygen-doc
-#
-# This will cause make to run doxygen and generate all the documentation.
-#
-# The following variable is intended for use in Makefile.am:
-#
-# DX_CLEANFILES = everything to clean.
-#
-# This is usually added to MOSTLYCLEANFILES.
-
-## --------------------------------- ##
-## Format-independent Doxygen rules. ##
-## --------------------------------- ##
-
-if DX_COND_doc
-
-## ------------------------------- ##
-## Rules specific for HTML output. ##
-## ------------------------------- ##
-
-if DX_COND_html
-
-DX_CLEAN_HTML = @DX_DOCDIR@/html
-
-endif DX_COND_html
-
-## ------------------------------ ##
-## Rules specific for CHM output. ##
-## ------------------------------ ##
-
-if DX_COND_chm
-
-DX_CLEAN_CHM = @DX_DOCDIR@/chm
-
-if DX_COND_chi
-
-DX_CLEAN_CHI = @DX_DOCDIR@/@PACKAGE@.chi
-
-endif DX_COND_chi
-
-endif DX_COND_chm
-
-## ------------------------------ ##
-## Rules specific for MAN output. ##
-## ------------------------------ ##
-
-if DX_COND_man
-
-DX_CLEAN_MAN = @DX_DOCDIR@/man
-
-endif DX_COND_man
-
-## ------------------------------ ##
-## Rules specific for RTF output. ##
-## ------------------------------ ##
-
-if DX_COND_rtf
-
-DX_CLEAN_RTF = @DX_DOCDIR@/rtf
-
-endif DX_COND_rtf
-
-## ------------------------------ ##
-## Rules specific for XML output. ##
-## ------------------------------ ##
-
-if DX_COND_xml
-
-DX_CLEAN_XML = @DX_DOCDIR@/xml
-
-endif DX_COND_xml
-
-## ----------------------------- ##
-## Rules specific for PS output. ##
-## ----------------------------- ##
-
-if DX_COND_ps
-
-DX_CLEAN_PS = @DX_DOCDIR@/@PACKAGE@.ps
-
-DX_PS_GOAL = doxygen-ps
-
-doxygen-ps: @DX_DOCDIR@/@PACKAGE@.ps
-
-@DX_DOCDIR@/@PACKAGE@.ps: @DX_DOCDIR@/@PACKAGE@.tag
-	cd @DX_DOCDIR@/latex; \
-	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
-	$(DX_LATEX) refman.tex; \
-	$(MAKEINDEX_PATH) refman.idx; \
-	$(DX_LATEX) refman.tex; \
-	countdown=5; \
-	while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
-	                  refman.log > /dev/null 2>&1 \
-	   && test $$countdown -gt 0; do \
-	    $(DX_LATEX) refman.tex; \
-	    countdown=`expr $$countdown - 1`; \
-	done; \
-	$(DX_DVIPS) -o ../@PACKAGE@.ps refman.dvi
-
-endif DX_COND_ps
-
-## ------------------------------ ##
-## Rules specific for PDF output. ##
-## ------------------------------ ##
-
-if DX_COND_pdf
-
-DX_CLEAN_PDF = @DX_DOCDIR@/@PACKAGE@.pdf
-
-DX_PDF_GOAL = doxygen-pdf
-
-doxygen-pdf: @DX_DOCDIR@/@PACKAGE@.pdf
-
-@DX_DOCDIR@/@PACKAGE@.pdf: @DX_DOCDIR@/@PACKAGE@.tag
-	cd @DX_DOCDIR@/latex; \
-	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
-	$(DX_PDFLATEX) refman.tex; \
-	$(DX_MAKEINDEX) refman.idx; \
-	$(DX_PDFLATEX) refman.tex; \
-	countdown=5; \
-	while $(DX_EGREP) 'Rerun (LaTeX|to get cross-references right)' \
-	                  refman.log > /dev/null 2>&1 \
-	   && test $$countdown -gt 0; do \
-	    $(DX_PDFLATEX) refman.tex; \
-	    countdown=`expr $$countdown - 1`; \
-	done; \
-	mv refman.pdf ../@PACKAGE@.pdf
-
-endif DX_COND_pdf
-
-## ------------------------------------------------- ##
-## Rules specific for LaTeX (shared for PS and PDF). ##
-## ------------------------------------------------- ##
-
-if DX_COND_latex
-
-DX_CLEAN_LATEX = @DX_DOCDIR@/latex
-
-endif DX_COND_latex
-
-.INTERMEDIATE: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
-
-doxygen-run: @DX_DOCDIR@/@PACKAGE@.tag
-
-doxygen-doc: doxygen-run $(DX_PS_GOAL) $(DX_PDF_GOAL)
-
-@DX_DOCDIR@/@PACKAGE@.tag: $(DX_CONFIG) $(pkginclude_HEADERS)
-	rm -rf @DX_DOCDIR@
-	$(DX_ENV) $(DX_DOXYGEN) $(srcdir)/$(DX_CONFIG)
-
-DX_CLEANFILES = \
-    @DX_DOCDIR@/@PACKAGE@.tag \
-    -r \
-    $(DX_CLEAN_HTML) \
-    $(DX_CLEAN_CHM) \
-    $(DX_CLEAN_CHI) \
-    $(DX_CLEAN_MAN) \
-    $(DX_CLEAN_RTF) \
-    $(DX_CLEAN_XML) \
-    $(DX_CLEAN_PS) \
-    $(DX_CLEAN_PDF) \
-    $(DX_CLEAN_LATEX)
-
-endif DX_COND_doc
--- a/lib/Bitwise.cc
+++ b/lib/Bitwise.cc
@ -1,49 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/Bitwise.cc
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#include <iostream>
-#include <Bitwise.h>
-#include <bitset>
-#include <climits>
-
-namespace Grid {
-
-void show_binaryrep(const unsigned char* a, size_t size) {
-  const unsigned char* beg = a;
-  const unsigned char* end = a + size;
-  unsigned int ctr = 0;
-  while (beg != end) {
-    std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
-    ctr++;
-    if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
-  }
-  std::cout << '\n';
-}
-
-} // namespace 
-
--- a/lib/Bitwise.h
+++ b/lib/Bitwise.h
@ -1,76 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/Bitwise.h
-
-Copyright (C) 2016
-
-Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_BITWISE_H
-#define GRID_BITWISE_H
-
-#include <cassert>
-#include <cfloat>
-#include <bitset>
-#include <climits>
-#include <Config.h>
-
-#ifdef GRID_DEFAULT_PRECISION_SINGLE
-#define GRID_REAL_BYTES 4
-#endif
-#ifdef GRID_DEFAULT_PRECISION_DOUBLE
-#define GRID_REAL_BYTES 8
-#endif
-
-
-namespace Grid {
-
-void show_binaryrep(const unsigned char* a, size_t size);
-
-template <typename T>
-void show_binaryrep(const T& a) {
-  const char* beg = reinterpret_cast<const char*>(&a);
-  const char* end = beg + sizeof(a);
-  unsigned int ctr = 0;
-  while (beg != end) {
-  	std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
-  	ctr++;
-  	if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
-  }
-  std::cout << '\n';
-}
-
-template <typename T>
-void bitwise_xor(T& l, T& r, unsigned char* xors) {
-  assert(sizeof(l) == sizeof(r));
-  unsigned char* org = reinterpret_cast<unsigned char*>(&l);
-  unsigned char* cur = reinterpret_cast<unsigned char*>(&r);
-  int words = sizeof(l) / sizeof(*org);
-  unsigned char result = 0;
-  for (int w = 0; w < words; w++) xors[w] = (org[w] ^ cur[w]);
-}
-
-}; // namespace 
-
-
-#endif
--- a/lib/FFT.h
+++ b/lib/FFT.h
@ -244,10 +244,7 @@ namespace Grid {
            pokeLocalSite(s,pgbuf,cbuf);
          }
        }
-        if (p != processors[dim] - 1)
-        {
-          result = Cshift(result,dim,L);
-        }
+        result = Cshift(result,dim,L);
      }
      
      // Loop over orthog coords
@ -290,10 +287,10 @@ namespace Grid {
          cgbuf = clbuf;
          cgbuf[dim] = clbuf[dim]+L*pc;
          peekLocalSite(s,pgbuf,cgbuf);
+          s = s * div;
          pokeLocalSite(s,result,clbuf);
        }
      }
-      result = result*div;
      
      // destroying plan
      FFTW<scalar>::fftw_destroy_plan(p);
--- a/lib/Grid.h
+++ b/lib/Grid.h
@ -62,7 +62,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/serialisation/Serialisation.h>
 #include "Config.h"
 #include <Grid/Timer.h>
-#include <Grid/Bitwise.h>
 #include <Grid/PerfCount.h>
 #include <Grid/Log.h>
 #include <Grid/AlignedAllocator.h>
--- a/lib/Log.cc
+++ b/lib/Log.cc
@ -100,7 +100,7 @@ void Grid_quiesce_nodes(void) {
  me = shmem_my_pe();
 #endif
  if (me) {
-    std::cout.setstate(std::ios::badbit);// mute all nodes except 0
+    std::cout.setstate(std::ios::badbit);
  }
 }

--- a/lib/Threads.h
+++ b/lib/Threads.h
@ -38,21 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_OMP
 #include <omp.h>
 #ifdef GRID_NUMA
-  #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
-  #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
-  #else
-  #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(runtime)")
-  #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
+#define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
+#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
+#else
+#define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(runtime)")
+#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
 #endif
-#define PARALLEL_NESTED_LOOP2    _Pragma("omp parallel for collapse(2)")
-#define PARALLEL_REGION          _Pragma("omp parallel")
-#define PARALLEL_FOR_LOOP_STATIC _Pragma("omp parallel for schedule(static)")
+#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
+#define PARALLEL_REGION       _Pragma("omp parallel")
 #else
 #define PARALLEL_FOR_LOOP
 #define PARALLEL_FOR_LOOP_INTERN
 #define PARALLEL_NESTED_LOOP2
 #define PARALLEL_REGION
-#define PARALLEL_FOR_LOOP_STATIC
 #endif

 namespace Grid {
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@ -9,7 +9,6 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
-Author: Guido Cossu <guido.cossu@ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@ -34,21 +33,6 @@ directory

 namespace Grid {

-struct CG_state {
-  bool do_repro;
-  std::vector<RealD> residuals;
-
-  CG_state() {reset();}
-
-  void reset(){
-    do_repro = false;
-    residuals.clear();
-  }
-};
-
-
-enum CGexec_mode{ Default, ReproducibilityTest };
-
 /////////////////////////////////////////////////////////////
 // Base classes for iterative processes based on operators
 // single input vec, single output vec.
@ -61,30 +45,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
-
-  // Reproducibility controls
-  bool ReproTest;
-  CG_state CGState; //to check reproducibility by repeating the CG
-  ReproducibilityState<typename Field::vector_object> ReprTest; // for the inner proucts
-
-  // Constructor
-  ConjugateGradient(RealD tol, Integer maxit, CGexec_mode Mode = Default)
-    : Tolerance(tol),MaxIterations(maxit){
-    switch(Mode)
-    {
-      case Default  : 
-      ErrorOnNoConverge = true;
-      ReproTest = false;
-      case ReproducibilityTest :
-      ErrorOnNoConverge = false;
-      ReproTest = true; 
-    }
-  };
-
-  void set_reproducibility_interval(unsigned int interval){
-    ReprTest.interval = interval;
-  }
-
+  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
+      : Tolerance(tol),
+        MaxIterations(maxit),
+        ErrorOnNoConverge(err_on_no_conv){};

  void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
                  Field &psi) {
@ -96,37 +60,34 @@ class ConjugateGradient : public OperatorFunction<Field> {
    Field p(src);
    Field mmp(src);
    Field r(src);
-    Field psi_start(psi);// save for the repro test
-
-    if (CGState.do_repro && ReproTest)
-        std::cout << GridLogMessage << "Starting reproducibility test, full check every "
-                  << ReprTest.interval << " calls" << std::endl;
-
-    if(!ReprTest.do_check)
-        ReprTest.reset();
-    ReprTest.enable_reprocheck=ReproTest;
-
-

    // Initial residual computation & set up
-    RealD guess = norm2(psi, ReprTest);
+    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);

-    Linop.HermOpAndNorm(psi, mmp, d, b);// eventually split this for the norm check
    
+    Linop.HermOpAndNorm(psi, mmp, d, b);
+    
+
    r = src - mmp;
    p = r;

-    a = norm2(p, ReprTest);
+    a = norm2(p);
    cp = a;
-    ssq = norm2(src, ReprTest);
+    ssq = norm2(src);

-    std::cout << GridLogIterative << "ConjugateGradient: guess " << guess << std::endl;
-    std::cout << GridLogIterative << "ConjugateGradient:   src " << ssq << std::endl;
-    std::cout << GridLogIterative << "ConjugateGradient:    mp " << d << std::endl;
-    std::cout << GridLogIterative << "ConjugateGradient:   mmp " << b << std::endl;
-    std::cout << GridLogIterative << "ConjugateGradient:  cp,r " << cp << std::endl;
-    std::cout << GridLogIterative << "ConjugateGradient:     p " << a << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient: guess " << guess << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient:   src " << ssq << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient:    mp " << d << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient:   mmp " << b << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient:  cp,r " << cp << std::endl;
+    std::cout << GridLogIterative << std::setprecision(4)
+              << "ConjugateGradient:     p " << a << std::endl;

    RealD rsq = Tolerance * Tolerance * ssq;

@ -146,10 +107,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
-      c = cp;// old residual
+      c = cp;

      MatrixTimer.Start();
-      Linop.HermOpAndNorm(p, mmp, d, qq);// mmp = Ap, d=pAp
+      Linop.HermOpAndNorm(p, mmp, d, qq);
      MatrixTimer.Stop();

      LinalgTimer.Start();
@ -157,31 +118,14 @@ class ConjugateGradient : public OperatorFunction<Field> {
      //  ComplexD dck  = innerProduct(p,mmp);

      a = c / d;
-      b_pred = a * (a * qq - d) / c;// a check
+      b_pred = a * (a * qq - d) / c;

-
-      axpy(r, -a, mmp, r);// new residual r = r_old - a * Ap
-      cp = norm2(r, ReprTest); // bookkeeping this norm
-      if (ReproTest && !CGState.do_repro) {
-        CGState.residuals.push_back(cp);  // save residuals state
-                std::cout << GridLogIterative << "ReproTest: Saving state" << std::endl;
-        }
-      if (ReproTest && CGState.do_repro){
-        // check that the residual agrees with the previous run
-        std::cout << GridLogIterative << "ReproTest: Checking state k=" << k << std::endl;
-        if (cp != CGState.residuals[k-1]){
-                std::cout << GridLogMessage << "Failing reproducibility test";
-                std::cout << GridLogMessage << " at k=" << k << std::endl;
-                std::cout << GridLogMessage << "saved residual = " << CGState.residuals[k-1] 
-                        << " cp = " << cp << std::endl;
-                exit(1);  // exit after the first failure
-        }
-      }
+      cp = axpy_norm(r, -a, mmp, r);
      b = cp / c;

      // Fuse these loops ; should be really easy
-      psi = a * p + psi; // update solution
-      p = p * b + r;  // update search direction
+      psi = a * p + psi;
+      p = p * b + r;

      LinalgTimer.Stop();
      std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
@ -212,22 +156,6 @@ class ConjugateGradient : public OperatorFunction<Field> {

        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

-        if (! (CGState.do_repro && ReproTest)){
-                CGState.do_repro = true;
-                ReprTest.do_check = true;
-                ReprTest.reset_counter();
-                this->operator()(Linop, src, psi_start);// run the repro test
-                if (ReprTest.success)
-                	std::cout << GridLogMessage << "Reproducibility test passed" << std::endl;
-                else{
-                	std::cout << GridLogMessage << "Reproducibility test failed" << std::endl;
-                	exit(1);
-                }
-        }
-
-        // Clear state
-        CGState.reset();
-        ReprTest.reset();
        return;
      }
    }
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -59,7 +59,7 @@ public:

    int Nstop;   // Number of evecs checked for convergence
    int Nk;      // Number of converged sought
-    int Np;      // Np -- Number of spare vecs in krylov space
+    int Np;      // Np -- Number of spare vecs in kryloc space
    int Nm;      // Nm -- total number of vectors

    RealD eresid;
@ -1080,10 +1080,10 @@ say con = 2
 **/

 template<class T>
-static void Lock(DenseMatrix<T> &H, 	// Hess mtx	
-		 DenseMatrix<T> &Q, 	// Lock Transform
-		 T val, 		// value to be locked
-		 int con, 	// number already locked
+static void Lock(DenseMatrix<T> &H, 	///Hess mtx	
+		 DenseMatrix<T> &Q, 	///Lock Transform
+		 T val, 		///value to be locked
+		 int con, 	///number already locked
 		 RealD small,
 		 int dfg,
 		 bool herm)
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@ -65,7 +65,6 @@ const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return
 const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)     { return _processors; };
 int                      CartesianCommunicator::ProcessorCount(void)    { return _Nprocessors; };

-
 ////////////////////////////////////////////////////////////////////////////////
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
@ -90,11 +89,11 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)

 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                       void *xmit,
-                                                       int xmit_to_rank,
-                                                       void *recv,
-                                                       int recv_from_rank,
-                                                       int bytes)
+						       void *xmit,
+						       int xmit_to_rank,
+						       void *recv,
+						       int recv_from_rank,
+						       int bytes)
 {
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
 }
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@ -68,8 +68,6 @@ class CartesianCommunicator {
  static MPI_Comm communicator_world;
         MPI_Comm communicator;
  typedef MPI_Request CommsRequest_t;
-  static char name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
-  static int length; 
 #else 
  typedef int CommsRequest_t;
 #endif
@ -151,7 +149,6 @@ class CartesianCommunicator {
  const std::vector<int> & ProcessorGrid(void)     ;
  int                      ProcessorCount(void)    ;

-  void        		 PrintRankInfo(void)     ;
  ////////////////////////////////////////////////////////////////////////////////
  // very VERY rarely (Log, serial RNG) we need world without a grid
  ////////////////////////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@ -35,8 +35,6 @@ namespace Grid {
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
-char CartesianCommunicator::name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
-int CartesianCommunicator::length; 

 // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
@ -46,8 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
    MPI_Init(argc,argv);
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-
-  MPI_Get_processor_name(name, &length);
  ShmInitGeneric();
 }

@ -210,10 +206,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
  assert(ierr==0);
 }

-void CartesianCommunicator::PrintRankInfo(){
-    std::cout << "Grid: Rank "<< _processor << "  -  Physical node name: " << name << std::endl;
 }

-
-}// end of namespace
-
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@ -576,10 +576,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
  assert(ierr==0);
 }

-  void CartesianCommunicator::PrintRankInfo(){
-    std::cout << "Grid: Rank "<< _processor << "  -  Physical node name: " << name << std::endl;
-  }
-
-  
 }

--- a/lib/communicator/Communicator_mpi3_leader.cc
+++ b/lib/communicator/Communicator_mpi3_leader.cc
@ -869,9 +869,6 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
 }

-  void CartesianCommunicator::PrintRankInfo(){
-    std::cout << "Grid: Rank "<< _processor << "  -  Physical node name: " << name << std::endl;
-  }

 };

--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@ -37,11 +37,6 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
  ShmInitGeneric();
 }

-void CartesianCommunicator::PrintRankInfo(){
-    std::cout << GridLogMessage << "No Rank Info available" << std::endl;
-}
-
-
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _processors = processors;
@ -65,10 +60,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}

 void CartesianCommunicator::SendRecvPacket(void *xmit,
-                                           void *recv,
-                                           int xmit_to_rank,
-                                           int recv_from_rank,
-                                           int bytes)
+					   void *recv,
+					   int xmit_to_rank,
+					   int recv_from_rank,
+					   int bytes)
 {
  assert(0);
 }
@ -76,19 +71,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,

 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
-                                           int dest,
-                                           void *recv,
-                                           int from,
-                                           int bytes)
+					   int dest,
+					   void *recv,
+					   int from,
+					   int bytes)
 {
  assert(0);
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes)
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes)
 {
  assert(0);
 }
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@ -333,9 +333,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
  }
 }

-  void CartesianCommunicator::PrintRankInfo(){
-    std::cout << GridLogMessage << "SHMEM: Rank Info not implemented yet" << std::endl;
-  }
-
 }

--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@ -31,87 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define GRID_LATTICE_REDUCTION_H

 namespace Grid {
-
-template <class T>
-class ReproducibilityState {
- public:
-  typedef typename T::vector_type vector_type;
-  typedef std::vector<vector_type, alignedAllocator<vector_type> >  sum_type;
-  unsigned int  n_call;
-  bool          do_check;
-  bool          enable_reprocheck;
-  bool          success;
-  unsigned int  interval; 
-  std::vector<sum_type> th_states;
-
-  void reset_counter() { n_call = 0; }
-  void reset() {
-    th_states.clear();
-    do_check          = false;
-    enable_reprocheck = false;
-    success           = true;
-    n_call            = 0;
-  };
-
-  ReproducibilityState():interval(1) { 
-    reset(); 
-  }
-
-  void check(GridBase* grid, sum_type &sumarray){
-    ///////////////////////  Reproducibility section, not threaded on purpouse
-    if (enable_reprocheck) {
-      if (do_check && (n_call % interval) == 0) {
-        for (int thread = 0; thread < sumarray.size(); thread++) {
-          int words = sizeof(sumarray[thread])/sizeof(unsigned char);
-          unsigned char xors[words];
-          bitwise_xor(sumarray[thread], th_states[n_call][thread],xors);
-          // OR all words
-          unsigned char res = 0;
-          for (int w = 0; w < words; w++) res = res | xors[w];
-
-          Grid_unquiesce_nodes();
-          int rank = 0;
-          while (rank < grid->_Nprocessors){
-            if (rank == grid->ThisRank() ){
-              if ( res ) {
-                std::cout << "Reproducibility failure report" << std::endl;
-                grid->PrintRankInfo();
-                std::cout << "Call: "<< n_call << " Thread: " << thread << std::endl;
-                std::cout << "Size of states: " << th_states.size() << std::endl;
-                std::cout << std::setprecision(GRID_REAL_DIGITS+1) << std::scientific;
-                std::cout << "Saved partial sum  : " << th_states[n_call][thread] << std::endl;
-                std::cout << "Current partial sum: " << sumarray[thread] << std::endl;
-                std::cout << "Saved state "  << std::endl; show_binaryrep(th_states[n_call][thread]);
-                std::cout << "Current state" << std::endl; show_binaryrep(sumarray[thread]);
-                std::cout << "XOR result"    << std::endl; show_binaryrep(xors, words);
-              		//std::cout << std::defaultfloat;  //not supported by some compilers
-                std::cout << std::setprecision(6);
-                success = false; 
-              }
-            }
-            rank++;
-            grid->Barrier();
-          }
-          Grid_quiesce_nodes();                         
-        }
-
-      } else if (!do_check)
-      {
-        std::cout << GridLogDebug << "Saving thread state for inner product. Call n. " 
-        << n_call << std::endl;
-        th_states.resize(n_call+1);
-        th_states[n_call].resize(grid->SumArraySize());
-          th_states[n_call] = sumarray;  // save threads state
-          //n_call++;
-        }
-        n_call++;
-      }
-    }
-
-};
-
-
-
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
 #endif     
@ -120,25 +39,12 @@ class ReproducibilityState {
    // Deterministic Reduction operations
    ////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-        ReproducibilityState<vobj> repr;
-        ComplexD nrm = innerProduct(arg, arg, repr);
-        return std::real(nrm); 
-    }
-
-    template <class vobj>
-    inline RealD norm2(const Lattice<vobj> &arg, ReproducibilityState<vobj>& rep) {
-      ComplexD nrm = innerProduct(arg, arg, rep);
-      return std::real(nrm);
-    }
+    ComplexD nrm = innerProduct(arg,arg);
+    return std::real(nrm); 
+  }

    template<class vobj>
-    inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right){
-      ReproducibilityState<vobj> repr;
-      return innerProduct(left, right, repr);
-    } 
-    
-    template<class vobj>
-    inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right, ReproducibilityState<vobj>& repr) 
+    inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
    {
      typedef typename vobj::scalar_type scalar_type;
      typedef typename vobj::vector_type vector_type;
@ -148,28 +54,24 @@ class ReproducibilityState {

      std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
      for(int i=0;i<grid->SumArraySize();i++){
-        sumarray[i]=zero;
+	sumarray[i]=zero;
      }

-      PARALLEL_FOR_LOOP_STATIC //request statically scheduled threads for reproducibility
+PARALLEL_FOR_LOOP
      for(int thr=0;thr<grid->SumArraySize();thr++){
-        int nwork, mywork, myoff;
-        GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
-        
-        decltype(innerProduct(left._odata[0],right._odata[0])) vnrm = zero; // private to thread; sub summation
+	int nwork, mywork, myoff;
+	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
+	
+	decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
        for(int ss=myoff;ss<mywork+myoff; ss++){
-          vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
-        }
-        sumarray[thr]=TensorRemove(vnrm) ;
+	  vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
+	}
+	sumarray[thr]=TensorRemove(vnrm) ;
      }
-      
-      /////////// Reproducibility
-      repr.check(grid, sumarray);
-      ///////////////////////////
-
+    
      vector_type vvnrm; vvnrm=zero;  // sum across threads
      for(int i=0;i<grid->SumArraySize();i++){
-        vvnrm = vvnrm+sumarray[i];
+	vvnrm = vvnrm+sumarray[i];
      } 
      nrm = Reduce(vvnrm);// sum across simd
      right._grid->GlobalSum(nrm);
@ -177,26 +79,26 @@ class ReproducibilityState {
    }

    template<class Op,class T1>
-    inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
-    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
+      inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
    {
      return sum(closure(expr));
    }

    template<class Op,class T1,class T2>
-    inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
-    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
+      inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
    {
      return sum(closure(expr));
    }


    template<class Op,class T1,class T2,class T3>
-    inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-    ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
-     eval(0,std::get<1>(expr.second)),
-     eval(0,std::get<2>(expr.second))
-     ))::scalar_object
+      inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+				 eval(0,std::get<1>(expr.second)),
+				 eval(0,std::get<2>(expr.second))
+				 ))::scalar_object
    {
      return sum(closure(expr));
    }
@ -209,24 +111,24 @@ class ReproducibilityState {

      std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
      for(int i=0;i<grid->SumArraySize();i++){
-        sumarray[i]=zero;
+	sumarray[i]=zero;
      }

-      PARALLEL_FOR_LOOP
+PARALLEL_FOR_LOOP
      for(int thr=0;thr<grid->SumArraySize();thr++){
-        int nwork, mywork, myoff;
-        GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
+	int nwork, mywork, myoff;
+	GridThread::GetWork(grid->oSites(),thr,mywork,myoff);

-        vobj vvsum=zero;
+	vobj vvsum=zero;
        for(int ss=myoff;ss<mywork+myoff; ss++){
-          vvsum = vvsum + arg._odata[ss];
-        }
-        sumarray[thr]=vvsum;
+	  vvsum = vvsum + arg._odata[ss];
+	}
+	sumarray[thr]=vvsum;
      }

      vobj vsum=zero;  // sum across threads
      for(int i=0;i<grid->SumArraySize();i++){
-        vsum = vsum+sumarray[i];
+	vsum = vsum+sumarray[i];
      } 

      typedef typename vobj::scalar_object sobj;
@ -236,7 +138,7 @@ class ReproducibilityState {
      extract(vsum,buf);

      for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
-        arg._grid->GlobalSum(ssum);
+      arg._grid->GlobalSum(ssum);

      return ssum;
    }
@ -244,23 +146,23 @@ class ReproducibilityState {


 template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
-    {
-      typedef typename vobj::scalar_object sobj;
-      GridBase  *grid = Data._grid;
-      assert(grid!=NULL);
+{
+  typedef typename vobj::scalar_object sobj;
+  GridBase  *grid = Data._grid;
+  assert(grid!=NULL);

  // FIXME
  // std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;

-      const int    Nd = grid->_ndimension;
-      const int Nsimd = grid->Nsimd();
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();

-      assert(orthogdim >= 0);
-      assert(orthogdim < Nd);
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);

-      int fd=grid->_fdimensions[orthogdim];
-      int ld=grid->_ldimensions[orthogdim];
-      int rd=grid->_rdimensions[orthogdim];
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];

  std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
  std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@ -195,7 +195,6 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
-
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@ -204,20 +203,6 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;

-// Ls vectorised 
-typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
-typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
-typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
-
-typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
-typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
-typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
-
-typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
-typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
-typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
-
-
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@ -269,7 +254,6 @@ typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;


-
  }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -62,50 +62,6 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
  }
 }
-
-
-template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
-{
-  this->Report();
-  std::vector<int> latt = GridDefaultLatt();          
-  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-  RealD NP     = this->_FourDimGrid->_Nprocessors;
-  if ( M5Dcalls > 0 ) {
-    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
-
-    // Flops = 6.0*(Nc*Ns) *Ls*vol
-    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  }
-
-  if ( MooeeInvCalls > 0 ) {
-
-    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
-    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
-
-    // Flops = 9*12*Ls*vol/2
-    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
-  }
-
-}
-template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
-{
-  this->ZeroCounters();
-  M5Dflops=0;
-  M5Dcalls=0;
-  M5Dtime=0;
-  MooeeInvFlops=0;
-  MooeeInvCalls=0;
-  MooeeInvTime=0;
-}
-
-
 template<class Impl>  
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@ -120,18 +120,6 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

-      
-
-     void CayleyReport(void);
-     void CayleyZeroCounters(void);
-
-     double M5Dflops;
-     double M5Dcalls;
-     double M5Dtime;
-
-     double MooeeInvFlops;
-     double MooeeInvCalls;
-     double MooeeInvTime;

    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@ -51,9 +51,6 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    for(int s=0;s<Ls;s++){
@ -79,7 +76,6 @@ PARALLEL_FOR_LOOP
      }
    }
  }
-  M5Dtime+=usecond();
 }

 template<class Impl>  
@ -95,9 +91,6 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;

-  // Flops = 6.0*(Nc*Ns) *Ls*vol
-  M5Dcalls++;
-  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
@ -123,7 +116,6 @@ PARALLEL_FOR_LOOP
      }
    }
  }
-  M5Dtime+=usecond();
 }

 template<class Impl>
@ -134,14 +126,10 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &

  chi.checkerboard=psi.checkerboard;

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
-
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];

-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
    // Apply (L^{\prime})^{-1}
    chi[ss]=psi[ss]; // chi[0]=psi[0]
    for(int s=1;s<Ls;s++){
@ -167,9 +155,6 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
    }
  }
-
-  MooeeInvTime+=usecond();
-
 }

 template<class Impl>
@ -181,8 +166,6 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
  assert(psi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;

-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();

 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@ -214,9 +197,6 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
-
-  MooeeInvTime+=usecond();
-
 }

 #ifdef CAYLEY_DPERP_CACHE
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -60,7 +60,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
  int LLs  = grid->_rdimensions[0];
-  const int nsimd= Simd::Nsimd();
+  int nsimd= Simd::Nsimd();

  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
@ -86,138 +86,35 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
    d_p[ss] = diag[s];
  }}

-
-  M5Dcalls++;
-  M5Dtime-=usecond();
-
-  assert(Nc==3);
-
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-#if 0
-      alignas(64) SiteHalfSpinor hp;
-      alignas(64) SiteHalfSpinor hm;
-      alignas(64) SiteSpinor fp;
-      alignas(64) SiteSpinor fm;

-      for(int v=0;v<LLs;v++){
+    alignas(64) SiteHalfSpinor hp;
+    alignas(64) SiteHalfSpinor hm;
+    alignas(64) SiteSpinor fp;
+    alignas(64) SiteSpinor fm;

-	int vp=(v+1)%LLs;
-	int vm=(v+LLs-1)%LLs;
+    for(int v=0;v<LLs;v++){

-	spProj5m(hp,psi[ss+vp]);
-	spProj5p(hm,psi[ss+vm]);
+      int vp=(v+1)%LLs;
+      int vm=(v+LLs-1)%LLs;

-	if ( vp<=v ) rotate(hp,hp,1);
-	if ( vm>=v ) rotate(hm,hm,nsimd-1);
-	
-	hp=0.5*hp;
-        hm=0.5*hm;
+      spProj5m(hp,psi[ss+vp]);
+      spProj5p(hm,psi[ss+vm]);
+      
+      if ( vp<=v ) rotate(hp,hp,1);
+      if ( vm>=v ) rotate(hm,hm,nsimd-1);

-	spRecon5m(fp,hp);
-	spRecon5p(fm,hm);
+      hp=hp*0.5;
+      hm=hm*0.5;
+      spRecon5m(fp,hp);
+      spRecon5p(fm,hm);

-	chi[ss+v] = d[v]*phi[ss+v];
-	chi[ss+v] = chi[ss+v]     +u[v]*fp;
-	chi[ss+v] = chi[ss+v]     +l[v]*fm;
+      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
+      chi[ss+v] = chi[ss+v]     +l[v]*fm;

-      }
-#else
-      for(int v=0;v<LLs;v++){
-
-	vprefetch(psi[ss+v+LLs]);
-	//	vprefetch(phi[ss+v+LLs]);
-
-	int vp= (v==LLs-1) ? 0     : v+1;
-	int vm= (v==0    ) ? LLs-1 : v-1;
-	
-	Simd hp_00 = psi[ss+vp]()(2)(0); 
-	Simd hp_01 = psi[ss+vp]()(2)(1); 
-	Simd hp_02 = psi[ss+vp]()(2)(2); 
-	Simd hp_10 = psi[ss+vp]()(3)(0); 
-	Simd hp_11 = psi[ss+vp]()(3)(1); 
-	Simd hp_12 = psi[ss+vp]()(3)(2); 
-	
-	Simd hm_00 = psi[ss+vm]()(0)(0); 
-	Simd hm_01 = psi[ss+vm]()(0)(1); 
-	Simd hm_02 = psi[ss+vm]()(0)(2); 
-	Simd hm_10 = psi[ss+vm]()(1)(0); 
-	Simd hm_11 = psi[ss+vm]()(1)(1); 
-	Simd hm_12 = psi[ss+vm]()(1)(2); 
-
-	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
-	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
-
-	if ( vp<=v ) {
-	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
-	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
-	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
-	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
-	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
-	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
-	}
-	if ( vm>=v ) {
-	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
-	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
-	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
-	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
-	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
-	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
-	}
-
-	/*
-	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
-	*/	
-	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
-	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
-	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
-	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
-	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
-	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
-	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
-	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
-	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
-	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
-	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
-	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
-
-	
-	//	if ( ss==0){
-	/*
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
-	}
-	*/
-	vstream(chi[ss+v]()(0)(0),p_00);
-	vstream(chi[ss+v]()(0)(1),p_01);
-	vstream(chi[ss+v]()(0)(2),p_02);
-	vstream(chi[ss+v]()(1)(0),p_10);
-	vstream(chi[ss+v]()(1)(1),p_11);
-	vstream(chi[ss+v]()(1)(2),p_12);
-	vstream(chi[ss+v]()(2)(0),p_20);
-	vstream(chi[ss+v]()(2)(1),p_21);
-	vstream(chi[ss+v]()(2)(2),p_22);
-	vstream(chi[ss+v]()(3)(0),p_30);
-	vstream(chi[ss+v]()(3)(1),p_31);
-	vstream(chi[ss+v]()(3)(2),p_32);
-
-      }
-#endif
+    }
  }
-  M5Dtime+=usecond();
 }

 template<class Impl>  
@ -257,8 +154,6 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
    d_p[ss] = diag[s];
  }}

-  M5Dcalls++;
-  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs

@ -288,8 +183,8 @@ PARALLEL_FOR_LOOP

    }
  }
-  M5Dtime+=usecond();
 }
+
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@ -355,11 +250,13 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
    }
  }
  
-  MooeeInvCalls++;
-  MooeeInvTime-=usecond();
  // Dynamic allocate on stack to get per thread without serialised heap acces
-#pragma omp parallel  
-  {
+PARALLEL_FOR_LOOP
+  for(auto site=0;site<vol;site++){
+    
+    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
+    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));

    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
@ -370,9 +267,6 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;

-#pragma omp for 
-  for(auto site=0;site<vol;site++){
-
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
@ -400,8 +294,6 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
      chi[lex] = SiteChi[s]*0.5;
    }
  }
-  }
-  MooeeInvTime+=usecond();
 }

 INSTANTIATE_DPERP(DomainWallVec5dImplD);
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@ -194,11 +194,6 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;

-    RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
-
-
   }

  if ( DerivCalls > 0 ) {
@ -214,15 +209,12 @@ void WilsonFermion5D<Impl>::Report(void)
    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
-
-    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
-    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
-    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl;  }
+  }

  if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil"    <<std::endl;  Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl;  Stencil.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl;  StencilOdd.Report();
  }
 }

--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@ -66,9 +66,6 @@ namespace Optimization {
    double f[4];
  };

-
-
-
  struct Vsplat{
    //Complex float
    inline __m256 operator()(float a, float b){
@ -170,7 +167,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@ -198,7 +195,7 @@ namespace Optimization {
    }
    //Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA4)
          __m128i a0,a1;
          __m128i b0,b1;
          a0 = _mm256_extractf128_si256(a,0);
@ -219,7 +216,7 @@ namespace Optimization {
  struct MultComplex{
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
-#if defined (AVX1)
+#if defined (AVX1) 
      __m256 ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
@ -236,7 +233,7 @@ namespace Optimization {
      a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)  || defined (AVXFMA)
+#if defined (AVX2)
      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -267,7 +264,7 @@ namespace Optimization {
 	IF IMM0[3] = 0
 	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
      */
-#if defined (AVX1)
+#if defined (AVX1) 
      __m256d ymm0,ymm1,ymm2;
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
@ -282,7 +279,7 @@ namespace Optimization {
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
      return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2) || defined (AVXFMA)
+#if defined (AVX2)
      __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@ -323,7 +320,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
      a= _mm256_macc_ps(b,c,a);
 #endif
-#if defined (AVX2) || defined (AVXFMA)
+#if defined (AVX2)
      a= _mm256_fmadd_ps( b, c, a);
 #endif
    }
@ -335,7 +332,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
      a= _mm256_macc_pd(b,c,a);
 #endif
-#if defined (AVX2) || defined (AVXFMA)
+#if defined (AVX2)
      a= _mm256_fmadd_pd( b, c, a);
 #endif
    }
@ -350,7 +347,7 @@ namespace Optimization {
    }
    // Integer
    inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA)
+#if defined (AVX1) 
      __m128i a0,a1;
      __m128i b0,b1;
      a0 = _mm256_extractf128_si256(a,0);
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@ -27,6 +27,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+//----------------------------------------------------------------------
+/*! @file Grid_knc.h
+  @brief Optimization libraries for AVX512 instructions set for KNC
+
+  Using intrinsics
+*/
+// Time-stamp: <2015-06-09 14:27:28 neo>
+//----------------------------------------------------------------------
+
 #include <immintrin.h>


@ -86,13 +95,13 @@ namespace Optimization {
  struct Vstream{
    //Float
    inline void operator()(float * a, __m512 b){
-      _mm512_stream_ps(a,b);
-      //      _mm512_store_ps(a,b);
+      //_mm512_stream_ps(a,b);
+      _mm512_store_ps(a,b);
    }
    //Double
    inline void operator()(double * a, __m512d b){
-      _mm512_stream_pd(a,b);
-      //      _mm512_store_pd(a,b);
+      //_mm512_stream_pd(a,b);
+      _mm512_store_pd(a,b);
    }

  };
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@ -6,7 +6,8 @@

    Copyright (C) 2015

-Author: Antonin Portelli <antonin.portelli@me.com>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@ -26,352 +27,133 @@ Author: Antonin Portelli <antonin.portelli@me.com>
    *************************************************************************************/
    /*  END LEGAL */

-static_assert(GEN_SIMD_WIDTH % 16u == 0, "SIMD vector size is not an integer multiple of 16 bytes");
-
-//#define VECTOR_LOOPS
-
-// playing with compiler pragmas
-#ifdef VECTOR_LOOPS
-#ifdef __clang__
-#define VECTOR_FOR(i, w, inc)\
-_Pragma("clang loop unroll(full) vectorize(enable) interleave(enable) vectorize_width(w)")\
-for (unsigned int i = 0; i < w; i += inc)
-#elif defined __INTEL_COMPILER
-#define VECTOR_FOR(i, w, inc)\
-_Pragma("simd vectorlength(w*8)")\
-for (unsigned int i = 0; i < w; i += inc)
-#else
-#define VECTOR_FOR(i, w, inc)\
-for (unsigned int i = 0; i < w; i += inc)
-#endif
-#else
-#define VECTOR_FOR(i, w, inc)\
-for (unsigned int i = 0; i < w; i += inc)
-#endif
-
 namespace Grid {
 namespace Optimization {

-  // type traits giving the number of elements for each vector type
-  template <typename T> struct W;
-  template <> struct W<double> {
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/16u;
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/8u;
+  template<class vtype>
+  union uconv {
+    float f;
+    vtype v;
  };
-  template <> struct W<float> {
-    constexpr static unsigned int c = GEN_SIMD_WIDTH/8u;
-    constexpr static unsigned int r = GEN_SIMD_WIDTH/4u;
+
+  union u128f {
+    float v;
+    float f[4];
  };
-  
-  // SIMD vector types
-  template <typename T>
-  struct vec {
-    alignas(GEN_SIMD_WIDTH) T v[W<T>::r];
+  union u128d {
+    double v;
+    double f[2];
  };
  
-  typedef vec<float>   vecf;
-  typedef vec<double>  vecd;
-  
  struct Vsplat{
-    // Complex
-    template <typename T>
-    inline vec<T> operator()(T a, T b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::r, 2)
-      {
-        out.v[i]   = a;
-        out.v[i+1] = b;
-      }
-
+    //Complex float
+    inline u128f operator()(float a, float b){
+      u128f out; 
+      out.f[0] = a;
+      out.f[1] = b;
+      out.f[2] = a;
+      out.f[3] = b;
      return out;
    }
-    
-    // Real
-    template <typename T>
-    inline vec<T> operator()(T a){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::r, 1)
-      {
-        out.v[i] = a;
-      }
-      
+    // Real float
+    inline u128f operator()(float a){
+      u128f out; 
+      out.f[0] = a;
+      out.f[1] = a;
+      out.f[2] = a;
+      out.f[3] = a;
      return out;
    }
-    
-    // Integer
+    //Complex double
+    inline u128d operator()(double a, double b){
+      u128d out; 
+      out.f[0] = a;
+      out.f[1] = b;
+      return out;
+    }
+    //Real double
+    inline u128d operator()(double a){
+      u128d out; 
+      out.f[0] = a;
+      out.f[1] = a;
+      return out;
+    }
+    //Integer
    inline int operator()(Integer a){
      return a;
    }
  };

  struct Vstore{
-    // Real
-    template <typename T>
-    inline void operator()(vec<T> a, T *D){
-      *((vec<T> *)D) = a;
+    //Float 
+    inline void operator()(u128f a, float* F){
+      memcpy(F,a.f,4*sizeof(float));
+    }
+    //Double
+    inline void operator()(u128d a, double* D){
+      memcpy(D,a.f,2*sizeof(double));
    }
    //Integer
-    inline void operator()(int a, Integer *I){
-      *I = a;
+    inline void operator()(int a, Integer* I){
+      I[0] = a;
    }

  };

  struct Vstream{
-    // Real
-    template <typename T>
-    inline void operator()(T * a, vec<T> b){
-      *((vec<T> *)a) = b;
+    //Float
+    inline void operator()(float * a, u128f b){
+      memcpy(a,b.f,4*sizeof(float));
    }
+    //Double
+    inline void operator()(double * a, u128d b){
+      memcpy(a,b.f,2*sizeof(double));
+    }
+
+
  };

  struct Vset{
-    // Complex
-    template <typename T>
-    inline vec<T> operator()(std::complex<T> *a){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::c, 1)
-      {
-        out.v[2*i]   = a[i].real();
-        out.v[2*i+1] = a[i].imag();
-      }
-      
+    // Complex float 
+    inline u128f operator()(Grid::ComplexF *a){
+      u128f out; 
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      out.f[2] = a[1].real();
+      out.f[3] = a[1].imag();
      return out;
    }
-    
-    // Real
-    template <typename T>
-    inline vec<T> operator()(T *a){
-      vec<T> out;
-      
-      out = *((vec<T> *)a);
-      
+    // Complex double 
+    inline u128d operator()(Grid::ComplexD *a){
+      u128d out; 
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      return out;
+    }
+    // Real float 
+    inline u128f operator()(float *a){
+      u128f out; 
+      out.f[0] = a[0];
+      out.f[1] = a[1];
+      out.f[2] = a[2];
+      out.f[3] = a[3];
+      return out;
+    }
+    // Real double
+    inline u128d operator()(double *a){
+      u128d out; 
+      out.f[0] = a[0];
+      out.f[1] = a[1];
      return out;
    }
-
    // Integer
    inline int operator()(Integer *a){
-      return *a;
+      return a[0];
    }
+
+
  };

-  /////////////////////////////////////////////////////
-  // Arithmetic operations
-  /////////////////////////////////////////////////////
-  struct Sum{
-    // Complex/Real
-    template <typename T>
-    inline vec<T> operator()(vec<T> a, vec<T> b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::r, 1)
-      {
-        out.v[i] = a.v[i] + b.v[i];
-      }
-      
-      return out;
-    }
-    
-    //I nteger
-    inline int operator()(int a, int b){
-      return a + b;
-    }
-  };
-
-  struct Sub{
-    // Complex/Real
-    template <typename T>
-    inline vec<T> operator()(vec<T> a, vec<T> b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::r, 1)
-      {
-        out.v[i] = a.v[i] - b.v[i];
-      }
-      
-      return out;
-    }
-    
-    //Integer
-    inline int operator()(int a, int b){
-      return a-b;
-    }
-  };
-
-  struct Mult{
-    // Real
-    template <typename T>
-    inline vec<T> operator()(vec<T> a, vec<T> b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::r, 1)
-      {
-        out.v[i] = a.v[i]*b.v[i];
-      }
-      
-      return out;
-    }
-    
-    // Integer
-    inline int operator()(int a, int b){
-      return a*b;
-    }
-  };
-  
-  #define cmul(a, b, c, i)\
-  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
-  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
-  
-  struct MultComplex{
-    // Complex
-    template <typename T>
-    inline vec<T> operator()(vec<T> a, vec<T> b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::c, 1)
-      {
-        cmul(a.v, b.v, out.v, 2*i);
-      }      
-      
-      return out;
-    }
-  };
-  
-  #undef cmul
-
-  struct Div{
-    // Real
-    template <typename T>
-    inline vec<T> operator()(vec<T> a, vec<T> b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::r, 1)
-      {
-        out.v[i] = a.v[i]/b.v[i];
-      }
-      
-      return out;
-    }
-  };
-  
-  #define conj(a, b, i)\
-  b[i]   = a[i];\
-  b[i+1] = -a[i+1];
-  
-  struct Conj{
-    // Complex
-    template <typename T>
-    inline vec<T> operator()(vec<T> a){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::c, 1)
-      {
-        conj(a.v, out.v, 2*i);
-      }
-      
-      return out;
-    }
-  };
-  
-  #undef conj
-
-  #define timesmi(a, b, i)\
-  b[i]   = a[i+1];\
-  b[i+1] = -a[i];
-  
-  struct TimesMinusI{
-    // Complex
-    template <typename T>
-    inline vec<T> operator()(vec<T> a, vec<T> b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::c, 1)
-      {
-        timesmi(a.v, out.v, 2*i);
-      }
-      
-      return out;
-    }
-  };
-
-  #undef timesmi
-  
-  #define timesi(a, b, i)\
-  b[i]   = -a[i+1];\
-  b[i+1] = a[i];
-  
-  struct TimesI{
-    // Complex
-    template <typename T>
-    inline vec<T> operator()(vec<T> a, vec<T> b){
-      vec<T> out;
-      
-      VECTOR_FOR(i, W<T>::c, 1)
-      {
-        timesi(a.v, out.v, 2*i);
-      }
-      
-      return out;
-    }
-  };
-  
-  #undef timesi
-
-  //////////////////////////////////////////////
-  // Some Template specialization
-  #define perm(a, b, n, w)\
-  unsigned int _mask = w >> (n + 1);\
-  VECTOR_FOR(i, w, 1)\
-  {\
-    b[i] = a[i^_mask];\
-  }
-  
-  #define DECL_PERMUTE_N(n)\
-  template <typename T>\
-  static inline vec<T> Permute##n(vec<T> in) {\
-    vec<T> out;\
-    perm(in.v, out.v, n, W<T>::r);\
-    return out;\
-  }
-  
-  struct Permute{
-    DECL_PERMUTE_N(0);
-    DECL_PERMUTE_N(1);
-    DECL_PERMUTE_N(2);
-    DECL_PERMUTE_N(3);
-  };
-  
-  #undef perm
-  #undef DECL_PERMUTE_N
-  
-  #define rot(a, b, n, w)\
-  VECTOR_FOR(i, w, 1)\
-  {\
-    b[i] = a[(i + n)%w];\
-  }
-  
-  struct Rotate{
-    template <typename T>
-    static inline vec<T> rotate(vec<T> in, int n){
-      vec<T> out;
-      
-      rot(in.v, out.v, n, W<T>::r);
-      
-      return out;
-    }
-  };
-
-  #undef rot
-  
-  #define acc(v, a, off, step, n)\
-  for (unsigned int i = off; i < n; i += step)\
-  {\
-    a += v[i];\
-  }
-  
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
@ -382,67 +164,316 @@ namespace Optimization {
      return 0;
    }
  };
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      out.f[2] = a.f[2] + b.f[2];
+      out.f[3] = a.f[3] + b.f[3];
+      return out;
+    }
+    //Complex/Real double
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      return out;
+    }
+    //Integer
+    inline int operator()(int a, int b){
+      return a + b;
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      out.f[2] = a.f[2] - b.f[2];
+      out.f[3] = a.f[3] - b.f[3];
+      return out;
+    }
+    //Complex/Real double
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      return out;
+    }
+    //Integer
+    inline int operator()(int a, int b){
+      return a-b;
+    }
+  };
+
+  struct MultComplex{
+    // Complex float
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
+      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
+      return out;
+    }
+    // Complex double
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      return out;
+    }
+  };
+
+  struct Mult{
+    //CK: Appear unneeded
+    // inline float  mac(float a, float b,double c){
+    //   return 0;
+    // }
+    // inline double mac(double a, double b,double c){
+    //   return 0;
+    // }
+
+    // Real float
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      out.f[2] = a.f[2]*b.f[2];
+      out.f[3] = a.f[3]*b.f[3];
+      return out;
+    }
+    // Real double
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      return out;
+    }
+    // Integer
+    inline int operator()(int a, int b){
+      return a*b;
+    }
+  };
+
+  struct Conj{
+    // Complex single
+    inline u128f operator()(u128f in){
+      u128f out;
+      out.f[0] = in.f[0];
+      out.f[1] = -in.f[1];
+      out.f[2] = in.f[2];
+      out.f[3] = -in.f[3];
+      return out;
+    }
+    // Complex double
+    inline u128d operator()(u128d in){
+      u128d out;
+      out.f[0] = in.f[0];
+      out.f[1] = -in.f[1];
+      return out;
+    }
+    // do not define for integer input
+  };
+
+  struct TimesMinusI{
+    //Complex single
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] = in.f[1];
+      out.f[1] = -in.f[0];
+      out.f[2] = in.f[3];
+      out.f[3] = -in.f[2];
+      return out;
+    }
+    //Complex double
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] = in.f[1];
+      out.f[1] = -in.f[0];
+      return out;
+    }
+  };
+
+  struct TimesI{
+    //Complex single
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] = -in.f[1];
+      out.f[1] = in.f[0];
+      out.f[2] = -in.f[3];
+      out.f[3] = in.f[2];
+      return out;
+    }
+    //Complex double
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] = -in.f[1];
+      out.f[1] = in.f[0];
+      return out;
+    }
+  };
+
+  //////////////////////////////////////////////
+  // Some Template specialization
+  struct Permute{
+    //We just have to mirror the permutes of Grid_sse4.h
+    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
+      u128f out;
+      out.f[0] = in.f[2];
+      out.f[1] = in.f[3];
+      out.f[2] = in.f[0];
+      out.f[3] = in.f[1];
+      return out;
+    };
+    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
+      u128f out;
+      out.f[0] = in.f[1];
+      out.f[1] = in.f[0];
+      out.f[2] = in.f[3];
+      out.f[3] = in.f[2];
+      return out;
+    };
+    static inline u128f Permute2(u128f in){
+      return in;
+    };
+    static inline u128f Permute3(u128f in){
+      return in;
+    };
+
+    static inline u128d Permute0(u128d in){ //AB -> BA
+      u128d out;
+      out.f[0] = in.f[1];
+      out.f[1] = in.f[0];
+      return out;      
+    };
+    static inline u128d Permute1(u128d in){
+      return in;
+    };
+    static inline u128d Permute2(u128d in){
+      return in;
+    };
+    static inline u128d Permute3(u128d in){
+      return in;
+    };
+
+  };
  
+  template < typename vtype > 
+    void permute(vtype &a, vtype b, int perm) {
+   };
+    
+  struct Rotate{
+
+    static inline u128f rotate(u128f in,int n){
+      u128f out;
+      switch(n){
+      case 0:
+        out.f[0] = in.f[0];
+        out.f[1] = in.f[1];
+        out.f[2] = in.f[2];
+        out.f[3] = in.f[3];
+        break;
+      case 1:
+        out.f[0] = in.f[1];
+        out.f[1] = in.f[2];
+        out.f[2] = in.f[3];
+        out.f[3] = in.f[0];
+        break;
+      case 2:
+        out.f[0] = in.f[2];
+        out.f[1] = in.f[3];
+        out.f[2] = in.f[0];
+        out.f[3] = in.f[1];
+        break;
+      case 3:
+        out.f[0] = in.f[3];
+        out.f[1] = in.f[0];
+        out.f[2] = in.f[1];
+        out.f[3] = in.f[2];
+        break;
+      default: assert(0);
+      }
+      return out;
+    }
+    static inline u128d rotate(u128d in,int n){
+      u128d out;
+      switch(n){
+      case 0:
+        out.f[0] = in.f[0];
+        out.f[1] = in.f[1];
+        break;
+      case 1:
+        out.f[0] = in.f[1];
+        out.f[1] = in.f[0];
+        break;
+      default: assert(0);
+      }
+      return out;
+    }
+  };
+
  //Complex float Reduce
-  template <>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, vecf>::operator()(vecf in){
-    float a = 0.f, b = 0.f;
-    
-    acc(in.v, a, 0, 2, W<float>::r);
-    acc(in.v, b, 1, 2, W<float>::r);
-    
-    return Grid::ComplexF(a, b);
+  template<>
+  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
+    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
  }
-  
  //Real float Reduce
  template<>
-  inline Grid::RealF Reduce<Grid::RealF, vecf>::operator()(vecf in){
-    float a = 0.;
-    
-    acc(in.v, a, 0, 1, W<float>::r);
-    
-    return a;
+  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
+    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
  }
  
+  
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, vecd>::operator()(vecd in){
-    double a = 0., b = 0.;
-    
-    acc(in.v, a, 0, 2, W<double>::r);
-    acc(in.v, b, 1, 2, W<double>::r);
-    
-    return Grid::ComplexD(a, b);
+  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
+    return Grid::ComplexD(in.f[0],in.f[1]);
  }
  
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, vecd>::operator()(vecd in){
-    double a = 0.f;
-    
-    acc(in.v, a, 0, 1, W<double>::r);
-    
-    return a;
+  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
+    return in.f[0] + in.f[1];
  }

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, int>::operator()(int in){
-    return in;
+    // FIXME unimplemented
+   printf("Reduce : Missing integer implementation -> FIX\n");
+    assert(0);
  }
 }

 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 

-  typedef Optimization::vecf SIMD_Ftype; // Single precision type
-  typedef Optimization::vecd SIMD_Dtype; // Double precision type
+  typedef Optimization::u128f SIMD_Ftype;  // Single precision type
+  typedef Optimization::u128d SIMD_Dtype; // Double precision type
  typedef int SIMD_Itype; // Integer type

  // prefetch utilities
  inline void v_prefetch0(int size, const char *ptr){};
  inline void prefetch_HINT_T0(const char *ptr){};

+
+
+  // Gpermute function
+  template < typename VectorSIMD > 
+    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
+    Optimization::permute(y.v,b.v,perm);
+  }
+
+
  // Function name aliases
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
@ -450,13 +481,16 @@ namespace Optimization {
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

+ 
+
+
  // Arithmetic operations
  typedef Optimization::Sum         SumSIMD;
  typedef Optimization::Sub         SubSIMD;
-  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
+
 }
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@ -26,6 +26,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
+//----------------------------------------------------------------------
+/*! @file Grid_knc.h
+  @brief Optimization libraries for AVX512 instructions set for KNC
+
+  Using intrinsics
+*/
+// Time-stamp: <2015-06-09 14:27:28 neo>
+//----------------------------------------------------------------------

 #include <immintrin.h>
 #include <zmmintrin.h>
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@ -244,22 +244,7 @@ namespace Optimization {
      return a*b;
    }
  };
-
-  struct Div{
-    // Real double
-    inline vector4double operator()(vector4double a, vector4double b){
-      return vec_swdiv(a, b);
-    }
-
-    // Real float
-    FLOAT_WRAP_2(operator(), inline)
-
-    // Integer
-    inline int operator()(int a, int b){
-      return a/b;
-    }
-  };
-
+  
  struct Conj{
    // Complex double
    inline vector4double operator()(vector4double v){
@ -428,7 +413,6 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
-typedef Optimization::Div         DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@ -38,13 +38,13 @@ directory
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES

-#ifdef GEN
+#ifdef GENERIC_VEC
 #include "Grid_generic.h"
 #endif
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
+#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
@ -60,13 +60,6 @@ directory
 #include "Grid_neon.h"
 #endif

-#ifdef GRID_DEFAULT_PRECISION_SINGLE
-#define GRID_REAL_DIGITS FLT_DIG
-#endif
-#ifdef GRID_DEFAULT_PRECISION_DOUBLE
-#define GRID_REAL_DIGITS DBL_DIG
-#endif
-
 namespace Grid {

 //////////////////////////////////////
@ -137,7 +130,7 @@ class Grid_simd {

  Vector_type v;

-  static inline constexpr int Nsimd(void) {
+  static inline int Nsimd(void) {
    return sizeof(Vector_type) / sizeof(Scalar_type);
  }

@ -153,23 +146,6 @@ class Grid_simd {
  Grid_simd(const Grid_simd &rhs) : v(rhs.v){};  // compiles in movaps
  Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};

-  /////////////////////////////
-  // Comparisons
-  /////////////////////////////
-  inline bool operator==(const Grid_simd& lhs){ 
-    Grid_simd::conv_t conv1;
-    Grid_simd::conv_t conv2;
-    conv1.v = v;
-    conv2.v = lhs.v;
-    return std::equal(std::begin(conv1.s), std::end(conv1.s), std::begin(conv2.s));
-  }
-
-  inline bool operator!=(const Grid_simd& lhs){ 
-    return !(*this == lhs);
-  }
-
-
-
  /////////////////////////////
  // Constructors
  /////////////////////////////
--- a/m4/ac_prog_doxygen.m4
+++ b/m4/ac_prog_doxygen.m4
@ -0,0 +1,54 @@
+dnl Check for doxygen to create API docs
+dnl
+AC_DEFUN([AC_PROG_DOXYGEN],
+[
+AC_ARG_ENABLE(doxygen,
+AS_HELP_STRING([--enable-doxygen],[enable documentation generation with doxygen (auto)]))
+
+AC_ARG_ENABLE(dot,
+AS_HELP_STRING([--enable-dot],[use 'dot' to generate graphs in doxygen (auto)]))              
+AC_ARG_ENABLE(html-docs,
+AS_HELP_STRING([--enable-html-docs],[enable HTML generation with doxygen (yes)]),
+[],[ enable_html_docs=yes])              
+AC_ARG_ENABLE(latex-docs,
+AS_HELP_STRING([--enable-latex-docs],
+[enable LaTeX documentation generation with doxygen (no)]),[],[enable_latex_docs=no])              
+
+if test "x$enable_doxygen" = xno; then
+ enable_doc=no	
+else 
+ AC_CHECK_PROG(DOXYGEN, doxygen, doxygen)
+ if test x$DOXYGEN = x; then
+  if test "x$enable_doxygen" = xyes; then
+   AC_MSG_ERROR([could not find doxygen])
+  fi
+  enable_doc=no
+ else
+  doxy_ver=`doxygen --version`
+  doxy_major=`expr "$doxy_ver" : '\(@<:@0-9@:>@\)\..*'`
+  doxy_minor=`expr "$doxy_ver" : '@<:@0-9@:>@\.\(@<:@0-9@:>@\).*'`
+  if test $doxy_major -eq "1" -a $doxy_minor -ge "3" ; then
+   enable_doc=yes
+   AC_CHECK_PROG(DOT, dot, dot)
+  else
+   AC_MSG_WARN([doxygen version $doxy_ver too old, doxygen will not be used.])
+   enable_doc=no
+  fi
+ fi
+fi
+
+AM_CONDITIONAL(DOXYGEN_DOC, test x$enable_doc = xyes)
+
+if test x$DOT = x; then
+ if test "x$enable_dot" = xyes; then
+  AC_MSG_ERROR([could not find dot])
+ fi
+ enable_dot=no
+else
+ enable_dot=yes
+fi
+
+AC_SUBST(enable_dot)
+AC_SUBST(enable_html_docs)
+AC_SUBST(enable_latex_docs)
+])
--- a/m4/m4-ax_prog_doxygen.m4
+++ b/m4/m4-ax_prog_doxygen.m4
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@ -50,12 +50,6 @@ public:
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
  std::string name(void) const { return std::string("Times"); }
 };
-class funcDivide {
-public:
-  funcDivide() {};
-  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;}
-  std::string name(void) const { return std::string("Divide"); }
-};
 class funcConj {
 public:
  funcConj() {};
@ -347,7 +341,6 @@ int main (int argc, char ** argv)
  Tester<RealF,vRealF>(funcPlus());
  Tester<RealF,vRealF>(funcMinus());
  Tester<RealF,vRealF>(funcTimes());
-  Tester<RealF,vRealF>(funcDivide());
  Tester<RealF,vRealF>(funcAdj());
  Tester<RealF,vRealF>(funcConj());
  Tester<RealF,vRealF>(funcInnerProduct());
@ -378,7 +371,6 @@ int main (int argc, char ** argv)
  Tester<RealD,vRealD>(funcPlus());
  Tester<RealD,vRealD>(funcMinus());
  Tester<RealD,vRealD>(funcTimes());
-  Tester<RealD,vRealD>(funcDivide());
  Tester<RealD,vRealD>(funcAdj());
  Tester<RealD,vRealD>(funcConj());
  Tester<RealD,vRealD>(funcInnerProduct());
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@ -68,7 +68,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<4;mu++){
    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
    LatticeCoordinate(coor,mu);
-    C = C + (TwoPiL * p[mu]) * coor;
+    C = C - (TwoPiL * p[mu]) * coor;
  }

  C = exp(C*ci);
@ -78,11 +78,10 @@ int main (int argc, char ** argv)

  FFT theFFT(&Fine);

-  Ctilde = C;
-  theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;

  //  C=zero;
  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@ -94,11 +93,10 @@ int main (int argc, char ** argv)
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;

-  Stilde = S;
-  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;

  SpinMatrixF Sp; 
  Sp = zero; Sp = Sp+cVol;
--- a/tests/debug/Test_cayley_cg_reproducibility.cc
+++ b/tests/debug/Test_cayley_cg_reproducibility.cc
@ -1,215 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./tests/Test_cayley_cg_reproducibility.cc
-
-    Copyright (C) 2015
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: Guido Cossu <guido.cossu@ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-#define REPRODUCIBILITY_INTERVAL 1
-
-
-template<class d>
-struct scal {
-  d internal;
-};
-
-  Gamma::GammaMatrix Gmu [] = {
-    Gamma::GammaX,
-    Gamma::GammaY,
-    Gamma::GammaZ,
-    Gamma::GammaT
-  };
-
-template<class What> 
-void  TestCGinversions(What & Ddwf, 
-                       GridCartesian         * FGrid,          GridRedBlackCartesian * FrbGrid,
-                       GridCartesian         * UGrid,          GridRedBlackCartesian * UrbGrid,
-                       RealD mass, RealD M5,
-                       GridParallelRNG *RNG4,
-                       GridParallelRNG *RNG5);
-template<class What> 
-void  TestCGschur(What & Ddwf, 
-                  GridCartesian         * FGrid,               GridRedBlackCartesian * FrbGrid,
-                  GridCartesian         * UGrid,               GridRedBlackCartesian * UrbGrid,
-                  RealD mass, RealD M5,
-                  GridParallelRNG *RNG4,
-                  GridParallelRNG *RNG5);
-
-template<class What> 
-void  TestCGunprec(What & Ddwf, 
-                   GridCartesian         * FGrid,              GridRedBlackCartesian * FrbGrid,
-                   GridCartesian         * UGrid,              GridRedBlackCartesian * UrbGrid,
-                   RealD mass, RealD M5,
-                   GridParallelRNG *RNG4,
-                   GridParallelRNG *RNG5);
-
-template<class What> 
-void  TestCGprec(What & Ddwf, 
-                 GridCartesian         * FGrid,        GridRedBlackCartesian * FrbGrid,
-                 GridCartesian         * UGrid,        GridRedBlackCartesian * UrbGrid,
-                 RealD mass, RealD M5,
-                 GridParallelRNG *RNG4,
-                 GridParallelRNG *RNG5);
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
-  std::cout << GridLogMessage << "###########################################" << std::endl;
-  std::cout << GridLogMessage << "This test disables the convergence alert" << std::endl;
-  std::cout << GridLogMessage << "and checks whether in consecutive runs of the CG" << std::endl;
-  std::cout << GridLogMessage << "the internal reductions are bit reproducible." << std::endl;
-  std::cout << GridLogMessage << "A warning and a report is produced for every failure" << std::endl;
-  std::cout << GridLogMessage << "###########################################" << std::endl;
-
-
-  const int Ls=8;
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-
-  LatticeGaugeField Umu(UGrid);
-  SU3::HotConfiguration(RNG4,Umu);
-  std::vector<LatticeColourMatrix> U(4,UGrid);
-
-  RealD mass=0.1;
-  RealD M5  =1.8;
-  std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
-  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  TestCGinversions<DomainWallFermionR>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
-
-  RealD b=1.5;// Scale factor b+c=2, b-c=1
-  RealD c=0.5;
-  std::cout<<GridLogMessage <<"MobiusFermion test"<<std::endl;
-  MobiusFermionR Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
-  TestCGinversions<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
-
-  std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
-  MobiusZolotarevFermionR Dzolo(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,0.1,2.0);
-  TestCGinversions<MobiusZolotarevFermionR>(Dzolo,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
-
-  std::cout<<GridLogMessage <<"ScaledShamirFermion test"<<std::endl;
-  ScaledShamirFermionR Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0);
-  TestCGinversions<ScaledShamirFermionR>(Dsham,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
-
-  std::cout<<GridLogMessage <<"ShamirZolotarevFermion test"<<std::endl;
-  ShamirZolotarevFermionR Dshamz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
-  TestCGinversions<ShamirZolotarevFermionR>(Dshamz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
-
-  std::cout<<GridLogMessage <<"OverlapWilsonCayleyTanhFermion test"<<std::endl;
-  OverlapWilsonCayleyTanhFermionR Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
-  TestCGinversions<OverlapWilsonCayleyTanhFermionR>(Dov,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
-
-  std::cout<<GridLogMessage <<"OverlapWilsonCayleyZolotarevFermion test"<<std::endl;
-  OverlapWilsonCayleyZolotarevFermionR Dovz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
-  TestCGinversions<OverlapWilsonCayleyZolotarevFermionR>(Dovz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
-
-  Grid_finalize();
-}
-template<class What> 
-void  TestCGinversions(What & Ddwf, 
-                       GridCartesian         * FGrid,          GridRedBlackCartesian * FrbGrid,
-                       GridCartesian         * UGrid,          GridRedBlackCartesian * UrbGrid,
-                       RealD mass, RealD M5,
-                       GridParallelRNG *RNG4,
-                       GridParallelRNG *RNG5)
-{
-  std::cout<<GridLogMessage << "Testing unpreconditioned inverter"<<std::endl;
-  TestCGunprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
-  std::cout<<GridLogMessage << "Testing red black preconditioned inverter"<<std::endl;
-  TestCGprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
-  std::cout<<GridLogMessage << "Testing red black Schur inverter"<<std::endl;
-  TestCGschur<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
-}
-
-template<class What> 
-void  TestCGunprec(What & Ddwf, 
-                   GridCartesian         * FGrid,              GridRedBlackCartesian * FrbGrid,
-                   GridCartesian         * UGrid,              GridRedBlackCartesian * UrbGrid,
-                   RealD mass, RealD M5,
-                   GridParallelRNG *RNG4,
-                   GridParallelRNG *RNG5)
-{
-  LatticeFermion src   (FGrid); random(*RNG5,src);
-  LatticeFermion result(FGrid); result=zero;
-
-  MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
-  CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
-  CG(HermOp,src,result);
-
-}
-template<class What> 
-void  TestCGprec(What & Ddwf, 
-                 GridCartesian         * FGrid,        GridRedBlackCartesian * FrbGrid,
-                 GridCartesian         * UGrid,        GridRedBlackCartesian * UrbGrid,
-                 RealD mass, RealD M5,
-                 GridParallelRNG *RNG4,
-                 GridParallelRNG *RNG5)
-{
-  LatticeFermion src   (FGrid); random(*RNG5,src);
-  LatticeFermion    src_o(FrbGrid);
-  LatticeFermion result_o(FrbGrid);
-  pickCheckerboard(Odd,src_o,src);
-  result_o=zero;
-
-  SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
-  CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
-  CG(HermOpEO,src_o,result_o);
-}
-
-
-template<class What> 
-void  TestCGschur(What & Ddwf, 
-                   GridCartesian         * FGrid,              GridRedBlackCartesian * FrbGrid,
-                   GridCartesian         * UGrid,              GridRedBlackCartesian * UrbGrid,
-                   RealD mass, RealD M5,
-                   GridParallelRNG *RNG4,
-                   GridParallelRNG *RNG5)
-{
-  LatticeFermion src   (FGrid); random(*RNG5,src);
-  LatticeFermion result(FGrid); result=zero;
-
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
-  CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
-  SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
-  SchurSolver(Ddwf,src,result);
-}
--- a/tests/hmc/Test_hmc_WilsonFermionGauge.cc
+++ b/tests/hmc/Test_hmc_WilsonFermionGauge.cc
@ -63,9 +63,6 @@ class HmcRunner : public NerscHmcRunner {
    Real mass = -0.77;
    FermionAction FermOp(U, *FGrid, *FrbGrid, mass);

-    // To enable the CG reproducibility tests use 
-    //ConjugateGradient<FermionField> CG(1.0e-8, 10000, ReproducibilityTest);
-    // This is the plain version
    ConjugateGradient<FermionField> CG(1.0e-8, 10000);

    TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);
--- a/tests/solver/Test_wilson_cg_unprec.cc
+++ b/tests/solver/Test_wilson_cg_unprec.cc
@ -71,7 +71,7 @@ int main (int argc, char ** argv)
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);

  MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000,true, true);
+  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
  CG(HermOp,src,result);

  Grid_finalize();
Author	SHA1	Message	Date
paboyle	c363bdd784	Merge branch 'release/v0.6.0'	2016-11-09 12:43:14 +00:00
paboyle	446c768cd3	Merge branch 'hotfix/v0.5.1' Double precision compile fix	2016-07-01 16:33:59 +01:00