Merge branch 'develop' into feature/hadrons

2025-07-05 07:57:06 +01:00 · 2016-09-20 13:49:33 +01:00
parent 7ff7c7d90d d2573189d8
commit a034e9901b
42 changed files with 1594 additions and 558 deletions
--- a/.gitignore
+++ b/.gitignore
@ -94,6 +94,10 @@ build.sh
 ################
 lib/Eigen/*
 # FFTW source #
 ################
 lib/fftw/*
 # libtool macros #
 ##################
 m4/lt*
--- a/.travis.yml
+++ b/.travis.yml
@ -23,6 +23,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
@ -35,6 +37,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
@ -47,6 +51,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
@ -59,6 +65,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
@ -69,6 +77,7 @@ before_install:
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
 install:
@ -92,3 +101,9 @@ script:
    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1
    - echo make clean
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
    - make -j4
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
--- a/README.md
+++ b/README.md
@ -68,10 +68,18 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
 ``` bash
 mkdir build; cd build
-../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi --prefix=<path>
+../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```
-where `--enable-precision=` set the default precision (`single` or `double`), `--enable-simd=` set the SIMD type (see possible values below), `--enable-comms=` set the protocol used for communications (`none`, `mpi` or `shmem`), and `<path>` should be replaced by the prefix path where you want to install Grid. Other options are available, use `configure --help` to display them. Like with any other program using GNU autotool, the `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to customise the build.
+where `--enable-precision=` sets the default precision (`single` or `double`),
 `--enable-simd=` sets the SIMD type (see possible values below),
 `--enable-comms=` sets the protocol used for communications (`none`, `mpi`,
 `mpi-auto` or `shmem`), and `<path>` should be replaced by the prefix path
 where you want to install Grid. The `mpi-auto` communication option lets
 `configure` determine automatically how to link to MPI. Other options are
 available, use `configure --help` to display them. Like with any other program
 using GNU autotools, the `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment
 variables can be modified to customise the build.
 Finally, you can build and install Grid:
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@ -194,7 +194,7 @@ int main (int argc, char ** argv)
    }
  }  
-
+#if 0
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
@ -315,7 +315,7 @@ int main (int argc, char ** argv)
    }
  }
-
+#endif
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@ -61,6 +61,8 @@ int main (int argc, char ** argv)
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@ -0,0 +1,117 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./benchmarks/Benchmark_wilson_sweep.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Richard Rollins <rprollins@users.noreply.github.com>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 // Minimal wrapper template: holds exactly one value of the wrapped type.
 template<typename Wrapped>
 struct scal {
  Wrapped internal;  // the single stored value
 };
 // Table of the four gamma-matrix identifiers, in the order X, Y, Z, T.
 // NOTE(review): neither main() nor bench_wilson() in this file reads Gmu;
 // confirm whether it is still needed before removing it.
 Gamma::GammaMatrix Gmu [] = {
  Gamma::GammaX,
  Gamma::GammaY,
  Gamma::GammaZ,
  Gamma::GammaT
 };
 // Set to true by main() when "--asynch" is on the command line; copied into
 // the fermion ImplParams (overlapCommsCompute) before the operator is built.
 bool overlapComms = false;
 // Forward declaration; the definition follows main().
 // Times repeated Dw.Dhop(src,result,dag) calls and prints the resulting rate.
 void bench_wilson (
 		   LatticeFermion &    src,
 		   LatticeFermion & result,
 		   WilsonFermionR &     Dw,
 		   double const     volume,
 		   int const           dag );
 // Benchmark driver: sweeps a series of 4d lattice volumes and, for each one,
 // prints the rate reported by bench_wilson() for the plain and the daggered
 // Wilson hopping term.
 //
 // Command line : --asynch  sets overlapCommsCompute in the fermion parameters.
 // Environment  : LMAX      largest starting extent L of the sweep (default 32).
 //                DMIN      exclusive lower bound of the direction loop
 //                          (default 0); values below -1 are clamped to -1.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }

  typename WilsonFermionR::ImplParams params;
  params.overlapCommsCompute = overlapComms;

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> seeds({1,2,3,4});
  RealD mass = 0.1;

  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;

  int Lmax = 32;
  int dmin = 0;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));

  // The direction loop writes latt_size[d] for every d with dmin < d <= 3, so
  // a DMIN below -1 would index before the start of the vector. Clamp it.
  if ( dmin < -1 ) dmin = -1;

  for (int L=8; L<=Lmax; L*=2)
    {
      std::vector<int> latt_size = std::vector<int>(4,L);
      for(int d=4; d>dmin; d--)
        {
          // The d==4 pass benchmarks the L^4 volume unchanged; every later
          // pass doubles one more direction (3, then 2, ...) before timing.
          if ( d<=3 ) { latt_size[d] *= 2; }

          // Print the volume as "AxBxCxD".
          std::cout << GridLogMessage;
          std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator<int>( std::cout, "x" ) );
          std::cout << latt_size.back() << "\t\t";

          GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
          GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);

          GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
          LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
          LatticeFermion    src(&Grid); random(pRNG,src);
          LatticeFermion result(&Grid); result=zero;

          // Accumulate in double: the product of the four extents overflows a
          // 32-bit int once LMAX is large enough (e.g. 256*256*256*128).
          double volume = std::accumulate(latt_size.begin(),latt_size.end(),1.0,std::multiplies<double>());

          WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);

          bench_wilson(src,result,Dw,volume,DaggerNo);
          bench_wilson(src,result,Dw,volume,DaggerYes);
          std::cout << std::endl;
        }
    }

  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  Grid_finalize();
 }
 // Applies Dw.Dhop(src,result,dag) a fixed number of times, timing the whole
 // batch with usecond(), and prints (1344 * volume * calls) / elapsed followed
 // by two tabs. No newline is written; the caller ends the row.
 //   src, result : input and output fermion fields handed to Dhop
 //   Dw          : the Wilson operator under test
 //   volume      : number of lattice sites, used to scale the flop estimate
 //   dag         : forwarded unchanged to Dhop (DaggerNo / DaggerYes in main)
 void bench_wilson (
 		   LatticeFermion &    src,
 		   LatticeFermion & result,
 		   WilsonFermionR &     Dw,
 		   double const     volume,
 		   int const           dag )
 {
  const int repeats = 1000;

  const double started = usecond();
  int call = 0;
  while ( call < repeats ) {
    Dw.Dhop(src,result,dag);
    ++call;
  }
  const double finished = usecond();

  // 1344 is the per-site flop count this benchmark assumes for one Dhop.
  const double total_flops = 1344 * volume * repeats;
  std::cout << total_flops/(finished-started) << "\t\t";
 }
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@ -40,14 +40,20 @@ int main(int argc,char **argv)
  std::ofstream os("zmm.dat");
  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  for(int L=4;L<=32;L+=4){
    for(int m=1;m<=2;m++){
      for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
  std::cout << GridLogMessage <<"\t";
 	for(int i=0;i<4;i++) { 
 	  std::cout << grid[i]<<"x";
 	}
-	std::cout << Ls<<std::endl;
+	std::cout << Ls<<"\t\t";
 	bench(os,grid,Ls);
      }
    }
@ -104,7 +110,6 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  RealD M5  =1.8;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=50;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
@ -116,7 +121,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  double flops=1344*volume/2;
  mfc = flops*ncall/(t1-t0);
-  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+  std::cout<<mfc<<"\t\t";
  QCD::WilsonKernelsStatic::AsmOpt=1;
  t0=usecond();
@ -125,7 +130,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  }
  t1=usecond();
  mfa = flops*ncall/(t1-t0);
-  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+  std::cout<<mfa<<"\t\t";
  /*
  int dag=DaggerNo;
  t0=usecond();
@ -163,8 +168,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  //resulta = (-0.5) * resulta;
  diff = resulto-resulta;
-  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
+  std::cout<<norm2(diff)<<std::endl;
  std::cout<<std::endl;
  return 0;
 }
--- a/bootstrap.sh
+++ b/bootstrap.sh
@ -1,11 +1,18 @@
 #!/usr/bin/env bash
 EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
 FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 echo "-- deploying Eigen source..."
-wget ${EIGEN_URL}
+wget ${EIGEN_URL} --no-check-certificate
 ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
 rm `basename ${EIGEN_URL}`
 echo "-- copying fftw prototypes..."
 wget ${FFTW_URL}
 ./scripts/update_fftw.sh `basename ${FFTW_URL}`
 rm `basename ${FFTW_URL}`
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
--- a/configure.ac
+++ b/configure.ac
@ -8,11 +8,20 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 ############### Checks for programs
 AC_LANG(C++)
-: ${CXXFLAGS="-O3"}
+CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
 AC_PROG_RANLIB
 ############ openmp  ###############
 AC_OPENMP
 ac_openmp=no
 if test "${OPENMP_CXXFLAGS}X" != "X"; then
 ac_openmp=yes
 AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
-LT_INIT([disable-shared])
+AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
 fi
 ############### Checks for header files
 AC_CHECK_HEADERS(stdint.h)
@ -29,7 +38,7 @@ AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T
-############### Options
+############### GMP and MPFR #################
 AC_ARG_WITH([gmp],
    [AS_HELP_STRING([--with-gmp=prefix],
    [try this for a non-standard install prefix of the GMP library])],
@ -40,9 +49,12 @@ AC_ARG_WITH([mpfr],
    [try this for a non-standard install prefix of the MPFR library])],
    [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
 ################## lapack ####################
 AC_ARG_ENABLE([lapack],
    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
    [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
 case ${ac_LAPACK} in
    no)
        ;;
@ -54,6 +66,13 @@ case ${ac_LAPACK} in
        AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 esac
 ################## FFTW3 ####################
 AC_ARG_WITH([fftw],    
            [AS_HELP_STRING([--with-fftw=prefix],
            [try this for a non-standard install prefix of the FFTW3 library])],
            [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
            [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
 ################ Get compiler information
 AC_LANG([C++])
 AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
@ -67,7 +86,6 @@ AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
 ############### Checks for library functions
 CXXFLAGS_CPY=$CXXFLAGS
 LDFLAGS_CPY=$LDFLAGS
 LIBS_CPY=$LIBS
 CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
 LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 AC_CHECK_FUNCS([gettimeofday])
@ -77,7 +95,7 @@ AC_CHECK_LIB([gmp],[__gmpf_init],
                 [have_mpfr=true]
                 [LIBS="$LIBS -lmpfr"],
                 [AC_MSG_ERROR([MPFR library not found])])]
-             [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
+   	     [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
             [have_gmp=true]
             [LIBS="$LIBS -lgmp"],
             [AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])])
@ -86,6 +104,11 @@ if test "${ac_LAPACK}x" != "nox"; then
    AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[],
                 [AC_MSG_ERROR("LAPACK enabled but library not found")])
 fi
 AC_CHECK_LIB([fftw3],[fftw_execute],
  [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])]
  [have_fftw=true]
  [LIBS="$LIBS -lfftw3 -lfftw3f"],
  [AC_MSG_WARN([**** FFTW library not found, Grid can still compile but FFT-based routines will not work ****])])
 CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY
@ -108,16 +131,19 @@ case ${ax_cv_cxx_compiler_vendor} in
        SIMD_FLAGS='-mavx -mfma4';;
      AVX2)
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
-        SIMD_FLAGS='-mavx2';;
+        SIMD_FLAGS='-mavx2 -mfma';;
      AVX512|AVX512MIC|KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
      IMCI|KNC)
-        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
        SIMD_FLAGS='';;
      GEN)
        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
        SIMD_FLAGS='';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
        SIMD_FLAGS='';;
      *)
        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
    esac;;
@ -294,15 +320,17 @@ Summary of configuration for $PACKAGE v$VERSION
 - compiler version              : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
 - SIMD                          : ${ac_SIMD}
- communications type           : ${ac_COMMS}
+- Threading                     : ${ac_openmp} 
- default precision             : ${ac_PRECISION}
+- Communications type           : ${ac_COMMS}
 - Default precision             : ${ac_PRECISION}
 - RNG choice                    : ${ac_RNG} 
 - GMP                           : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 - LAPACK                        : ${ac_LAPACK}
 - FFTW                          : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
 - build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 - graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
- CXXFLAGS: 
+- CXXFLAGS:
 `echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
 - LDFLAGS:
 `echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
--- a/lib/FFT.h
+++ b/lib/FFT.h
@ -0,0 +1,276 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/FFT.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 #ifdef HAVE_FFTW	
 #include <fftw3.h>
 #endif
 namespace Grid {
  // Primary template: intentionally empty. Only the scalar types that have an
  // explicit specialisation (provided when HAVE_FFTW is defined) expose the
  // FFTW typedefs and wrapper functions.
  template<class scalar> struct FFTW { };
 #ifdef HAVE_FFTW	
  // ComplexD specialisation: maps the precision-neutral wrapper names onto the
  // fftw_* entry points and types.
  template<> struct FFTW<ComplexD> {
  public:

    using FFTW_scalar = fftw_complex;
    using FFTW_plan   = fftw_plan;

    // Forwards every argument unchanged to ::fftw_plan_many_dft and returns its plan.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
                                        FFTW_scalar *in, const int *inembed,
                                        int istride, int idist,
                                        FFTW_scalar *out, const int *onembed,
                                        int ostride, int odist,
                                        int sign, unsigned flags) {
      FFTW_plan plan = ::fftw_plan_many_dft(rank,n,howmany,
                                            in,inembed,istride,idist,
                                            out,onembed,ostride,odist,
                                            sign,flags);
      return plan;
    }

    // Forwards to ::fftw_flops, which fills add/mul/fmas for plan p.
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
      ::fftw_flops(p,add,mul,fmas);
    }

    // Forwards to ::fftw_execute_dft: runs plan p on the given in/out arrays.
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
      ::fftw_execute_dft(p,in,out);
    }

    // Forwards to ::fftw_destroy_plan.
    inline static void fftw_destroy_plan(const FFTW_plan p) {
      ::fftw_destroy_plan(p);
    }
  };
  // ComplexF specialisation: maps the precision-neutral wrapper names onto the
  // fftwf_* entry points and types.
  template<> struct FFTW<ComplexF> {
  public:

    using FFTW_scalar = fftwf_complex;
    using FFTW_plan   = fftwf_plan;

    // Forwards every argument unchanged to ::fftwf_plan_many_dft and returns its plan.
    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
                                        FFTW_scalar *in, const int *inembed,
                                        int istride, int idist,
                                        FFTW_scalar *out, const int *onembed,
                                        int ostride, int odist,
                                        int sign, unsigned flags) {
      FFTW_plan plan = ::fftwf_plan_many_dft(rank,n,howmany,
                                             in,inembed,istride,idist,
                                             out,onembed,ostride,odist,
                                             sign,flags);
      return plan;
    }

    // Forwards to ::fftwf_flops, which fills add/mul/fmas for plan p.
    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
      ::fftwf_flops(p,add,mul,fmas);
    }

    // Forwards to ::fftwf_execute_dft: runs plan p on the given in/out arrays.
    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
      ::fftwf_execute_dft(p,in,out);
    }

    // Forwards to ::fftwf_destroy_plan.
    inline static void fftw_destroy_plan(const FFTW_plan p) {
      ::fftwf_destroy_plan(p);
    }
  };
 #endif
 #ifndef FFTW_FORWARD
 #define FFTW_FORWARD (-1)
 #define FFTW_BACKWARD (+1)
 #endif
  // Fourier transform of a Lattice along one dimension per call.
  //
  // FFT_dim() gathers, on every node, the full extent of the chosen dimension
  // into a node-local "pencil" lattice, runs FFTW 1d transforms along it, and
  // copies the slice belonging to this node into the result. It requires
  // HAVE_FFTW; without it FFT_dim() asserts.
  //
  // NOTE(review): the class owns `sgrid` through a raw pointer and declares no
  // copy operations, so copying an FFT object would delete `sgrid` twice.
  class FFT { 
  private:

    GridCartesian *vgrid;   // grid handed to the constructor; not owned
    GridCartesian *sgrid;   // same dimensions/processors with an all-ones layout; owned

    int Nd;                 // number of dimensions of vgrid
    double flops;           // flop estimate accumulated over all FFT_dim() calls
    double flops_call;      // flop estimate of one plan execution (from fftw_flops)
    uint64_t usec;          // accumulated Timer.useconds() of the transform loops

    std::vector<int> dimensions;      // vgrid->_fdimensions
    std::vector<int> processors;      // vgrid->_processors
    std::vector<int> processor_coor;  // vgrid->_processor_coor

  public:

    static const int forward=FFTW_FORWARD;
    static const int backward=FFTW_BACKWARD;

    // Accumulated flop estimate, and that estimate divided by the accumulated
    // time. MFlops() divides by zero time if no transform has run yet.
    double Flops(void) {return flops;}
    double MFlops(void) {return flops/usec;}

    FFT ( GridCartesian * grid ) : 
      vgrid(grid),
      Nd(grid->_ndimension),
      dimensions(grid->_fdimensions),
      processors(grid->_processors),
      processor_coor(grid->_processor_coor)
    {
      flops=0;
      flops_call=0;   // previously left uninitialised until the first FFT_dim()
      usec =0;
      std::vector<int> layout(Nd,1);
      sgrid = new GridCartesian(dimensions,layout,processors);
    }

    ~FFT ( void)  { 
      delete sgrid; 
    }

    // Transforms `source` along dimension `dim` and writes the result to `result`.
    //   result, source : lattices on the grid given to the constructor
    //   dim            : dimension to transform
    //   inverse        : zero selects FFTW_FORWARD, non-zero FFTW_BACKWARD
    template<class vobj>
    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){

      conformable(result._grid,vgrid);
      conformable(source._grid,vgrid);

      int L = vgrid->_ldimensions[dim];
      int G = vgrid->_fdimensions[dim];

      std::vector<int> layout(Nd,1);
      std::vector<int> pencil_gd(vgrid->_fdimensions);

      // Scale the global extent by the processor count so that, once divided
      // among the processors, every node holds the full length G locally.
      pencil_gd[dim] = G*processors[dim];    

      // Pencil global vol LxLxGxLxL per node
      GridCartesian pencil_g(pencil_gd,layout,processors);

      // Construct pencils
      typedef typename vobj::scalar_object sobj;
      typedef typename sobj::scalar_type   scalar;

      Lattice<vobj> ssource(vgrid); ssource =source;
      Lattice<sobj> pgsource(&pencil_g);
      Lattice<sobj> pgresult(&pencil_g); pgresult=zero;

#ifndef HAVE_FFTW	
      assert(0);
#else 
      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;

      {
        int Ncomp = sizeof(sobj)/sizeof(scalar);
        int Nlow  = 1;
        for(int d=0;d<dim;d++){
          Nlow*=vgrid->_ldimensions[d];
        }

        int rank = 1;  /* 1d transforms */
        int n[] = {G}; /* 1d transforms of length G */
        int howmany = Ncomp;
        int odist,idist,istride,ostride;
        idist   = odist   = 1;          /* Distance between consecutive FT's */
        istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
        int *inembed = n, *onembed = n;

        int sign = FFTW_FORWARD;
        if (inverse) sign = FFTW_BACKWARD;

        FFTW_plan p;
        {
          FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
          FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
          p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
                                               in,inembed,
                                               istride,idist,
                                               out,onembed,
                                               ostride, odist,
                                               sign,FFTW_ESTIMATE);
        }

        double add,mul,fma;
        FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
        flops_call = add+mul+2.0*fma;

        // Barrel shift and collect global pencil
        const int pc = processor_coor[dim];
        for(int shift=0;shift<processors[dim];shift++) { 

          // After `shift` Cshifts by L this node holds the block that started
          // out on processor (pc+shift) mod Np, so it must land in that
          // processor's slot of the pencil. Using shift*L here would leave the
          // pencil cyclically rotated on every node with pc != 0.
          const int owner = (pc+shift) % processors[dim];

          for(int idx=0;idx<sgrid->lSites();idx++) { 
            std::vector<int> lcoor(Nd);
            sgrid->LocalIndexToLocalCoor(idx,lcoor);

            sobj s;
            peekLocalSite(s,ssource,lcoor);

            lcoor[dim]+=owner*L;
            pokeLocalSite(s,pgsource,lcoor);
          }

          ssource = Cshift(ssource,dim,L);
        }

        // Loop over orthog coords
        int NN=pencil_g.lSites();
        GridStopWatch Timer;
        Timer.Start();

PARALLEL_FOR_LOOP
        for(int idx=0;idx<NN;idx++) { 
          std::vector<int> lcoor(Nd);
          pencil_g.LocalIndexToLocalCoor(idx,lcoor);

          if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
            FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
            FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
            FFTW<scalar>::fftw_execute_dft(p,in,out);
          }
        }

        Timer.Stop();
        usec += Timer.useconds();
        // NOTE(review): the plan is executed only on the lcoor[dim]==0 plane,
        // yet the estimate is scaled by all NN pencil sites; confirm intent.
        flops+= flops_call*NN;

        // Extract this node's slice of the transformed pencil.
        for(int idx=0;idx<sgrid->lSites();idx++) { 
          std::vector<int> lcoor(Nd);
          sgrid->LocalIndexToLocalCoor(idx,lcoor);
          std::vector<int> gcoor = lcoor;
          // extract the result
          sobj s;
          gcoor[dim] = lcoor[dim]+L*pc;
          peekLocalSite(s,pgresult,gcoor);
          pokeLocalSite(s,result,lcoor);
        }

        FFTW<scalar>::fftw_destroy_plan(p);
      }
#endif
    }
  };
 }
 #endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@ -68,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Simd.h>
 #include <Grid/Threads.h>
 #include <Grid/Lexicographic.h>
 #include <Grid/Init.h>
 #include <Grid/Communicator.h> 
 #include <Grid/Cartesian.h>    
 #include <Grid/Tensors.h>      
@ -78,7 +79,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/parallelIO/NerscIO.h>
-#include <Grid/Init.h>
+
 #include <Grid/FFT.h>
 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
--- a/lib/Init.cc
+++ b/lib/Init.cc
@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
    std::vector<int> cores(0);
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
    GridLogConfigure(logstreams);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -17,8 +17,8 @@ endif
 include Make.inc
 include Eigen.inc
-lib_LTLIBRARIES = libGrid.la
+lib_LIBRARIES = libGrid.a
-libGrid_la_SOURCES             = $(CCFILES) $(extra_sources)
+libGrid_a_SOURCES              = $(CCFILES) $(extra_sources)
-libGrid_ladir                  = $(pkgincludedir)
+libGrid_adir                   = $(pkgincludedir)
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@ -265,7 +265,7 @@
 	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
       }
       inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
+	 //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 	 local = _entries[ent]._is_local;
 	 perm  = _entries[ent]._permute;
 	 if (perm)  ptype = _permute_type[point]; 
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@ -127,21 +127,12 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);
    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 			    void *xmit,
 			    int xmit_to_rank,
 			    void *recv,
 			    int recv_from_rank,
 			    int bytes);
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
    ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@ -144,28 +144,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }
 // Basic Halo comms primitive
 // Basic Halo comms primitive
 // Creates one persistent send request (to `dest`) and one persistent receive
 // request (from `from`) of `bytes` bytes each, and appends both to `list`.
 // Nothing is transmitted here; the requests still have to be started.
 void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 					       void *xmit,
 					       int dest,
 					       void *recv,
 					       int from,
 					       int bytes)
 {
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  // Outgoing messages are tagged with the sender's own rank, so the matching
  // receive must name `from` as both source and tag (as the Isend/Irecv pair
  // in SendToRecvFromBegin does). The receive was previously posted against
  // `dest` with this rank's tag.
  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 {
  MPI_Startall(list.size(),&list[0]);
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -173,12 +151,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
+  MPI_Request xrq;
-  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
+  MPI_Request rrq;
-  SendToRecvFromBegin(reqs);
+  int rank = _processor;
-  for(int i=0;i<reqs.size();i++){
+  int ierr;
-    list.push_back(reqs[i]);
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  }
+  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@ -84,19 +84,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
  assert(0);
 }
 // Stub for this communicator build: the body is a bare assert(0), so calling
 // it aborts whenever assertions are enabled.
 void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  assert(0);
 }
 // Stub for this communicator build: the body is a bare assert(0), so calling
 // it aborts whenever assertions are enabled.
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 {
  assert(0);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@ -268,10 +268,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }
 // Basic Halo comms primitive
 // List-only overload: not implemented for this communicator; the body is a
 // bare assert(0).
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
 {
  assert(0); //unimplemented
 }
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@ -284,15 +280,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);
 }
 // Not implemented for this communicator; the body is a bare assert(0) and
 // none of the arguments are used.
 void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  assert(0); // Unimplemented
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
  }
-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<ig->lSites();idx++){
    std::vector<int> lcoor(ni);
    ig->LocalIndexToLocalCoor(idx,lcoor);
@ -446,6 +446,79 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 }
 template<class vobj>
 void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  // Site-by-site copy from lowDim into higherDim: every local site of
  // lowDim whose coordinate along `orthog` equals slice_lo is written to
  // higherDim at the same coordinates, with the `orthog` coordinate
  // replaced by slice_hi.
  typedef typename vobj::scalar_object sobj;

  GridBase *lg = lowDim._grid;
  GridBase *hg = higherDim._grid;
  const int nl = lg->_ndimension;
  const int nh = hg->_ndimension;

  // Both grids must have the same number of dimensions and `orthog`
  // must index one of them.
  assert(nl == nh);
  assert(orthog<nh);
  assert(orthog>=0);

  // Identical processor layout and local extent in every dimension.
  for(int d=0;d<nh;d++){
    assert(lg->_processors[d]  == hg->_processors[d]);
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
  }

  // Per the checks above, the operations below should all be local.
  sobj site; // scratch scalar object carried from source to destination
  //PARALLEL_FOR_LOOP
  for(int ss=0;ss<lg->lSites();ss++){
    std::vector<int> src(nl);
    lg->LocalIndexToLocalCoor(ss,src);
    if( src[orthog] != slice_lo ) continue; // site is not on the source slice

    std::vector<int> dst(src);
    dst[orthog] = slice_hi;
    peekLocalSite(site,lowDim,src);
    pokeLocalSite(site,higherDim,dst);
  }
 }
 template<class vobj>
 void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
  // Site-by-site copy from higherDim into lowDim: for every local site of
  // lowDim whose coordinate along `orthog` equals slice_lo, the value is
  // read from higherDim at the same coordinates with the `orthog`
  // coordinate replaced by slice_hi, and stored into lowDim.
  typedef typename vobj::scalar_object sobj;

  GridBase *lg = lowDim._grid;
  GridBase *hg = higherDim._grid;
  const int nl = lg->_ndimension;
  const int nh = hg->_ndimension;

  // Both grids must have the same number of dimensions and `orthog`
  // must index one of them.
  assert(nl == nh);
  assert(orthog<nh);
  assert(orthog>=0);

  // Identical processor layout and local extent in every dimension.
  for(int d=0;d<nh;d++){
    assert(lg->_processors[d]  == hg->_processors[d]);
    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
  }

  // Per the checks above, the operations below should all be local.
  sobj site; // scratch scalar object carried from source to destination
  //PARALLEL_FOR_LOOP
  for(int ss=0;ss<lg->lSites();ss++){
    std::vector<int> dst(nl);
    lg->LocalIndexToLocalCoor(ss,dst);
    if( dst[orthog] != slice_lo ) continue; // site is not on the destination slice

    std::vector<int> src(dst);
    src[orthog] = slice_hi;
    peekLocalSite(site,higherDim,src);
    pokeLocalSite(site,lowDim,dst);
  }
 }
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@ -194,22 +194,22 @@ class BinaryIO {
      std::vector<int> site({x,y,z,t});
-      if ( grid->IsBoss() ) {
+      if (grid->IsBoss()) {
-	fin.read((char *)&file_object,sizeof(file_object));
+        fin.read((char *)&file_object, sizeof(file_object));
-	bytes += sizeof(file_object);
+        bytes += sizeof(file_object);
-	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
-	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
+        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
-	munge(file_object,munged,csum);
+        munge(file_object, munged, csum);
      }
      // The boss who read the file has their value poked
      pokeSite(munged,Umu,site);
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@ -254,20 +254,20 @@ class BinaryIO {
      if ( grid->IsBoss() ) {
-	
+  
-	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
-	// NB could gather an xstrip as an optimisation.
+  // NB could gather an xstrip as an optimisation.
-	fout.write((char *)&file_object,sizeof(file_object));
+  fout.write((char *)&file_object,sizeof(file_object));
-	bytes+=sizeof(file_object);
+  bytes+=sizeof(file_object);
      }
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@ -305,15 +305,15 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if( rank == grid->ThisRank() ){
-	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
+  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
-	parallel.GetState(saved,l_idx);
+  parallel.GetState(saved,l_idx);
      }
      grid->Broadcast(rank,(void *)&saved[0],bytes);
      if ( grid->IsBoss() ) {
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-	fout.write((char *)&saved[0],bytes);
+  fout.write((char *)&saved[0],bytes);
      }
    }
@ -355,14 +355,14 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);
      if ( grid->IsBoss() ) {
-	fin.read((char *)&saved[0],bytes);
+  fin.read((char *)&saved[0],bytes);
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
      }
      grid->Broadcast(0,(void *)&saved[0],bytes);
      if( rank == grid->ThisRank() ){
-	parallel.SetState(saved,l_idx);
+  parallel.SetState(saved,l_idx);
      }
    }
@ -415,15 +415,15 @@ class BinaryIO {
      if ( d == 0 ) parallel[d] = 0;
      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
+  range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
+  start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
+  range[d] = grid->_gdimensions[d];
-	start[d] = 0;
+  start[d] = 0;
-	ioproc[d]= 0;
+  ioproc[d]= 0;
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
    }
@ -434,9 +434,9 @@ class BinaryIO {
      std::cout<< std::dec ;
      std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
+  std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
+  if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@ -463,7 +463,7 @@ class BinaryIO {
      // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@ -472,8 +472,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  gsite[d] = tsite[d]+start[d];               // global site
      }
      /////////////////////////
@ -487,29 +487,29 @@ class BinaryIO {
      // iorank reads from the seek
      ////////////////////////////////
      if (myrank == iorank) {
-	
+  
-	fin.seekg(offset+g_idx*sizeof(fileObj));
+  fin.seekg(offset+g_idx*sizeof(fileObj));
-	fin.read((char *)&fileObj,sizeof(fileObj));
+  fin.read((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  bytes+=sizeof(fileObj);
-	
+  
-	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
-	
+  
-	munge(fileObj,siteObj,csum);
+  munge(fileObj,siteObj,csum);
-      }	
+      } 
      // Possibly do transport through pt2pt 
      if ( rank != iorank ) { 
-	if ( (myrank == rank) || (myrank==iorank) ) {
+  if ( (myrank == rank) || (myrank==iorank) ) {
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
-	}
+  }
      }
      // Poke at destination
      if ( myrank == rank ) {
-	  pokeLocalSite(siteObj,Umu,lsite);
+    pokeLocalSite(siteObj,Umu,lsite);
      }
      grid->Barrier(); // necessary?
    }
@ -520,7 +520,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
@ -558,15 +558,15 @@ class BinaryIO {
      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
+  range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
+  start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
+  range[d] = grid->_gdimensions[d];
-	start[d] = 0;
+  start[d] = 0;
-	ioproc[d]= 0;
+  ioproc[d]= 0;
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
@ -577,9 +577,9 @@ class BinaryIO {
      grid->GlobalSum(tmp);
      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
+  std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
+  if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@ -610,7 +610,7 @@ class BinaryIO {
    // should aggregate a whole chunk and then write.
    // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@ -619,8 +619,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);
      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  gsite[d] = tsite[d]+start[d];               // global site
      }
@ -640,26 +640,26 @@ class BinaryIO {
      // Pair of nodes may need to do pt2pt send
      if ( rank != iorank ) { // comms is necessary
-	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	  // Send to IOrank 
+    // Send to IOrank 
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
-	}
+  }
      }
      grid->Barrier(); // necessary?
      if (myrank == iorank) {
-	
+  
-	munge(siteObj,fileObj,csum);
+  munge(siteObj,fileObj,csum);
-	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
-	
+  
-	fout.seekp(offset+g_idx*sizeof(fileObj));
+  fout.seekp(offset+g_idx*sizeof(fileObj));
-	fout.write((char *)&fileObj,sizeof(fileObj));
+  fout.write((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  bytes+=sizeof(fileObj);
      }
    }
@ -668,7 +668,7 @@ class BinaryIO {
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    return csum;
  }
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@ -55,11 +55,14 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
    // note: static const int or constexpr will not work for type deductions
    //       with the intel compiler (up to version 17), hence the #defines below
    //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 2;
+    #define ColourIndex  2
-    static const int SpinIndex   = 1;
+    #define SpinIndex    1
-    static const int LorentzIndex= 0;
+    #define LorentzIndex 0
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@ -111,12 +111,16 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
  template class A<ZWilsonImplF>;		\
  template class A<ZWilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		
 #define FermOp5dVecTemplateInstantiate(A) \
  template class A<DomainWallVec5dImplF>;	\
-  template class A<DomainWallVec5dImplD>;	
+  template class A<DomainWallVec5dImplD>;	\
  template class A<ZDomainWallVec5dImplF>;	\
  template class A<ZDomainWallVec5dImplD>;	
 #define FermOpTemplateInstantiate(A) \
 FermOp4dVecTemplateInstantiate(A) \
@ -138,6 +142,7 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/MobiusFermion.h>
 #include <Grid/qcd/action/fermion/ZMobiusFermion.h>
 #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
@ -176,6 +181,11 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
 typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@ -54,18 +54,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag (Ls,1.0);
+  std::vector<Coeff_t> diag (Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0); upper[Ls-1]=mass;
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
-  std::vector<RealD> lower(Ls,-1.0); lower[0]   =mass;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bs;
+  std::vector<Coeff_t> diag = bs;
-  std::vector<RealD> upper= cs;
+  std::vector<Coeff_t> upper= cs;
-  std::vector<RealD> lower= cs; 
+  std::vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@ -73,9 +73,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = beo;
+  std::vector<Coeff_t> diag = beo;
-  std::vector<RealD> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@ -88,9 +88,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
+  std::vector<Coeff_t> diag = bee;
-  std::vector<RealD> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@ -104,9 +104,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
+  std::vector<Coeff_t> diag = bee;
-  std::vector<RealD> upper(Ls);
+  std::vector<Coeff_t> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@ -129,9 +129,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag(Ls,1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
-  std::vector<RealD> lower(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@ -141,9 +141,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag =bs;
+  std::vector<Coeff_t> diag =bs;
-  std::vector<RealD> upper=cs;
+  std::vector<Coeff_t> upper=cs;
-  std::vector<RealD> lower=cs;
+  std::vector<Coeff_t> lower=cs;
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,psi,Din,lower,diag,upper);
@ -273,11 +273,21 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  SetCoefficientsZolotarev(1.0,zdata,b,c);
+  std::vector<Coeff_t> gamma(this->Ls);
  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
  SetCoefficientsInternal(1.0,gamma,b,c);
 }
 //Zolo
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
  // Copy the first Ls entries of zdata->gamma into a Coeff_t vector and
  // pass it, together with zolo_hi, b and c, to SetCoefficientsInternal.
  const int Ls = this->Ls;
  std::vector<Coeff_t> gamma(Ls);
  for(int idx=0;idx<Ls;idx++){
    gamma[idx] = zdata->gamma[idx];
  }
  SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 //Zolo
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;
@ -315,7 +325,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
  double bmc = b-c;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
-    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
@ -377,7 +387,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
  }
  { 
-    double delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@ -62,16 +62,16 @@ namespace Grid {
      void M5D(const FermionField &psi,
 	       const FermionField &phi, 
 	       FermionField &chi,
-	       std::vector<RealD> &lower,
+	       std::vector<Coeff_t> &lower,
-	       std::vector<RealD> &diag,
+	       std::vector<Coeff_t> &diag,
-	       std::vector<RealD> &upper);
+	       std::vector<Coeff_t> &upper);
      void M5Ddag(const FermionField &psi,
 		  const FermionField &phi, 
 		  FermionField &chi,
-		  std::vector<RealD> &lower,
+		  std::vector<Coeff_t> &lower,
-		  std::vector<RealD> &diag,
+		  std::vector<Coeff_t> &diag,
-		  std::vector<RealD> &upper);
+		  std::vector<Coeff_t> &upper);
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
      virtual void   Instantiatable(void)=0;
@ -91,23 +91,23 @@ namespace Grid {
      RealD mass;
      // Cayley form Moebius (tanh and zolotarev)
-      std::vector<RealD> omega; 
+      std::vector<Coeff_t> omega; 
-      std::vector<RealD> bs;    // S dependent coeffs
+      std::vector<Coeff_t> bs;    // S dependent coeffs
-      std::vector<RealD> cs;    
+      std::vector<Coeff_t> cs;    
-      std::vector<RealD> as;    
+      std::vector<Coeff_t> as;    
      // For preconditioning Cayley form
-      std::vector<RealD> bee;    
+      std::vector<Coeff_t> bee;    
-      std::vector<RealD> cee;    
+      std::vector<Coeff_t> cee;    
-      std::vector<RealD> aee;    
+      std::vector<Coeff_t> aee;    
-      std::vector<RealD> beo;    
+      std::vector<Coeff_t> beo;    
-      std::vector<RealD> ceo;    
+      std::vector<Coeff_t> ceo;    
-      std::vector<RealD> aeo;    
+      std::vector<Coeff_t> aeo;    
      // LDU factorisation of the eeoo matrix
-      std::vector<RealD> lee;    
+      std::vector<Coeff_t> lee;    
-      std::vector<RealD> leem;    
+      std::vector<Coeff_t> leem;    
-      std::vector<RealD> uee;    
+      std::vector<Coeff_t> uee;    
-      std::vector<RealD> ueem;    
+      std::vector<Coeff_t> ueem;    
-      std::vector<RealD> dee;    
+      std::vector<Coeff_t> dee;    
      // Constructors
      CayleyFermion5D(GaugeField &_Umu,
@ -117,20 +117,19 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
    };
  }
 }
 #define INSTANTIATE_DPERP(A)\
 template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
 template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
+				std::vector<Coeff_t> &lower,
-				std::vector<RealD> &diag,
+				std::vector<Coeff_t> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
@ -82,9 +82,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
+				   std::vector<Coeff_t> &lower,
-				   std::vector<RealD> &diag,
+				   std::vector<Coeff_t> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
@ -204,6 +204,8 @@ PARALLEL_FOR_LOOP
  INSTANTIATE_DPERP(WilsonImplD);
  INSTANTIATE_DPERP(GparityWilsonImplF);
  INSTANTIATE_DPERP(GparityWilsonImplD);
  INSTANTIATE_DPERP(ZWilsonImplF);
  INSTANTIATE_DPERP(ZWilsonImplD);
 #endif
 }}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
+				std::vector<Coeff_t> &lower,
-				std::vector<RealD> &diag,
+				std::vector<Coeff_t> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
@ -65,9 +65,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
+				   std::vector<Coeff_t> &lower,
-				   std::vector<RealD> &diag,
+				   std::vector<Coeff_t> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@ -53,9 +53,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
+				std::vector<Coeff_t> &lower,
-				std::vector<RealD> &diag,
+				std::vector<Coeff_t> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
@ -121,9 +121,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
+				   std::vector<Coeff_t> &lower,
-				   std::vector<RealD> &diag,
+				   std::vector<Coeff_t> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
@ -194,8 +194,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  chi.checkerboard=psi.checkerboard;
-  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
@ -212,8 +212,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
-  Eigen::MatrixXd PplusMat ;
+  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXd PminusMat;
+  Eigen::MatrixXcd PminusMat;
  if ( inv ) {
    PplusMat =Pplus.inverse();
@ -298,8 +298,12 @@ PARALLEL_FOR_LOOP
 INSTANTIATE_DPERP(DomainWallVec5dImplD);
 INSTANTIATE_DPERP(DomainWallVec5dImplF);
 INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
 INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
 template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 }}
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@ -100,7 +100,8 @@ namespace Grid {
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
    typedef typename Impl::Compressor               Compressor;		\
    typedef typename Impl::StencilImpl             StencilImpl;	  \
-    typedef typename Impl::ImplParams ImplParams;
+    typedef typename Impl::ImplParams ImplParams; \
    typedef typename Impl::Coeff_t       Coeff_t;
 #define INHERIT_IMPL_TYPES(Base) \
    INHERIT_GIMPL_TYPES(Base)\
@ -109,12 +110,14 @@ namespace Grid {
    ///////
    // Single flavour four spinors with colour index
    ///////
-    template<class S,int Nrepresentation=Nc>
+    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S, Nrepresentation> > { 
    public:
      const bool LsVectorised=false;
      typedef _Coeff_t Coeff_t;
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
@ -192,12 +195,13 @@ PARALLEL_FOR_LOOP
    ///////
    // Single flavour four spinors with colour index, 5d redblack
    ///////
-    template<class S,int Nrepresentation=Nc>
+    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
      const bool LsVectorised=true;
      typedef _Coeff_t Coeff_t;
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
@ -287,12 +291,13 @@ PARALLEL_FOR_LOOP
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
-    template<class S,int Nrepresentation>
+    template<class S,int Nrepresentation,class _Coeff_t = RealD>
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
    public:
      const bool LsVectorised=false;
      typedef _Coeff_t Coeff_t;
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
      INHERIT_GIMPL_TYPES(Gimpl);
@ -483,6 +488,18 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
    typedef WilsonImpl<vComplex ,Nc,ComplexD> ZWilsonImplR; // Real.. whichever prec
    typedef WilsonImpl<vComplexF,Nc,ComplexD> ZWilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc,ComplexD> ZWilsonImplD; // Double
    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
    typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
    typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
    typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@ -68,16 +68,21 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
-  // No asm implementation yet.
+#ifdef AVX512
-  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+  if ( AsmOpt ) {
-  //  else
+    WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-  for(int site=0;site<Ns;site++) {
+  } else {
-    for(int s=0;s<Ls;s++) {
+#else
-      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+  {  
-      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+#endif
-      sF++;
+    for(int site=0;site<Ns;site++) {
      for(int s=0;s<Ls;s++) {
 	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 	sF++;
      }
      sU++;
    }
    sU++;
  }
 }
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@ -79,6 +79,10 @@ namespace Grid {
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
     void DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
@ -92,7 +96,25 @@ namespace Grid {
     WilsonKernels(const ImplParams &p= ImplParams());
    };
-
+    
    ///////////////////////////////////////////////////////////
    // Default to no assembler implementation
    ///////////////////////////////////////////////////////////
    // Generic (unspecialised) definition of DiracOptAsmDhopSite.
    // No assembler kernel exists for an arbitrary Impl, so reaching this body is
    // treated as a programming error: every argument is ignored and assert(0) fires.
    // NOTE: when NDEBUG is defined the assert compiles away and the call returns
    // without writing anything to `out`.
    template<class Impl>
    void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
    }
    // Generic (unspecialised) definition of DiracOptAsmDhopSiteDag (the "Dag" variant
    // of the assembler site kernel). No assembler kernel exists for an arbitrary Impl,
    // so every argument is ignored and assert(0) fires if this body is ever called.
    // NOTE: when NDEBUG is defined the assert compiles away and the call returns
    // without writing anything to `out`.
    template<class Impl>
    void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
    {
      assert(0);
    }
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
@ -26,59 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+*************************************************************************************/
-    /*  END LEGAL */
+/*  END LEGAL */
 #include <Grid.h>
 namespace Grid {
-namespace QCD {
+  namespace QCD {
-
+    
  ///////////////////////////////////////////////////////////
  // Default to no assembler implementation
  ///////////////////////////////////////////////////////////
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 {
  assert(0);
 }
 #if defined(AVX512) 
-
+    
-
+    
-  ///////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////
-  // If we are AVX512 specialise the single precision routine
+    // If we are AVX512 specialise the single precision routine
-  ///////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////
-
+    
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-
+    
-static Vector<vComplexF> signs;
+    static Vector<vComplexF> signs;
-
+    
-int setupSigns(void ){
+    int setupSigns(void ){
-  Vector<vComplexF> bother(2);
+      Vector<vComplexF> bother(2);
-  signs = bother;
+      signs = bother;
-  vrsign(signs[0]);
+      vrsign(signs[0]);
-  visign(signs[1]);
+      visign(signs[1]);
-  return 1;
+      return 1;
-}
+    }
-static int signInit = setupSigns();
+    static int signInit = setupSigns();
-
+  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
-
+  
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
-template<>
+  
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+#undef KERNEL_DAG
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+    template<>
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+      
 #define KERNEL_DAG
    template<>
    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@ -89,32 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-template<>
+				    
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+#undef KERNEL_DAG
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+    template<>
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #define KERNEL_DAG
    template<>
    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #endif
-
+  }
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+}
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@ -30,7 +30,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);
 #ifdef KERNEL_DAG
    XP_PROJMEM(base);
 #else 
    XM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@ -41,15 +45,22 @@
    MULT_2SPIN_DIR_PFXP(Xp,basep);
  }
  LOAD64(%r10,isigns);
 #ifdef KERNEL_DAG
  XP_RECON;
 #else
  XM_RECON;
-
+#endif
  ////////////////////////////////
  // Yp
  ////////////////////////////////
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    YP_PROJMEM(base);
 #else
    YM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@ -60,7 +71,11 @@
    MULT_2SPIN_DIR_PFYP(Yp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  YP_RECON_ACCUM;
 #else
  YM_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Zp
@ -68,7 +83,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    ZP_PROJMEM(base);
 #else
    ZM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@ -79,7 +98,11 @@
    MULT_2SPIN_DIR_PFZP(Zp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  ZP_RECON_ACCUM;
 #else
  ZM_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Tp
@ -87,7 +110,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    TP_PROJMEM(base);
 #else
    TM_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
@ -98,7 +125,11 @@
    MULT_2SPIN_DIR_PFTP(Tp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  TP_RECON_ACCUM;
 #else
  TM_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Xm
@ -107,7 +138,11 @@
  //  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    XM_PROJMEM(base);
 #else
    XP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@ -118,7 +153,11 @@
    MULT_2SPIN_DIR_PFXM(Xm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  XM_RECON_ACCUM;
 #else
  XP_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Ym
@ -126,7 +165,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    YM_PROJMEM(base);
 #else
    YP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@ -137,7 +180,11 @@
    MULT_2SPIN_DIR_PFYM(Ym,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  YM_RECON_ACCUM;
 #else
  YP_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Zm
@ -145,7 +192,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    ZM_PROJMEM(base);
 #else
    ZP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@ -156,7 +207,11 @@
    MULT_2SPIN_DIR_PFZM(Zm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  ZM_RECON_ACCUM;
 #else
  ZP_RECON_ACCUM;
 #endif
  ////////////////////////////////
  // Tm
@ -164,7 +219,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
    TM_PROJMEM(base);
 #else
    TP_PROJMEM(base);
 #endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
@ -175,7 +234,11 @@
    MULT_2SPIN_DIR_PFTM(Tm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 #ifdef KERNEL_DAG
  TM_RECON_ACCUM;
 #else
  TP_RECON_ACCUM;
 #endif
  basep= st.GetPFInfo(nent,plocal); nent++;
  SAVE_RESULT(base,basep);
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@ -839,46 +839,23 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+#define INSTANTIATE_THEM(A) \
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-							       int ss,int sU,const FermionField &in, FermionField &out);
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+							       int ss,int sU,const FermionField &in, FermionField &out);\
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
-							       int ss,int sU,const FermionField &in, FermionField &out);
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
 template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								  int ss,int sU,const FermionField &in, FermionField &out);
-
+INSTANTIATE_THEM(WilsonImplF);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+INSTANTIATE_THEM(WilsonImplD);
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+INSTANTIATE_THEM(ZWilsonImplF);
-								      int ss,int sU,const FermionField &in, FermionField &out);
+INSTANTIATE_THEM(ZWilsonImplD);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+INSTANTIATE_THEM(GparityWilsonImplF);
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+INSTANTIATE_THEM(GparityWilsonImplD);
-								      int ss,int sU,const FermionField &in, FermionField &out);
+INSTANTIATE_THEM(DomainWallVec5dImplF);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+INSTANTIATE_THEM(DomainWallVec5dImplD);
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+INSTANTIATE_THEM(ZDomainWallVec5dImplF);
-									 int ss,int sU,const FermionField &in, FermionField &out);
+INSTANTIATE_THEM(ZDomainWallVec5dImplD);
 template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								      int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);
 }}
--- a/lib/qcd/action/fermion/ZMobiusFermion.h
+++ b/lib/qcd/action/fermion/ZMobiusFermion.h
@ -0,0 +1,79 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/ZMobiusFermion.h
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef  GRID_QCD_ZMOBIUS_FERMION_H
 #define  GRID_QCD_ZMOBIUS_FERMION_H
 #include <Grid/Grid.h>
 namespace Grid {
  namespace QCD {
    // CayleyFermion5D specialisation whose per-slice gamma coefficients are supplied by
    // the caller as complex numbers rather than being computed internally.
    template<class Impl>
    class ZMobiusFermion : public CayleyFermion5D<Impl>
    {
    public:
      INHERIT_IMPL_TYPES(Impl);
    public:

      virtual void   Instantiatable(void) {};

      // Constructor.
      //   _Umu .. FourDimRedBlackGrid, _mass, _M5, p : forwarded unchanged to CayleyFermion5D<Impl>.
      //   gamma : one coefficient per fifth-dimension slice; the first Ls entries are read,
      //           so it must hold at least Ls elements (checked by assert).
      //   b, c  : forwarded to SetCoefficientsInternal together with the converted gamma.
      ZMobiusFermion(GaugeField &_Umu,
		     GridCartesian         &FiveDimGrid,
		     GridRedBlackCartesian &FiveDimRedBlackGrid,
		     GridCartesian         &FourDimGrid,
		     GridRedBlackCartesian &FourDimRedBlackGrid,
		     RealD _mass,RealD _M5,
		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
      CayleyFermion5D<Impl>(_Umu,
			    FiveDimGrid,
			    FiveDimRedBlackGrid,
			    FourDimGrid,
			    FourDimRedBlackGrid,_mass,_M5,p)
      {
	// gamma[s] is read for every s in [0,Ls); a shorter vector would be an
	// out-of-bounds read, so reject it up front.
	assert(gamma.size() >= static_cast<std::size_t>(this->Ls));

	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;

	// Convert the caller's ComplexD coefficients element-wise to the Impl's Coeff_t.
	std::vector<Coeff_t> zgamma(this->Ls);
	for(int s=0;s<this->Ls;s++){
	  zgamma[s] = gamma[s];
	}

	// Call base setter
	this->SetCoefficientsInternal(1.0,zgamma,b,c);
      }

    };
  }
 }
 #endif
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@ -1,300 +1,421 @@
-    /*************************************************************************************
+/*******************************************************************************
-
+ 
-    Grid physics library, www.github.com/paboyle/Grid 
+ Grid physics library, www.github.com/paboyle/Grid
-
+ 
-    Source file: ./lib/simd/Grid_qpx.h
+ Source file: ./lib/simd/Grid_qpx.h
-
+ 
-    Copyright (C) 2015
+ Copyright (C) 2016
-
+ 
-Author: neo <cossu@post.kek.jp>
+ Author: Antonin Portelli <antonin.portelli@me.com>
-
+ 
-    This program is free software; you can redistribute it and/or modify
+ This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+ it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+ the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
+ (at your option) any later version.
-
+ 
-    This program is distributed in the hope that it will be useful,
+ This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+ GNU General Public License for more details.
-
+ 
-    You should have received a copy of the GNU General Public License along
+ You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
+ with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
+ 
-    See the full license in the file "LICENSE" in the top level distribution directory
+ See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
+ ******************************************************************************/
    /*  END LEGAL */
 //----------------------------------------------------------------------
 /*! @file Grid_qpx.h
  @brief Optimization libraries for QPX instructions set for BG/Q
  Using intrinsics
 */
 // Time-stamp: <2015-05-27 11:30:21 neo>
 //----------------------------------------------------------------------
 // lot of undefined functions
 namespace Grid {
 namespace Optimization {
  typedef struct 
  {
    float v0,v1,v2,v3;
  } vector4float;
  // Writes the four elements of a vector4double to `stream` as "{e0,e1,e2,e3}"
  // and returns the stream so insertions can be chained.
  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
  {
    stream << "{";
    stream << vec_extract(a,0) << ",";
    stream << vec_extract(a,1) << ",";
    stream << vec_extract(a,2) << ",";
    stream << vec_extract(a,3);
    stream << "}";
    return stream;
  }
  // Writes the four members of a vector4float to `stream` as "{v0,v1,v2,v3}"
  // and returns the stream so insertions can be chained.
  inline std::ostream & operator<<(std::ostream& stream, const vector4float a)
  {
    const float lanes[4] = {a.v0, a.v1, a.v2, a.v3};
    stream << "{";
    for (int i = 0; i < 4; i++) {
      if (i > 0) stream << ",";   // separator between elements, none before the first
      stream << lanes[i];
    }
    stream << "}";
    return stream;
  }
  struct Vsplat{
    //Complex float
-    inline float operator()(float a, float b){
+    inline vector4float operator()(float a, float b){
-      return {a,b,a,b};
+      return (vector4float){a, b, a, b};
    }
    // Real float
-    inline float operator()(float a){
+    inline vector4float operator()(float a){
-      return {a,a,a,a};
+      return (vector4float){a, a, a, a};
    }
    //Complex double
    inline vector4double operator()(double a, double b){
-      return {a,b,a,b};
+      return (vector4double){a, b, a, b};
    }
    //Real double
    inline vector4double operator()(double a){
-      return {a,a,a,a};
+      return (vector4double){a, a, a, a};
    }
    //Integer
    inline int operator()(Integer a){
-#error
+      return a;
    }
  };
-
+  
  struct Vstore{
-    //Float 
+    //Float
-    inline void operator()(float a, float* F){
+    inline void operator()(vector4double a, float *f){
-      assert(0);
+      vec_st(a, 0, f);
    }
    inline void operator()(vector4double a, vector4float &f){
      vec_st(a, 0, (float *)(&f));
    }
    inline void operator()(vector4float a, float *f){
      f[0] = a.v0;
      f[1] = a.v1;
      f[2] = a.v2;
      f[3] = a.v3;
    }
    //Double
-    inline void operator()(vector4double a, double* D){
+    inline void operator()(vector4double a, double *d){
-      assert(0);
+      vec_st(a, 0, d);
    }
    //Integer
-    inline void operator()(int a, Integer* I){
+    inline void operator()(int a, Integer *i){
-      assert(0);
+      i[0] = a;
    }
  };
-
+  
  struct Vstream{
    //Float
-    inline void operator()(float * a, float b){
+    inline void operator()(float *f, vector4double a){
-      assert(0);
+      vec_st(a, 0, f);
    }
    //Double
    inline void operator()(double * a, vector4double b){
      assert(0);
    }
    inline void operator()(vector4float f, vector4double a){
      vec_st(a, 0, (float *)(&f));
    }
    inline void operator()(float *f, vector4float a){
      f[0] = a.v0;
      f[1] = a.v1;
      f[2] = a.v2;
      f[3] = a.v3;
    }
    //Double
    inline void operator()(double *d, vector4double a){
      vec_st(a, 0, d);
    }
  };
-
+  
  struct Vset{
-    // Complex float 
+    // Complex float
-    inline float operator()(Grid::ComplexF *a){
+    inline vector4float operator()(Grid::ComplexF *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+      return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
    }
-    // Complex double 
+    // Complex double
    inline vector4double operator()(Grid::ComplexD *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+      return vec_ld(0, (double *)a);
    }
-    // Real float 
+
-    inline float operator()(float *a){
+    // Real float
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+    inline vector4float operator()(float *a){
      return (vector4float){a[0], a[1], a[2], a[3]};
    }
    inline vector4double operator()(vector4float a){
      return vec_ld(0, (float *)(&a));
    }
    // Real double
    inline vector4double operator()(double *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+      return vec_ld(0, a);
    }
    // Integer
    inline int operator()(Integer *a){
-#error
+      return a[0];
    }
-
+    
-
+    
  };
-
+  
  template <typename Out_type, typename In_type>
-    struct Reduce{
+  struct Reduce{
-      //Need templated class to overload output type
+    //Need templated class to overload output type
-      //General form must generate error if compiled
+    //General form must generate error if compiled
-      inline Out_type operator()(In_type in){
+    inline Out_type operator()(In_type in){
-	printf("Error, using wrong Reduce function\n");
+      printf("Error, using wrong Reduce function\n");
-	exit(1);
+      exit(1);
-	return 0;
+      return 0;
-      }
+    }
-    };
+  };
-
+  
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
  // FLOAT_WRAP_2(fn, pref): expands to a definition of
  //     pref vector4float fn(vector4float a, vector4float b)
  // that loads both arguments into vector4double registers with Vset, calls the
  // vector4double overload of `fn` on them, and writes the result back into a
  // vector4float with Vstore. `pref` carries the declaration specifiers
  // (e.g. `inline`, `static inline`). The vector4double overload of `fn` must
  // already be declared at the point of expansion.
  #define FLOAT_WRAP_2(fn, pref)\
  pref vector4float fn(vector4float a, vector4float b)\
  {\
    vector4double ad, bd, rd;\
    vector4float  r;\
    \
    ad = Vset()(a);\
    bd = Vset()(b);\
    rd = fn(ad, bd);\
    Vstore()(rd, r);\
    \
    return r;\
  }
  // FLOAT_WRAP_1(fn, pref): single-argument counterpart of FLOAT_WRAP_2. Expands to
  //     pref vector4float fn(vector4float a)
  // which loads `a` into a vector4double with Vset, calls the vector4double overload
  // of `fn`, and writes the result back into a vector4float with Vstore.
  // The vector4double overload of `fn` must already be declared at the point of expansion.
  #define FLOAT_WRAP_1(fn, pref)\
  pref vector4float fn(vector4float a)\
  {\
    vector4double ad, rd;\
    vector4float  r;\
    \
    ad = Vset()(a);\
    rd = fn(ad);\
    Vstore()(rd, r);\
    \
    return r;\
  }
  struct Sum{
    //Complex/Real float
    inline float operator()(float a, float b){
 #error
    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-      return vec_add(a,b);
+      return vec_add(a, b);
    }
    //Complex/Real float
    FLOAT_WRAP_2(operator(), inline)
    //Integer
    inline int operator()(int a, int b){
-#error
+      return a + b;
    }
  };
-
+  
  struct Sub{
    //Complex/Real float
    inline float operator()(float a, float b){
 #error
    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_sub(a, b);
    }
    //Complex/Real float
    FLOAT_WRAP_2(operator(), inline)
    //Integer
-    inline floati operator()(int a, int b){
+    inline int operator()(int a, int b){
-#error
+      return a - b;
    }
  };
-
+  
  struct MultComplex{
    // Complex float
    inline float operator()(float a, float b){
 #error
    }
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_xxnpmadd(a, b, vec_xmul(b, a));
    }
  };
    // Complex float
    FLOAT_WRAP_2(operator(), inline)
  };
  struct Mult{
    // Real float
    inline float operator()(float a, float b){
 #error
    }
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_mul(a, b);
    }
    // Real float
    FLOAT_WRAP_2(operator(), inline)
    // Integer
    inline int operator()(int a, int b){
-#error
+      return a*b;
    }
  };
-
+  
  struct Conj{
    // Complex single
    inline float operator()(float in){
      assert(0);
    }
    // Complex double
-    inline vector4double operator()(vector4double in){
+    inline vector4double operator()(vector4double v){
-      assert(0);
+      return vec_mul(v, (vector4double){1., -1., 1., -1.});
    }
    // do not define for integer input
  };
    // Complex float
    FLOAT_WRAP_1(operator(), inline)
  };
  struct TimesMinusI{
    //Complex single
    inline float operator()(float in, float ret){
      assert(0);
    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
+    inline vector4double operator()(vector4double v, vector4double ret){
-      assert(0);
+      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
                               (vector4double){0., 0., 0., 0.});
    }
-
+    // Complex float
    FLOAT_WRAP_2(operator(), inline)
  };
-
+  
  struct TimesI{
    //Complex single
    inline float operator()(float in, float ret){
    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
+    inline vector4double operator()(vector4double v, vector4double ret){
-  
+      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
                              (vector4double){0., 0., 0., 0.});
    }
-
+    // Complex float
    FLOAT_WRAP_2(operator(), inline)
  };
  // Fixed element permutations of a 4-element vector.
  // The vector4double versions are written directly with vec_perm/vec_gpci; the
  // vector4float versions are generated by FLOAT_WRAP_1, which forwards to the
  // vector4double version of the same name.
  struct Permute{
    //Complex double
    // Swap the two halves of the vector: elements (0,1,2,3) become (2,3,0,1).
    static inline vector4double Permute0(vector4double v){ //0123 -> 2301
      return vec_perm(v, v, vec_gpci(02301));
    };
    // Swap neighbouring elements within each half: (0,1,2,3) become (1,0,3,2).
    static inline vector4double Permute1(vector4double v){ //0123 -> 1032
      return vec_perm(v, v, vec_gpci(01032));
    };
    // Identity: returns v unchanged.
    static inline vector4double Permute2(vector4double v){
      return v;
    };
    // Identity: returns v unchanged.
    static inline vector4double Permute3(vector4double v){
      return v;
    };
    // Complex float
    // vector4float overloads of Permute0..Permute3, generated from the versions above.
    FLOAT_WRAP_1(Permute0, static inline)
    FLOAT_WRAP_1(Permute1, static inline)
    FLOAT_WRAP_1(Permute2, static inline)
    FLOAT_WRAP_1(Permute3, static inline)
  };
  // Lane rotation by n positions, for n in 0..3.
  struct Rotate{
    // Double-precision version. n == 0 returns v unchanged; n == 1, 2, 3 apply
    // vec_perm with the patterns 01230, 02301 and 03012.
    // NOTE(review): the default branch only asserts; with assertions compiled
    // out, an out-of-range n flows off the end without returning a value.
    static inline vector4double rotate(vector4double v, int n){
      switch(n){
        case 0:
          return v;
          break;
        case 1:
          return vec_perm(v, v, vec_gpci(01230));
          break;
        case 2:
          return vec_perm(v, v, vec_gpci(02301));
          break;
        case 3:
          return vec_perm(v, v, vec_gpci(03012));
          break;
        default: assert(0);
      }
    }
    // Single-precision version: the added lines convert v with Vset, rotate the
    // converted value with the overload above, and write the result into r with
    // Vstore. Vset and Vstore are defined outside this chunk.
    static inline vector4float rotate(vector4float v, int n){
      vector4double vd, rd;
      vector4float  r;
-  //////////////////////////////////////////////
+      vd = Vset()(v);
-  // Some Template specialization
+      rd = rotate(vd, n);
      Vstore()(rd, r);
      return r;
    }
  };
  // Complex float Reduce
  // Combines v with its Permute0 image (lanes 0123 -> 2301) through
  // Optimization::Sum and builds the result from members v0 and v1 of the
  // combined vector. Sum is defined outside this chunk; presumably a lane-wise
  // add -- TODO confirm. v2 is declared but not used here.
  template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
+  inline Grid::ComplexF
-    assert(0);
+  Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
    vector4float v1,v2;
    v1 = Optimization::Permute::Permute0(v);
    v1 = Optimization::Sum()(v1, v);
    return Grid::ComplexF(v1.v0, v1.v1);
  }
  // Real float Reduce
  // Two combine steps: v with its Permute0 image (0123 -> 2301), then that
  // result with its Permute1 image (0123 -> 1032). Member v0 of the final
  // vector is returned. Assuming Optimization::Sum adds lane-wise (defined
  // outside this chunk -- TODO confirm), v0 then holds the sum of all four lanes.
  template<>
-    inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
+  inline Grid::RealF
-    assert(0);
+  Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
    vector4float v1,v2;
    v1 = Optimization::Permute::Permute0(v);
    v1 = Optimization::Sum()(v1, v);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = Optimization::Sum()(v1, v2);
    return v1.v0;
  }
  // Complex double Reduce
  // Adds v to its Permute0 image (lanes 0123 -> 2301) with vec_add, then builds
  // the result from lane 0 (first constructor argument) and lane 1 (second
  // constructor argument) of the sum.
  template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
+  inline Grid::ComplexD
-    assert(0);
+  Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
    vector4double v1;
    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);
    return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
  }
  // Real double Reduce
  // Two vec_add steps: v plus its Permute0 image (0123 -> 2301), then that
  // result plus its Permute1 image (0123 -> 1032). Lane 0 of the final vector
  // is returned.
  template<>
-    inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
+  inline Grid::RealD
-    assert(0);
+  Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
-  }
+    vector4double v1,v2;
    v1 = Optimization::Permute::Permute0(v);
    v1 = vec_add(v1, v);
    v2 = Optimization::Permute::Permute1(v1);
    v1 = vec_add(v1, v2);
    return vec_extract(v1, 0);
  }
  // Integer Reduce
  // Not implemented: prints a diagnostic and asserts.
  // NOTE(review): there is no return statement, so with assertions compiled out
  // a call flows off the end of a function declared to return Integer.
  template<>
-    inline Integer Reduce<Integer, floati>::operator()(float in){
+  inline Integer Reduce<Integer, int>::operator()(int in){
    // FIXME unimplemented
    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
 }
-//////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
-// Here assign types 
+// Here assign types
-namespace Grid {
+typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
-  typedef float SIMD_Ftype  __attribute__ ((vector_size (16)));         // Single precision type
+typedef vector4double              SIMD_Dtype; // Double precision type
-  typedef vector4double SIMD_Dtype; // Double precision type
+typedef int                        SIMD_Itype; // Integer type
  typedef int SIMD_Itype;           // Integer type
-  inline void v_prefetch0(int size, const char *ptr){};
+// prefetch utilities
-
+inline void v_prefetch0(int size, const char *ptr){};
-  // Function name aliases
+inline void prefetch_HINT_T0(const char *ptr){};
  typedef Optimization::Vsplat   VsplatSIMD;
  typedef Optimization::Vstore   VstoreSIMD;
  typedef Optimization::Vset     VsetSIMD;
  typedef Optimization::Vstream  VstreamSIMD;
  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
-  // Arithmetic operations
+// Function name aliases
-  typedef Optimization::Sum         SumSIMD;
+typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Mult        MultSIMD;
+typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
-  typedef Optimization::Conj        ConjSIMD;
+template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
 // Arithmetic operations
 typedef Optimization::Sum         SumSIMD;
 typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
 typedef Optimization::TimesI      TimesISIMD;
 }
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@ -388,6 +388,12 @@ class Grid_simd {
 };  // end of Grid_simd class definition
 // Scalar overloads of permute: each one copies b into y and ignores perm.
 inline void permute(ComplexD &y, ComplexD b, int perm) {
   y = b;
 }
 inline void permute(ComplexF &y, ComplexF b, int perm) {
   y = b;
 }
 inline void permute(RealD &y, RealD b, int perm) {
   y = b;
 }
 inline void permute(RealF &y, RealF b, int perm) {
   y = b;
 }
 ////////////////////////////////////////////////////////////////////
 // General rotate
 ////////////////////////////////////////////////////////////////////
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@ -67,15 +67,13 @@ template <class scalar>
 // Functor: takes real(a), passes it to asin, and returns the result as scalar.
 struct AsinRealFunctor {
  scalar operator()(const scalar &a) const {
    const auto re = real(a);
    return asin(re);
  }
 };
 // Functor: takes real(a), passes it to log, and returns the result as scalar.
 template <class scalar>
 struct LogRealFunctor {
  scalar operator()(const scalar &a) const {
    const auto re = real(a);
    return log(re);
  }
 };
 // Exponential functor. This hunk renames ExpRealFunctor to ExpFunctor and
 // changes the body from exp(real(a)) to exp(a): the replacement passes the
 // whole argument to exp instead of only its real part.
 template <class scalar>
-struct ExpRealFunctor {
+struct ExpFunctor {
-  scalar operator()(const scalar &a) const { return exp(real(a)); }
+  scalar operator()(const scalar &a) const { return exp(a); }
 };
 template <class scalar>
 struct NotFunctor {
@ -85,7 +83,6 @@ template <class scalar>
 // Functor: takes real(a), passes it to std::abs, and returns the result as scalar.
 struct AbsRealFunctor {
  scalar operator()(const scalar &a) const {
    const auto re = real(a);
    return std::abs(re);
  }
 };
 template <class scalar>
 struct PowRealFunctor {
  double y;
@ -135,7 +132,6 @@ template <class Scalar>
 // Scalar overload of rsqrt: applies RSqrtRealFunctor<Scalar> to r and returns
 // the functor's result.
 // The earlier body was `(RSqrtRealFunctor<Scalar>(), r)`. That is the built-in
 // comma operator: the temporary functor was discarded and r came back unchanged.
 // NOTE(review): RSqrtRealFunctor is defined outside this chunk; it is assumed
 // to expose `scalar operator()(const scalar &) const` like the functors
 // visible nearby -- confirm.
 inline Scalar rsqrt(const Scalar &r) {
  return RSqrtRealFunctor<Scalar>()(r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
  return SimdApply(CosRealFunctor<S>(), r);
@ -162,7 +158,7 @@ inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
 }
 // exp for Grid_simd values: forwards r to SimdApply together with an exp
 // functor. This hunk switches the functor from ExpRealFunctor<S> to
 // ExpFunctor<S>.
 template <class S, class V>
 inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
-  return SimdApply(ExpRealFunctor<S>(), r);
+  return SimdApply(ExpFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
--- a/scripts/update_fftw.sh
+++ b/scripts/update_fftw.sh
@ -0,0 +1,18 @@
 #!/usr/bin/env bash
 # Refresh lib/fftw/fftw3.h from an FFTW source archive.
 # usage: update_fftw.sh <archive>
 # All paths (lib/fftw and the extracted archive directory) are relative to the
 # directory the script is started from.
 #
 # Stop at the first failing command or unset variable. pipefail is left off on
 # purpose: `head -n1` may close the pipe while tar is still listing.
 set -eu
 if (( $# != 1 )); then
    echo "usage: $(basename "$0") <archive>" 1>&2
    exit 1
 fi
 ARC=$1
 INITDIR=$(pwd)
 # First path component of the first archive entry.
 ARCDIR=$(tar -tf "${ARC}" | head -n1 | sed -e 's@/.*@@')
 # An unreadable archive yields an empty name and a "./"-prefixed listing yields
 # "."; neither may reach the cp / rm -rf commands below.
 if [[ -z "${ARCDIR}" || "${ARCDIR}" == "." ]]; then
    echo "error: cannot determine top-level directory of ${ARC}" 1>&2
    exit 1
 fi
 tar -xf "${ARC}"
 # Check for the header before the existing lib/fftw is removed, so a wrong
 # archive does not leave the tree without fftw3.h.
 if [[ ! -f "${ARCDIR}/api/fftw3.h" ]]; then
    echo "error: ${ARCDIR}/api/fftw3.h not found in ${ARC}" 1>&2
    rm -rf "${ARCDIR}"
    exit 1
 fi
 rm -rf lib/fftw
 mkdir lib/fftw
 cp "${ARCDIR}/api/fftw3.h" lib/fftw/
 cd "${INITDIR}"
 rm -rf "${ARCDIR}"
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@ -157,10 +157,9 @@ void Tester(const functor &func)
  std::cout << GridLogMessage << " " << func.name() << std::endl;
  std::cout << GridLogDebug << v_input1 << std::endl;
  std::cout << GridLogDebug << v_input2 << std::endl;
  std::cout << GridLogDebug << v_result << std::endl;
  int ok=0;
  for(int i=0;i<Nsimd;i++){
    if ( abs(reference[i]-result[i])>1.0e-7){
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@ -0,0 +1,111 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/core/Test_fft.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // FFT check in double precision.
 // Builds C = exp(i * sum_mu -(2 pi p[mu] / latt_size[mu]) * coor_mu), applies
 // FFT::FFT_dim along dimensions 0..3 in turn, and prints norm2 of the
 // difference from a field that holds `vol` at site p and zero elsewhere. The
 // same sequence is then run on a spin-matrix field S = 0 + C. Nothing is
 // asserted; the differences are only printed.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
  std::vector<int> mpi_layout  = GridDefaultMpi();
  // Product of all lattice extents.
  int vol = 1;
  for(int extent : latt_size){
    vol *= extent;
  }
  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
  LatticeComplexD     one(&Fine);
  LatticeComplexD      zz(&Fine);
  LatticeComplexD       C(&Fine);
  LatticeComplexD  Ctilde(&Fine);
  LatticeComplexD    coor(&Fine);
  LatticeSpinMatrixD    S(&Fine);
  LatticeSpinMatrixD    Stilde(&Fine);
  std::vector<int> p({1,2,3,2});
  one = ComplexD(1.0,0.0);
  zz  = ComplexD(0.0,0.0);
  ComplexD ci(0.0,1.0);
  // Accumulate the phase, one direction at a time, then exponentiate.
  C=zero;
  for(int mu=0;mu<4;mu++){
    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
    LatticeCoordinate(coor,mu);
    C = C - (TwoPiL * p[mu]) * coor;
  }
  C = exp(C*ci);
  S=zero;
  S = S+C;
  FFT theFFT(&Fine);
  const int lastDim = 3;
  // Transform dimension by dimension. After every step except the last, the
  // output is copied back so that it feeds the next transform.
  for(int dim=0;dim<=lastDim;dim++){
    theFFT.FFT_dim(Ctilde,C,dim,FFT::forward);
    if (dim != lastDim) {
      C=Ctilde;
    }
    std::cout << theFFT.MFlops()<<std::endl;
  }
  //  C=zero;
  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
  // Reference field: vol at site p, zero elsewhere.
  TComplexD cVol;
  cVol()()() = vol;
  C=zero;
  pokeSite(cVol,C,p);
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;
  // Same procedure for the spin-matrix field.
  for(int dim=0;dim<=lastDim;dim++){
    theFFT.FFT_dim(Stilde,S,dim,FFT::forward);
    if (dim != lastDim) {
      S=Stilde;
    }
    std::cout << theFFT.MFlops()<<std::endl;
  }
  SpinMatrixD Sp; 
  Sp = zero; Sp = Sp+cVol;
  S=zero;
  pokeSite(Sp,S,p);
  S= S-Stilde;
  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
  Grid_finalize();
 }
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@ -0,0 +1,111 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/core/Test_fftf.cc
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 // FFT check in single precision.
 // Builds C = exp(i * sum_mu -(2 pi p[mu] / latt_size[mu]) * coor_mu), applies
 // FFT::FFT_dim along dimensions 0..3 in turn, and prints norm2 of the
 // difference from a field that holds `vol` at site p and zero elsewhere. The
 // same sequence is then run on a spin-matrix field S = 0 + C. Nothing is
 // asserted; the differences are only printed.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout( { vComplexF::Nsimd(),1,1,1});
  std::vector<int> mpi_layout  = GridDefaultMpi();
  // Product of all lattice extents.
  int vol = 1;
  for(int extent : latt_size){
    vol *= extent;
  }
  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
  LatticeComplexF     one(&Fine);
  LatticeComplexF      zz(&Fine);
  LatticeComplexF       C(&Fine);
  LatticeComplexF  Ctilde(&Fine);
  LatticeComplexF    coor(&Fine);
  LatticeSpinMatrixF    S(&Fine);
  LatticeSpinMatrixF    Stilde(&Fine);
  std::vector<int> p({1,2,3,2});
  one = ComplexF(1.0,0.0);
  zz  = ComplexF(0.0,0.0);
  ComplexF ci(0.0,1.0);
  // Accumulate the phase, one direction at a time, then exponentiate.
  C=zero;
  for(int mu=0;mu<4;mu++){
    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
    LatticeCoordinate(coor,mu);
    C = C - (TwoPiL * p[mu]) * coor;
  }
  C = exp(C*ci);
  S=zero;
  S = S+C;
  FFT theFFT(&Fine);
  const int lastDim = 3;
  // Transform dimension by dimension. After every step except the last, the
  // output is copied back so that it feeds the next transform.
  for(int dim=0;dim<=lastDim;dim++){
    theFFT.FFT_dim(Ctilde,C,dim,FFT::forward);
    if (dim != lastDim) {
      C=Ctilde;
    }
    std::cout << theFFT.MFlops()<<std::endl;
  }
  //  C=zero;
  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
  // Reference field: vol at site p, zero elsewhere.
  TComplexF cVol;
  cVol()()() = vol;
  C=zero;
  pokeSite(cVol,C,p);
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;
  // Same procedure for the spin-matrix field.
  for(int dim=0;dim<=lastDim;dim++){
    theFFT.FFT_dim(Stilde,S,dim,FFT::forward);
    if (dim != lastDim) {
      S=Stilde;
    }
    std::cout << theFFT.MFlops()<<std::endl;
  }
  SpinMatrixF Sp; 
  Sp = zero; Sp = Sp+cVol;
  S=zero;
  pokeSite(Sp,S,p);
  S= S-Stilde;
  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
  Grid_finalize();
 }
--- a/tests/debug/Test_cayley_even_odd_vec.cc
+++ b/tests/debug/Test_cayley_even_odd_vec.cc
@ -44,6 +44,7 @@ struct scal {
  };
 typedef DomainWallFermion<DomainWallVec5dImplR>                      DomainWallVecFermionR;
 typedef ZMobiusFermion<ZDomainWallVec5dImplR>                        ZMobiusVecFermionR;
 typedef MobiusFermion<DomainWallVec5dImplR>                          MobiusVecFermionR;
 typedef MobiusZolotarevFermion<DomainWallVec5dImplR>                 MobiusZolotarevVecFermionR;
 typedef ScaledShamirFermion<DomainWallVec5dImplR>                    ScaledShamirVecFermionR;
@ -117,6 +118,17 @@ int main (int argc, char ** argv)
  TestWhat<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);
  TestWhat<MobiusVecFermionR>(sDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5);
  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
  std::cout<<GridLogMessage <<"Z-MobiusFermion test"<<std::endl;
  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
  std::vector<ComplexD> gamma(Ls,std::complex<double>(1.0,0.0));
  ZMobiusFermionR     zDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,gamma,b,c);
  ZMobiusVecFermionR szDmob(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c);
  TestMoo(zDmob,szDmob);
  TestWhat<ZMobiusFermionR>(zDmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);
  TestWhat<ZMobiusVecFermionR>(szDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5);
  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;