Merge branch 'develop' into feature/hirep

2025-10-13 12:44:42 +01:00 · 2016-09-01 12:59:53 +01:00
parent f45ef8d114 8c89391c02
commit 0fd179fb33
75 changed files with 16078 additions and 2795 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,6 @@
 *.o
 *.obj

-
 # Editor files #
 ################
 *~
@@ -48,6 +47,7 @@ Config.h.in
 config.log
 config.status
 .deps
+*.inc

 # http://www.gnu.org/software/autoconf #
 ########################################
@@ -63,19 +63,7 @@ config.sub
 config.guess
 INSTALL
 .dirstamp
-
-# Packages #
-############
-# it's better to unpack these files and commit the raw source
-# git has its own built in compression methods
-*.7z
-*.dmg
-*.gz
-*.iso
-*.jar
-*.rar
-*.tar
-*.zip
+ltmain.sh
 
 # Logs and databases #
 ######################
@@ -101,3 +89,12 @@ build*/*
 #####################
 *.xcodeproj/*
 build.sh
+
+# Eigen source #
+################
+lib/Eigen/*
+
+# libtool macros #
+##################
+m4/lt*
+m4/libtool.m4
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,6 +23,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
@@ -35,6 +37,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
@@ -47,6 +51,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
@@ -59,6 +65,8 @@ matrix:
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
      
@@ -69,6 +77,7 @@ before_install:
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
    
 install:
@@ -82,14 +91,20 @@ install:
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
    
 script:
-    - ./autogen.sh
+    - ./bootstrap.sh
    - mkdir build
    - cd build
-    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
-    - make -j1 -C prerequisites
+    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
    - make -j4 
    - ./benchmarks/Benchmark_dwf --threads 1
    - echo make clean
-    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
+    - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
    - make -j4
-    - ./benchmarks/Benchmark_dwf --threads 1
+    - ./benchmarks/Benchmark_dwf --threads 1
+    - echo make clean
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
+    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
+    - make -j4
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,6 +1,5 @@
 # additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/include/
+SUBDIRS = lib benchmarks tests

-SUBDIRS = prerequisites lib benchmarks tests
-
-filelist: $(SUBDIRS)
+AM_CXXFLAGS += -I$(top_builddir)/include
+ACLOCAL_AMFLAGS = -I m4
--- a/README.md
+++ b/README.md
@@ -1,8 +1,28 @@
-# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid)
-Data parallel C++ mathematical object library
+# Grid
+<table>
+<tr>
+    <td>Last stable release</td>
+    <td><a href="https://travis-ci.org/paboyle/Grid">
+    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
+    </td>
+</tr>
+<tr>
+    <td>Development branch</td>
+    <td><a href="https://travis-ci.org/paboyle/Grid">
+    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
+    </td>
+</tr>
+</table>

-Last update 2015/7/30
+**Data parallel C++ mathematical object library.**

+Please send all pull requests to the `develop` branch.
+
+License: GPL v2.
+
+Last update 2016/08/03.
+
+### Description
 This library provides data parallel C++ container classes with internal memory layout
 that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 are provided, similar to HPF and cmfortran, and user control is given over the mapping of
@@ -22,37 +42,75 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.

 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON on the way).
+Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON and BG/Q QPX on the way).

-These are presented as 
-
-     vRealF, vRealD, vComplexF, vComplexD 
-
-internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named
-
-     RealF, RealD, ComplexF, ComplexD
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.

 MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see https://arxiv.org/abs/1512.03487 for more detail.

-   You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment.  Here
-are examples:
+### Installation
+First, start by cloning the repository:

-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
+``` bash
+git clone https://github.com/paboyle/Grid.git
+```

-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX
+Then enter the cloned directory and set up the build system:

-     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
+``` bash
+cd Grid
+./bootstrap.sh
+```

-     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
-     
-Note: Before running configure it could be necessary to execute the script 
-       
-       script/filelist
+Now you can execute the `configure` script to generate makefiles (here from a build directory):

+``` bash
+mkdir build; cd build
+../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
+```

-     
-For developers:
-Use reconfigure_script in the scripts/ directory to create the autotools environment 
+where `--enable-precision=` sets the default precision (`single` or `double`),
+`--enable-simd=` sets the SIMD type (see possible values below),
+`--enable-comms=` sets the protocol used for communications (`none`, `mpi`,
+`mpi-auto` or `shmem`), and `<path>` should be replaced by the prefix path where
+you want to install Grid. The `mpi-auto` communication option lets `configure`
+determine automatically how to link to MPI. Other options are available, use
+`configure --help` to display them. Like with any other program using GNU
+autotools, the `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be
+modified to customise the build.

+Finally, you can build and install Grid:
+
+``` bash
+make; make install
+```
+
+To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
+
+``` bash
+make -C tests/<subdir> tests
+```
+
+### Possible SIMD types
+
+The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
+
+| String      | Description                            |
+| ----------- | -------------------------------------- |
+| `GEN`       | generic portable vector code           |
+| `SSE4`      | SSE 4.2 (128 bit)                      |
+| `AVX`       | AVX (256 bit)                          |
+| `AVXFMA4`   | AVX (256 bit) + FMA                    |
+| `AVX2`      | AVX 2 (256 bit)                        |
+| `AVX512`    | AVX 512 bit                            |
+| `AVX512MIC` | AVX 512 bit for Intel MIC architecture |
+| `IMCI`      | Intel IMCI instructions (512 bit)      |
+
+Alternatively, some CPU codenames can be directly used:
+
+| String      | Description                            |
+| ----------- | -------------------------------------- |
+| `KNC`       | [Intel Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
+| `KNL`       | [Intel Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
--- a/autogen.sh
+++ b/autogen.sh
@@ -1,4 +0,0 @@
-aclocal -I m4
-autoheader -f
-automake -f --add-missing
-autoconf -f
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -194,7 +194,7 @@ int main (int argc, char ** argv)
    }
  }  

-
+#if 0

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
@@ -315,7 +315,7 @@ int main (int argc, char ** argv)
    }
  }

-
+#endif

  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -61,6 +61,8 @@ int main (int argc, char ** argv)
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }

+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
--- a/benchmarks/Benchmark_wilson_sweep.cc
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -0,0 +1,117 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./benchmarks/Benchmark_wilson_sweep.cc
+    Copyright (C) 2015
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Richard Rollins <rprollins@users.noreply.github.com>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+Gamma::GammaMatrix Gmu [] = {
+  Gamma::GammaX,
+  Gamma::GammaY,
+  Gamma::GammaZ,
+  Gamma::GammaT
+};
+
+bool overlapComms = false;
+
+void bench_wilson (
+		   LatticeFermion &    src,
+		   LatticeFermion & result,
+		   WilsonFermionR &     Dw,
+		   double const     volume,
+		   int const           dag );
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }
+  typename WilsonFermionR::ImplParams params;
+  params.overlapCommsCompute = overlapComms;
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  std::vector<int> seeds({1,2,3,4});
+  RealD mass = 0.1;
+
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  // Sweep L = 8, 16, ... up to Lmax (env LMAX); for each L, d counts down from 4 to dmin+1 (env DMIN).
+  int Lmax = 32;
+  int dmin = 0;
+  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
+  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
+  for (int L=8; L<=Lmax; L*=2)
+    {
+      std::vector<int> latt_size = std::vector<int>(4,L);
+      for(int d=4; d>dmin; d--)
+	{
+	  if ( d<=3 ) { latt_size[d] *= 2; } // the doubling persists for the remaining d steps
+
+	  std::cout << GridLogMessage;
+	  std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator<int>( std::cout, std::string("x").c_str() ) );
+	  std::cout << latt_size.back() << "\t\t";
+
+	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
+	  GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
+
+	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
+	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+	  LatticeFermion    src(&Grid); random(pRNG,src);
+	  LatticeFermion result(&Grid); result=zero;
+	  // Multiply the extents in double: an int product overflows once LMAX reaches 128 (128*256^3 = 2^31).
+	  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1.0,std::multiplies<double>());
+
+	  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
+      
+	  bench_wilson(src,result,Dw,volume,DaggerNo);
+	  bench_wilson(src,result,Dw,volume,DaggerYes);
+	  std::cout << std::endl;
+	}
+    }
+
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  Grid_finalize();
+}
+
+void bench_wilson (   // time ncall Dw.Dhop calls and print the resulting rate followed by two tabs
+		   LatticeFermion &    src,   // passed as the first argument of Dw.Dhop
+		   LatticeFermion & result,   // passed as the second argument of Dw.Dhop
+		   WilsonFermionR &     Dw,   // operator whose Dhop call is timed
+		   double const     volume,   // scales the flop estimate; main passes the product of the lattice extents
+		   int const           dag )  // forwarded unchanged to Dhop (main passes DaggerNo, then DaggerYes)
+{
+  int ncall    = 1000;   // number of timed Dhop calls
+  double t0    = usecond();   // presumably a microsecond timestamp, judging by the name -- TODO confirm
+  for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
+  double t1    = usecond();
+  double flops = 1344 * volume * ncall;   // 1344 is used as the flop count per site per call -- TODO confirm origin
+  std::cout << flops/(t1-t0) << "\t\t";   // no newline here: main ends the table row after both calls
+}
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -40,14 +40,20 @@ int main(int argc,char **argv)
  std::ofstream os("zmm.dat");

  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
+  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
+  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
+  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  for(int L=4;L<=32;L+=4){
    for(int m=1;m<=2;m++){
      for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
+  std::cout << GridLogMessage <<"\t";
 	for(int i=0;i<4;i++) { 
 	  std::cout << grid[i]<<"x";
 	}
-	std::cout << Ls<<std::endl;
+	std::cout << Ls<<"\t\t";
 	bench(os,grid,Ls);
      }
    }
@@ -104,7 +110,6 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  RealD M5  =1.8;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

-  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall=50;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
@@ -116,7 +121,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  double flops=1344*volume/2;

  mfc = flops*ncall/(t1-t0);
-  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+  std::cout<<mfc<<"\t\t";

  QCD::WilsonKernelsStatic::AsmOpt=1;
  t0=usecond();
@@ -125,7 +130,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  }
  t1=usecond();
  mfa = flops*ncall/(t1-t0);
-  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+  std::cout<<mfa<<"\t\t";
  /*
  int dag=DaggerNo;
  t0=usecond();
@@ -163,8 +168,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  //resulta = (-0.5) * resulta;

  diff = resulto-resulta;
-  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
-  std::cout<<std::endl;
+  std::cout<<norm2(diff)<<std::endl;
  return 0;
 }

--- a/benchmarks/Make.inc
+++ b/benchmarks/Make.inc
@@ -1,39 +0,0 @@
-
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
-
-
-Benchmark_comms_SOURCES=Benchmark_comms.cc
-Benchmark_comms_LDADD=-lGrid
-
-
-Benchmark_dwf_SOURCES=Benchmark_dwf.cc
-Benchmark_dwf_LDADD=-lGrid
-
-
-Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
-Benchmark_dwf_ntpf_LDADD=-lGrid
-
-
-Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
-Benchmark_dwf_sweep_LDADD=-lGrid
-
-
-Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
-Benchmark_memory_asynch_LDADD=-lGrid
-
-
-Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
-Benchmark_memory_bandwidth_LDADD=-lGrid
-
-
-Benchmark_su3_SOURCES=Benchmark_su3.cc
-Benchmark_su3_LDADD=-lGrid
-
-
-Benchmark_wilson_SOURCES=Benchmark_wilson.cc
-Benchmark_wilson_LDADD=-lGrid
-
-
-Benchmark_zmm_SOURCES=Benchmark_zmm.cc
-Benchmark_zmm_LDADD=-lGrid
-
--- a/benchmarks/Makefile.am
+++ b/benchmarks/Makefile.am
@@ -1,8 +1 @@
-# additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-#
-# Test code
-#
 include Make.inc
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Download the Eigen and FFTW tarballs, pass them to scripts/update_*.sh, then generate Make.inc files and configure.
+EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
+FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
+
+echo "-- deploying Eigen source..."
+wget "${EIGEN_URL}" || { echo "error: could not download ${EIGEN_URL}" >&2; exit 1; }
+./scripts/update_eigen.sh "$(basename "${EIGEN_URL}")"
+rm "$(basename "${EIGEN_URL}")"
+
+echo "-- copying fftw prototypes..."
+wget "${FFTW_URL}" || { echo "error: could not download ${FFTW_URL}" >&2; exit 1; }
+./scripts/update_fftw.sh "$(basename "${FFTW_URL}")"
+rm "$(basename "${FFTW_URL}")"
+
+echo '-- generating Make.inc files...'
+./scripts/filelist
+echo '-- generating configure script...'
+autoreconf -fvi
--- a/15329
+++ b/15329
--- a/configure.ac
+++ b/configure.ac
@@ -1,277 +1,293 @@
-#                         -*- Autoconf -*-
-# Process this file with autoconf to produce a configure script.
-#
-# Project Grid package  
-# 
-# Time-stamp: <2015-07-10 17:46:21 neo>
-
 AC_PREREQ([2.63])
-AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
-AC_CANONICAL_SYSTEM
+AC_INIT([Grid], [0.5.1-dev], [https://github.com/paboyle/Grid], [Grid])
 AM_INIT_AUTOMAKE(subdir-objects)
 AC_CONFIG_MACRO_DIR([m4])
-AC_LINK_FILES(lib,include/Grid )
 AC_CONFIG_SRCDIR([lib/Grid.h])
 AC_CONFIG_HEADERS([lib/Config.h])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

-AC_MSG_NOTICE([
-
-:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-Configuring $PACKAGE v$VERSION  for $host
-:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-])
-
-# Checks for programs.
+############### Checks for programs
 AC_LANG(C++)
+CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
+
+############ openmp  ###############
 AC_OPENMP
-AC_PROG_RANLIB
-#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
-AX_EXT

-# Checks for libraries.
-#AX_GCC_VAR_ATTRIBUTE(aligned)
+ac_openmp=no

-# Checks for header files.
+if test "${OPENMP_CXXFLAGS}X" != "X"; then
+ac_openmp=yes
+AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
+AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
+fi
+
+############ libtool ###############
+LT_INIT
+
+############### Checks for header files
 AC_CHECK_HEADERS(stdint.h)
 AC_CHECK_HEADERS(mm_malloc.h)
 AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
-AC_CHECK_HEADERS(gmp.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

-# Checks for typedefs, structures, and compiler characteristics.
+############### Checks for typedefs, structures, and compiler characteristics
 AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T

-# Checks for library functions.
-echo
-echo Checking libraries 
-echo :::::::::::::::::::::::::::::::::::::::::::
+############### GMP and MPFR #################
+AC_ARG_WITH([gmp],
+    [AS_HELP_STRING([--with-gmp=prefix],
+    [try this for a non-standard install prefix of the GMP library])],
+    [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
+    [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
+AC_ARG_WITH([mpfr],
+    [AS_HELP_STRING([--with-mpfr=prefix],
+    [try this for a non-standard install prefix of the MPFR library])],
+    [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
+    [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])

+################## lapack ####################
+AC_ARG_ENABLE([lapack],
+    [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
+    [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+
+case ${ac_LAPACK} in
+    no)
+        ;;
+    yes)
+        AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
+    *)
+        AM_CXXFLAGS="-I$ac_LAPACK/include $AM_CXXFLAGS"
+        AM_LDFLAGS="-L$ac_LAPACK/lib $AM_LDFLAGS"
+        AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
+esac
+
+################## FFTW3 ####################
+AC_ARG_WITH([fftw],    
+            [AS_HELP_STRING([--with-fftw=prefix],
+            [try this for a non-standard install prefix of the FFTW3 library])],
+            [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
+            [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
+
+#
+# What about the MKL library replacement for fftw3 ?  How do we know if fftw_execute
+# can be found in MKL? 
+#
+AC_CHECK_LIB([fftw3],[fftw_execute],
+	[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
+        [ac_fftw=no])
+
+case ${ac_fftw} in
+    no)
+        echo WARNING libfftw3 not found FFT routines will not work
+        ;;
+    yes)
+        AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f"
+esac
+
+
+################ Get compiler informations
+AC_LANG([C++])
+AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
+AX_COMPILER_VENDOR
+AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
+      [vendor of C++ compiler that will compile the code])
+AX_GXX_VERSION
+AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
+      [version of g++ that will compile the code])
+
+############### Checks for library functions
+CXXFLAGS_CPY=$CXXFLAGS
+LDFLAGS_CPY=$LDFLAGS
+LIBS_CPY=$LIBS
+CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
+LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 AC_CHECK_FUNCS([gettimeofday])
+AC_CHECK_LIB([gmp],[__gmpf_init],
+             [AC_CHECK_LIB([mpfr],[mpfr_init],
+                 [AC_DEFINE([HAVE_LIBMPFR], [1], [Define to 1 if you have the `MPFR' library (-lmpfr).])]
+                 [have_mpfr=true]
+                 [LIBS="$LIBS -lmpfr"],
+                 [AC_MSG_ERROR([MPFR library not found])])]
+   	     [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
+             [have_gmp=true]
+             [LIBS="$LIBS -lgmp"],
+             [AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])])

-#AC_CHECK_LIB([gmp],[__gmpf_init],,
-#        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.gmplib.org)])
+if test "${ac_LAPACK}x" != "nox"; then
+    AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[],
+                 [AC_MSG_ERROR("LAPACK enabled but library not found")])
+fi
+CXXFLAGS=$CXXFLAGS_CPY
+LDFLAGS=$LDFLAGS_CPY

-#AC_CHECK_LIB([mpfr],[mpfr_init],,
-#        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.mpfr.org/)])
-
-#
-# SIMD instructions selection
-#
-
-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
+############### SIMD instruction selection
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|AVX512MIC|IMCI|KNL|KNC],\
 	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
-	[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
+	[ac_SIMD=${enable_simd}],[ac_SIMD=GEN])

-supported=no
-
-ac_ZMM=no;
+case ${ax_cv_cxx_compiler_vendor} in
+  clang|gnu)
+    case ${ac_SIMD} in
+      SSE4)
+        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
+        SIMD_FLAGS='-msse4.2';;
+      AVX)
+        AC_DEFINE([AVX1],[1],[AVX intrinsics])
+        SIMD_FLAGS='-mavx';;
+      AVXFMA4)
+        AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
+        SIMD_FLAGS='-mavx -mfma4';;
+      AVX2)
+        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
+        SIMD_FLAGS='-mavx2 -mfma';;
+      AVX512|AVX512MIC|KNL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+      IMCI|KNC)
+        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        SIMD_FLAGS='';;
+      GEN)
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+        SIMD_FLAGS='';;
+      *)
+        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
+    esac;;
+  intel)
+    case ${ac_SIMD} in
+      SSE4)
+        AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
+        SIMD_FLAGS='-msse4.2 -xsse4.2';;
+      AVX)
+        AC_DEFINE([AVX1],[1],[AVX intrinsics])
+        SIMD_FLAGS='-mavx -xavx';;
+      AVXFMA4)
+        AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
+        SIMD_FLAGS='-mavx -xavx -mfma';;
+      AVX2)
+        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
+        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
+      AVX512)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        SIMD_FLAGS='-xcore-avx512';;
+      AVX512MIC|KNL)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
+        SIMD_FLAGS='-xmic-avx512';;
+      IMCI|KNC)
+        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        SIMD_FLAGS='';;
+      GEN)
+        AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
+        SIMD_FLAGS='';;
+      *)
+        AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
+    esac;;
+  *)
+    AC_MSG_WARN([Compiler unknown, using generic vector code])
+    AC_DEFINE([GENERIC_VEC],[1],[generic vector code]);;
+esac
+AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
+AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"

 case ${ac_SIMD} in
-     SSE4)
-       echo Configuring for SSE4
-       AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] )
-       if test x"$ax_cv_support_ssse3_ext" = x"yes"; then  dnl minimal support for SSE4
-         supported=yes
-       else
-  	AC_MSG_WARN([Your processor does not support SSE4 instructions])
-       fi
-     ;;
-     AVX)
-       echo Configuring for AVX
-       AC_DEFINE([AVX1],[1],[AVX Intrinsics] )
-       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
-       supported=yes			  
-       else
-       	AC_MSG_WARN([Your processor does not support AVX instructions])
-       fi
-     ;;
-     AVXFMA4)
-       echo Configuring for AVX
-       AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
-       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
-       supported=yes			  
-       else
-       	AC_MSG_WARN([Your processor does not support AVX instructions])
-       fi
-     ;;
-     AVX2)
-       echo Configuring for AVX2
-       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
-       if test x"$ax_cv_support_avx2_ext" = x"yes"; then  dnl minimal support for AVX2
-       supported=yes
-       else
-       AC_MSG_WARN([Your processor does not support AVX2 instructions])
-       fi
-     ;;
-     AVX512)
-       echo Configuring for AVX512 
-       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
-       supported="cross compilation"
-       ac_ZMM=yes;
-     ;;
-     IMCI)
-       echo Configuring for IMCI
-       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
-       supported="cross compilation"
-       ac_ZMM=no;
-     ;;
-     NEONv8)
-       echo Configuring for experimental ARMv8a support 
-       AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] )
-       supported="cross compilation"
-     ;;
-     DEBUG)
-       echo Configuring without SIMD support - only for compiler DEBUGGING!
-       AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] )
-      ;;     
-     *)
-     AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]); 
-     ;;
+  AVX512|AVX512MIC|KNL)
+    AC_DEFINE([TEST_ZMM],[1],[compile ZMM test]);;
+  *)
+	;;
 esac

-case ${ac_ZMM} in
-yes)
-	echo Enabling ZMM source code
-;;
-no)
-	echo Disabling ZMM source code
-;;
-esac
-
-AM_CONDITIONAL(BUILD_ZMM,[ test "X${ac_ZMM}X" == "XyesX" ])
-
+############### precision selection
 AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
 case ${ac_PRECISION} in
     single)
-       echo default precision is single
       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
     ;;
     double)
-       echo default precision is double
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
     ;;
 esac

-#
-# Comms selection
-#
-
-AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
+############### communication type selection
+AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|shmem],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

 case ${ac_COMMS} in
     none)
-       echo Configuring for NO communications
       AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
     ;;
+     mpi-auto)
+       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
+       LX_FIND_MPI
+       if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
+       AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
+       AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
+       AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
+       LIBS="`echo $MPI_CXXLDFLAGS | sed -E 's/-L@<:@^ @:>@+//g'` $LIBS"
+     ;;
     mpi)
-       echo Configuring for MPI communications
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
     ;;
     shmem)
-       echo Configuring for SHMEM communications
       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
     ;;
     *)
     AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); 
     ;;
 esac
-
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
-AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" = "XmpiX" || test "X${ac_COMMS}X" = "Xmpi-autoX" ])
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])

-#
-# RNG selection
-#
+############### RNG selection
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
 	[Select Random Number Generator to be used])],\
 	[ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
+
 case ${ac_RNG} in
     ranlux48)
-     AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
+      AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
     ;;
     mt19937)
-     AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
+      AC_DEFINE([RNG_MT19937],[1],[RNG_MT19937] )
     ;;
     *)
-     AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
+      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); 
     ;;
 esac

-#
-# SDE timing mode
-#
-AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
+############### timer option
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
 	[Enable system dependent high res timers])],\
 	[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
 case ${ac_TIMERS} in
     yes)
-     AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+      AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
     ;;
     no)
-     AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+      AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
     ;;
     *)
-     AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
+      AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]); 
     ;;
 esac

-#
-# Chroma regression tests
-#
+############### Chroma regression test
 AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
 case ${ac_CHROMA} in
-     yes)
-       echo Enabling tests regressing to Chroma
-     ;;
-     no)
-       echo Disabling tests regressing to Chroma
+     yes|no)
     ;;
     *)
-     AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
+       AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); 
     ;;
 esac
-
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])

-#
-# Lapack
-#
-AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
-
-case ${ac_LAPACK} in
-     yes)
-       echo Enabling lapack
-     ;;
-     no)
-       echo Disabling lapack
-     ;;
-     *)
-       echo Enabling lapack at ${ac_LAPACK}
-     ;;
-esac
-
-AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
-AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
-
-###################################################################
-# Checks for doxygen support
-# if present enables the "make doxyfile" command
-echo
-echo Checking doxygen support 
-echo :::::::::::::::::::::::::::::::::::::::::::
+############### Doxygen
 AC_PROG_DOXYGEN

 if test -n "$DOXYGEN"
@@ -279,9 +295,14 @@ then
 AC_CONFIG_FILES([docs/doxy.cfg])
 fi

-echo
-echo Creating configuration files
-echo :::::::::::::::::::::::::::::::::::::::::::
+############### Output
+cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
+AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
+AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
+AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
+AC_SUBST([AM_CFLAGS])
+AC_SUBST([AM_CXXFLAGS])
+AC_SUBST([AM_LDFLAGS])
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
@@ -293,30 +314,34 @@ AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
-AC_CONFIG_FILES(prerequisites/Makefile)
 AC_OUTPUT

-
 echo "
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Summary of configuration for $PACKAGE v$VERSION
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The following features are enabled:
-
+----- PLATFORM ----------------------------------------
 - architecture (build)          : $build_cpu
 - os (build)                    : $build_os
 - architecture (target)         : $target_cpu
 - os (target)                   : $target_os
+- compiler vendor               : ${ax_cv_cxx_compiler_vendor}
+- compiler version              : ${ax_cv_gxx_version}
+----- BUILD OPTIONS -----------------------------------
+- SIMD                          : ${ac_SIMD}
+- Threading                     : ${ac_openmp} 
+- Communications type           : ${ac_COMMS}
+- Default precision             : ${ac_PRECISION}
+- RNG choice                    : ${ac_RNG} 
+- GMP                           : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
+- LAPACK                        : ${ac_LAPACK}
+- FFTW                          : ${ac_fftw}
 - build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 - graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
- Supported SIMD flags          : $SIMD_FLAGS
----------------------------------------------------------
- enabled simd support          : ${ac_SIMD}   (config macro says supported: $supported )
- communications type           : ${ac_COMMS}
- default precision             : ${ac_PRECISION}
- RNG choice                    : ${ac_RNG} 
- LAPACK	                : ${ac_LAPACK} 
-
-
+----- BUILD FLAGS -------------------------------------
+- CXXFLAGS:  "${AM_CXXFLAGS} ${CXXFLAGS}"
+- LDFLAGS:   "${AM_LDFLAGS} ${LDFLAGS}"
+- LIBS:      "${LIBS} "
+-------------------------------------------------------
 "
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -0,0 +1,276 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/FFT.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#ifdef HAVE_FFTW	
+#include <fftw3.h>
+#endif
+namespace Grid {
+
+  template<class scalar> struct FFTW { };
+
+#ifdef HAVE_FFTW	
+  template<> struct FFTW<ComplexD> {
+  public:
+
+    typedef fftw_complex FFTW_scalar;
+    typedef fftw_plan    FFTW_plan;
+
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+					FFTW_scalar *in, const int *inembed,		
+					int istride, int idist,		
+					FFTW_scalar *out, const int *onembed,		
+					int ostride, int odist,		
+					int sign, unsigned flags) {
+      return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+    }	  
+    
+    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+      ::fftw_flops(p,add,mul,fmas);
+    }
+
+    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+      ::fftw_execute_dft(p,in,out);
+    }
+    inline static void fftw_destroy_plan(const FFTW_plan p) {
+      ::fftw_destroy_plan(p);
+    }
+  };
+
+  template<> struct FFTW<ComplexF> {
+  public:
+
+    typedef fftwf_complex FFTW_scalar;
+    typedef fftwf_plan    FFTW_plan;
+
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+					FFTW_scalar *in, const int *inembed,		
+					int istride, int idist,		
+					FFTW_scalar *out, const int *onembed,		
+					int ostride, int odist,		
+					int sign, unsigned flags) {
+      return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+    }	  
+    
+    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+      ::fftwf_flops(p,add,mul,fmas);
+    }
+
+    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+      ::fftwf_execute_dft(p,in,out);
+    }
+    inline static void fftw_destroy_plan(const FFTW_plan p) {
+      ::fftwf_destroy_plan(p);
+    }
+  };
+
+#endif
+
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#endif
+
+  class FFT { 
+  private:
+
+    GridCartesian *vgrid;
+    GridCartesian *sgrid;
+
+    int Nd;
+    double flops;
+    double flops_call;
+    uint64_t usec;
+
+    std::vector<int> dimensions;
+    std::vector<int> processors;
+    std::vector<int> processor_coor;
+
+  public:
+
+    static const int forward=FFTW_FORWARD;
+    static const int backward=FFTW_BACKWARD;
+
+    double Flops(void) {return flops;}   // flop count accumulated over every FFT_dim call on this object
+    double MFlops(void) {return usec ? flops/usec : 0.0;}   // flops per microsecond (i.e. Mflop/s); 0 until a transform has been timed, instead of dividing by zero
+
+    FFT ( GridCartesian * grid ) : 
+      vgrid(grid),
+      Nd(grid->_ndimension),
+      dimensions(grid->_fdimensions),
+      processors(grid->_processors),
+      processor_coor(grid->_processor_coor)
+    {
+      flops=0;
+      usec =0;
+      std::vector<int> layout(Nd,1);
+      sgrid = new GridCartesian(dimensions,layout,processors);
+    };
+
+    ~FFT ( void)  { 
+      delete sgrid; 
+    }
+    
+    template<class vobj>
+    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
+
+      conformable(result._grid,vgrid);
+      conformable(source._grid,vgrid);
+
+      int L = vgrid->_ldimensions[dim];
+      int G = vgrid->_fdimensions[dim];
+
+      std::vector<int> layout(Nd,1);
+      std::vector<int> pencil_gd(vgrid->_fdimensions);
+
+      pencil_gd[dim] = G*processors[dim];    
+
+      // Pencil global vol LxLxGxLxL per node
+      GridCartesian pencil_g(pencil_gd,layout,processors);
+
+      // Construct pencils
+      typedef typename vobj::scalar_object sobj;
+      typedef typename sobj::scalar_type   scalar;
+
+      Lattice<vobj> ssource(vgrid); ssource =source;
+      Lattice<sobj> pgsource(&pencil_g);
+      Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
+
+#ifndef HAVE_FFTW	
+      assert(0);
+#else 
+      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
+      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
+
+      {
+	int Ncomp = sizeof(sobj)/sizeof(scalar);
+	int Nlow  = 1;
+	for(int d=0;d<dim;d++){
+	  Nlow*=vgrid->_ldimensions[d];
+	}
+
+	int rank = 1;  /* 1d transforms */
+	int n[] = {G}; /* 1d transforms of length G */
+	int howmany = Ncomp;
+	int odist,idist,istride,ostride;
+	idist   = odist   = 1;          /* Distance between consecutive FT's */
+	istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+	int *inembed = n, *onembed = n;
+
+	
+	int sign = FFTW_FORWARD;   /* default direction: inverse==0, or the class constant FFT::forward (== FFTW_FORWARD) */
+	if (inverse && inverse != FFTW_FORWARD) sign = FFTW_BACKWARD;   /* any other non-zero value (1, FFT::backward) selects the backward transform; FFT::forward is non-zero and must not */
+
+	FFTW_plan p;
+	{
+	  FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
+	  FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
+	  p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
+					       in,inembed,
+					       istride,idist,
+					       out,onembed,
+					       ostride, odist,
+					       sign,FFTW_ESTIMATE);
+	}
+
+	double add,mul,fma;
+	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
+	flops_call = add+mul+2.0*fma;
+
+	GridStopWatch timer;
+
+	// Barrel shift and collect global pencil
+	for(int p=0;p<processors[dim];p++) { 
+
+	  for(int idx=0;idx<sgrid->lSites();idx++) { 
+
+	    std::vector<int> lcoor(Nd);
+    	    sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+	    sobj s;
+
+	    peekLocalSite(s,ssource,lcoor);
+
+	    lcoor[dim]+=p*L;
+	   
+	    pokeLocalSite(s,pgsource,lcoor);
+	  }
+
+	  ssource = Cshift(ssource,dim,L);
+	}
+	
+	// Loop over orthog coords
+	int NN=pencil_g.lSites();
+
+	GridStopWatch Timer;
+	Timer.Start();
+
+PARALLEL_FOR_LOOP
+	for(int idx=0;idx<NN;idx++) { 
+
+	  std::vector<int> lcoor(Nd);
+	  pencil_g.LocalIndexToLocalCoor(idx,lcoor);
+
+	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
+	    FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
+	    FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
+	    FFTW<scalar>::fftw_execute_dft(p,in,out);
+	  }
+	}
+
+        Timer.Stop();
+	usec += Timer.useconds();
+	flops+= flops_call*NN;
+
+        int pc = processor_coor[dim];
+        for(int idx=0;idx<sgrid->lSites();idx++) { 
+	  std::vector<int> lcoor(Nd);
+	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
+	  std::vector<int> gcoor = lcoor;
+	  // extract the result
+	  sobj s;
+	  gcoor[dim] = lcoor[dim]+L*pc;
+	  peekLocalSite(s,pgresult,gcoor);
+	  pokeLocalSite(s,result,lcoor);
+	}
+      	  
+	FFTW<scalar>::fftw_destroy_plan(p);
+      }
+#endif
+
+
+    }
+
+  };
+
+
+}
+
+#endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -68,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Simd.h>
 #include <Grid/Threads.h>
 #include <Grid/Lexicographic.h>
+#include <Grid/Init.h>
 #include <Grid/Communicator.h> 
 #include <Grid/Cartesian.h>    
 #include <Grid/Tensors.h>      
@@ -78,7 +79,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/parallelIO/NerscIO.h>
-#include <Grid/Init.h>
+
+#include <Grid/FFT.h>

 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }
+
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
    std::vector<int> cores(0);
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
    GridLogConfigure(logstreams);
  }

-
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -1,6 +1,3 @@
-# additional include paths necessary to compile the C++ library
-AM_CXXFLAGS = -I$(top_srcdir)/include/
-
 extra_sources=
 if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
@@ -20,16 +17,8 @@ endif
 include Make.inc
 include Eigen.inc

-lib_LIBRARIES = libGrid.a
-
-libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
-
-fftwdir = $(prefix)/lib/
-fftw_DATA = libfftw3.a
-
-#
-# Include files
-#
-otherincludedir = $(includedir)/Grid
-nobase_otherinclude_HEADERS =$(HFILES) $(EFILES) fftw3.h Config.h
+lib_LTLIBRARIES = libGrid.la

+libGrid_la_SOURCES             = $(CCFILES) $(extra_sources)
+libGrid_ladir                  = $(pkgincludedir)
+nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -18,10 +18,10 @@
 #include <stddef.h>
 #include <Config.h>

-#ifdef HAVE_GMP_H
+#ifdef HAVE_LIBGMP
 #include "bigfloat.h"
 #else
-#include "algorithms/approx/bigfloat_double.h"
+#include "bigfloat_double.h"
 #endif

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -127,21 +127,12 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);

-    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-			    void *xmit,
-			    int xmit_to_rank,
-			    void *recv,
-			    int recv_from_rank,
-			    int bytes);
-
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
-
-    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

    ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -144,28 +144,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-					       void *xmit,
-					       int dest,
-					       void *recv,
-					       int from,
-					       int bytes)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,dest,_processor,communicator,&rrq);
-  assert(ierr==0);
-  list.push_back(xrq);
-  list.push_back(rrq);
-}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  MPI_Startall(list.size(),&list[0]);
-}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -173,12 +151,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromBegin(reqs);
-  for(int i=0;i<reqs.size();i++){
-    list.push_back(reqs[i]);
-  }
+  MPI_Request xrq;   // request handle for the outgoing message
+  MPI_Request rrq;   // request handle for the incoming message
+  // Tag convention: the send is tagged with _processor and the matching receive with 'from', i.e. each message carries its sender's id as tag.
+  int ierr;
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  
+  assert(ierr==0);   // both calls must have returned 0 (MPI_SUCCESS)
+
+  list.push_back(xrq);   // non-blocking: hand both requests back so the caller can complete them later
+  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -84,19 +84,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
  assert(0);
 }
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0);
-}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  assert(0);
-}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -268,10 +268,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  assert(0); //unimplemented
-}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -284,15 +280,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);
 }
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0); // Unimplemented
-}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
  }

-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<ig->lSites();idx++){
    std::vector<int> lcoor(ni);
    ig->LocalIndexToLocalCoor(idx,lcoor);
@@ -446,6 +446,79 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in

 }

+// InsertSliceLocal: for each local site of lowDim whose local coordinate along 'orthog' is slice_lo, peek that site and poke it into higherDim at the same coordinates with 'orthog' replaced by slice_hi.
+template<class vobj>
+void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;   // scratch scalar site, reused by every loop iteration below
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);   // despite the parameter names, both lattices must have the same number of dimensions
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){   // both grids must agree on processor layout and local extent in every dimension
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) {   // only sites lying on the source plane are copied
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;   // same site, moved to the destination plane
+      peekLocalSite(s,lowDim,lcoor);
+      pokeLocalSite(s,higherDim,hcoor);
+    }
+  }
+}
+// ExtractSliceLocal: the reverse copy of InsertSliceLocal. For each local site of lowDim whose local coordinate along 'orthog' is slice_lo,
+// peek the site of higherDim at the same coordinates with 'orthog' replaced by slice_hi and poke it into lowDim.
+template<class vobj>
+void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;   // scratch scalar site, reused by every loop iteration below
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);   // despite the parameter names, both lattices must have the same number of dimensions
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){   // both grids must agree on processor layout and local extent in every dimension
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) {   // only sites lying on the destination plane of lowDim are written
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;   // matching site on the source plane of higherDim
+      peekLocalSite(s,higherDim,hcoor);
+      pokeLocalSite(s,lowDim,lcoor);
+    }
+  }
+}
+
+
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -111,6 +111,8 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
+  template class A<ZWilsonImplF>;		\
+  template class A<ZWilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		

@@ -120,7 +122,9 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction

 #define FermOp5dVecTemplateInstantiate(A) \
  template class A<DomainWallVec5dImplF>;	\
-  template class A<DomainWallVec5dImplD>;	
+  template class A<DomainWallVec5dImplD>;	\
+  template class A<ZDomainWallVec5dImplF>;	\
+  template class A<ZDomainWallVec5dImplD>;	

 #define FermOpTemplateInstantiate(A) \
 FermOp4dVecTemplateInstantiate(A) \
@@ -143,6 +147,7 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
 #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
@@ -185,6 +190,11 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
+
+typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
+typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
+typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
+
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -54,18 +54,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag (Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0); upper[Ls-1]=mass;
-  std::vector<RealD> lower(Ls,-1.0); lower[0]   =mass;
+  std::vector<Coeff_t> diag (Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bs;
-  std::vector<RealD> upper= cs;
-  std::vector<RealD> lower= cs; 
+  std::vector<Coeff_t> diag = bs;
+  std::vector<Coeff_t> upper= cs;
+  std::vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@@ -73,9 +73,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = beo;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = beo;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@@ -88,9 +88,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@@ -104,9 +104,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@@ -129,9 +129,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag(Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0);
-  std::vector<RealD> lower(Ls,-1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@@ -141,9 +141,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag =bs;
-  std::vector<RealD> upper=cs;
-  std::vector<RealD> lower=cs;
+  std::vector<Coeff_t> diag =bs;
+  std::vector<Coeff_t> upper=cs;
+  std::vector<Coeff_t> lower=cs;
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,psi,Din,lower,diag,upper);
@@ -273,11 +273,21 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  SetCoefficientsZolotarev(1.0,zdata,b,c);
+  std::vector<Coeff_t> gamma(this->Ls);
+  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
+  SetCoefficientsInternal(1.0,gamma,b,c);
 }
 //Zolo
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  std::vector<Coeff_t> gamma(this->Ls);
+  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
+  SetCoefficientsInternal(zolo_hi,gamma,b,c);
+}
+//Zolo
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;

@@ -315,7 +325,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
  double bmc = b-c;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
-    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
@@ -377,7 +387,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
  }
 	
  { 
-    double delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -62,16 +62,16 @@ namespace Grid {
      void M5D(const FermionField &psi,
 	       const FermionField &phi, 
 	       FermionField &chi,
-	       std::vector<RealD> &lower,
-	       std::vector<RealD> &diag,
-	       std::vector<RealD> &upper);
+	       std::vector<Coeff_t> &lower,
+	       std::vector<Coeff_t> &diag,
+	       std::vector<Coeff_t> &upper);

      void M5Ddag(const FermionField &psi,
 		  const FermionField &phi, 
 		  FermionField &chi,
-		  std::vector<RealD> &lower,
-		  std::vector<RealD> &diag,
-		  std::vector<RealD> &upper);
+		  std::vector<Coeff_t> &lower,
+		  std::vector<Coeff_t> &diag,
+		  std::vector<Coeff_t> &upper);
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);

      virtual void   Instantiatable(void)=0;
@@ -91,23 +91,23 @@ namespace Grid {
      RealD mass;

      // Cayley form Moebius (tanh and zolotarev)
-      std::vector<RealD> omega; 
-      std::vector<RealD> bs;    // S dependent coeffs
-      std::vector<RealD> cs;    
-      std::vector<RealD> as;    
+      std::vector<Coeff_t> omega; 
+      std::vector<Coeff_t> bs;    // S dependent coeffs
+      std::vector<Coeff_t> cs;    
+      std::vector<Coeff_t> as;    
      // For preconditioning Cayley form
-      std::vector<RealD> bee;    
-      std::vector<RealD> cee;    
-      std::vector<RealD> aee;    
-      std::vector<RealD> beo;    
-      std::vector<RealD> ceo;    
-      std::vector<RealD> aeo;    
+      std::vector<Coeff_t> bee;    
+      std::vector<Coeff_t> cee;    
+      std::vector<Coeff_t> aee;    
+      std::vector<Coeff_t> beo;    
+      std::vector<Coeff_t> ceo;    
+      std::vector<Coeff_t> aeo;    
      // LDU factorisation of the eeoo matrix
-      std::vector<RealD> lee;    
-      std::vector<RealD> leem;    
-      std::vector<RealD> uee;    
-      std::vector<RealD> ueem;    
-      std::vector<RealD> dee;    
+      std::vector<Coeff_t> lee;    
+      std::vector<Coeff_t> leem;    
+      std::vector<Coeff_t> uee;    
+      std::vector<Coeff_t> ueem;    
+      std::vector<Coeff_t> dee;    

      // Constructors
      CayleyFermion5D(GaugeField &_Umu,
@@ -117,20 +117,19 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

-
-
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
+      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
    };

  }
 }
 #define INSTANTIATE_DPERP(A)\
 template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
 template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);

--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
@@ -82,9 +82,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
@@ -204,6 +204,8 @@ PARALLEL_FOR_LOOP
  INSTANTIATE_DPERP(WilsonImplD);
  INSTANTIATE_DPERP(GparityWilsonImplF);
  INSTANTIATE_DPERP(GparityWilsonImplD);
+  INSTANTIATE_DPERP(ZWilsonImplF);
+  INSTANTIATE_DPERP(ZWilsonImplD);
 #endif

 }}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
@@ -65,9 +65,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -53,9 +53,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
@@ -121,9 +121,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
@@ -194,8 +194,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField

  chi.checkerboard=psi.checkerboard;
  
-  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
  
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
@@ -212,8 +212,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  
-  Eigen::MatrixXd PplusMat ;
-  Eigen::MatrixXd PminusMat;
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
  
  if ( inv ) {
    PplusMat =Pplus.inverse();
@@ -298,8 +298,12 @@ PARALLEL_FOR_LOOP

 INSTANTIATE_DPERP(DomainWallVec5dImplD);
 INSTANTIATE_DPERP(DomainWallVec5dImplF);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplF);

 template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);

 }}
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -103,7 +103,7 @@ namespace Grid {
    typedef typename Impl::StencilImpl             StencilImpl;		\
    typedef typename Impl::ImplParams ImplParams;			\
    typedef typename Impl::Coeff_t       Coeff_t;
-    
+
 #define INHERIT_IMPL_TYPES(Base) \
    INHERIT_GIMPL_TYPES(Base)	 \
    INHERIT_FIMPL_TYPES(Base)
@@ -122,9 +122,9 @@ namespace Grid {
      constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}

      const bool LsVectorised=false;
-
      typedef _Coeff_t Coeff_t;
-      
+
+
      INHERIT_GIMPL_TYPES(Gimpl);
      
      template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
@@ -211,10 +211,9 @@ namespace Grid {
      
      static const int Dimension = Nrepresentation;
      const bool LsVectorised=true;
-      
      typedef _Coeff_t Coeff_t;      
      typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
-      
+
      INHERIT_GIMPL_TYPES(Gimpl);
      
      template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
@@ -312,7 +311,7 @@ namespace Grid {
      static const int Dimension = Nrepresentation;

      const bool LsVectorised=false;
-      
+
      typedef _Coeff_t Coeff_t;
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
      
@@ -515,6 +514,7 @@ namespace Grid {
    typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float
    typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double

+
    typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
    typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
    typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -38,90 +38,6 @@ int WilsonKernelsStatic::AsmOpt;
 template <class Impl>
 WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};

-/*
-template <class Impl>
-typename std::enable_if<Impl::Dimension == 3>::type WilsonKernels<Impl>::DiracOptDhopSite(
-    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
-    int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
-#ifdef AVX512
-  if (AsmOpt) {
-    WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, in,
-                                             out);
-
-  } else {
-#else
-  {
-#endif
-    for (int site = 0; site < Ns; site++) {
-      for (int s = 0; s < Ls; s++) {
-        if (HandOpt)
-          WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, in,
-                                                    out);
-        else
-          WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU,
-                                                       in, out);
-        sF++;
-      }
-      sU++;
-    }
-  }
-}
-
-template <class Impl>
-typename std::enable_if<Impl::Dimension != 3>::type WilsonKernels<Impl>::DiracOptDhopSite(
-    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
-    int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
-  for (int site = 0; site < Ns; site++) {
-    for (int s = 0; s < Ls; s++) {
-      WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in,
-                                                   out);
-      sF++;
-    }
-    sU++;
-  }
-}
-
-
-template<class Impl> 
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-             std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-             int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out,
-             typename std::enable_if<Impl::Dimension == 3, int>::type = 0)
-{
-  // No asm implementation yet.
-  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-  //  else
-  for(int site=0;site<Ns;site++) {
-    for(int s=0;s<Ls;s++) {
-      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-      sF++;
-    }
-    sU++;
-  }
-}
-
-
-
-
-template <class Impl>
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(
-    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF,
-    int sU, int Ls, int Ns, const FermionField &in, FermionField &out,
-    typename std::enable_if<Impl::Dimension != 3, int>::type = 0) {
-  for (int site = 0; site < Ns; site++) {
-    for (int s = 0; s < Ls; s++) {
-      WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
-                                                      in, out);
-      sF++;
-    }
-    sU++;
-  }
-}
-*/
 ////////////////////////////////////////////
 // Generic implementation; move to different file?
 ////////////////////////////////////////////
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -33,26 +33,27 @@ directory

 namespace Grid {

-namespace QCD {
+  namespace QCD {

-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Helper routines that implement Wilson stencil for a single site.
-// Common to both the WilsonFermion and WilsonFermion5D
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class WilsonKernelsStatic {
- public:
-  // S-direction is INNERMOST and takes no part in the parity.
-  static int AsmOpt;   // these are a temporary hack
-  static int HandOpt;  // these are a temporary hack
-};
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Helper routines that implement Wilson stencil for a single site.
+    // Common to both the WilsonFermion and WilsonFermion5D
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    class WilsonKernelsStatic { 
+    public:
+      // S-direction is INNERMOST and takes no part in the parity.
+      static int AsmOpt;  // these are a temporary hack
+      static int HandOpt; // these are a temporary hack
+    };

-template <class Impl>
-class WilsonKernels : public FermionOperator<Impl>, public WilsonKernelsStatic {
- public:
-  INHERIT_IMPL_TYPES(Impl);
-  typedef FermionOperator<Impl> Base;
+    template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
+    public:
+
+     INHERIT_IMPL_TYPES(Impl);
+     typedef FermionOperator<Impl> Base;
+     
+    public:

- public:
  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
  DiracOptDhopSite(
@@ -102,35 +103,45 @@ class WilsonKernels : public FermionOperator<Impl>, public WilsonKernelsStatic {
  }

  template <bool EnableBool = true>
-  typename std::enable_if<Impl::Dimension == 3 && Nc== 3 && EnableBool, void>::type
+  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,
+                          void>::type
  DiracOptDhopSiteDag(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
-    // No asm implementation yet.
-    //  if ( AsmOpt )
-    //  WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-    //  else
-    for (int site = 0; site < Ns; site++) {
-      for (int s = 0; s < Ls; s++) {
-        if (HandOpt)
-          WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
-                                                       in, out);
-        else
-          WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
-                                                          sU, in, out);
-        sF++;
+      int sF, int sU, int Ls, int Ns, const FermionField &in,
+      FermionField &out) {
+#ifdef AVX512
+    if (AsmOpt) {
+      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls,
+                                                  Ns, in, out);
+    } else {
+#else
+    {
+#endif
+      for (int site = 0; site < Ns; site++) {
+        for (int s = 0; s < Ls; s++) {
+          if (HandOpt)
+            WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU,
+                                                         in, out);
+          else
+            WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF,
+                                                            sU, in, out);
+          sF++;
+        }
+        sU++;
      }
-      sU++;
    }
  }

  template <bool EnableBool = true>
-    typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
-    DiracOptDhopSiteDag(
-			StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
-			std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
-			int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+  typename std::enable_if<
+      (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,
+      void>::type
+  DiracOptDhopSiteDag(
+      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+      int sF, int sU, int Ls, int Ns, const FermionField &in,
+      FermionField &out) {
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
        WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU,
@@ -140,7 +151,7 @@ class WilsonKernels : public FermionOperator<Impl>, public WilsonKernelsStatic {
      sU++;
    }
  }
-  
+
  void DiracOptDhopDir(
      StencilImpl &st, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
@@ -165,6 +176,12 @@ class WilsonKernels : public FermionOperator<Impl>, public WilsonKernelsStatic {
      int sF, int sU, int Ls, int Ns, const FermionField &in,
      FermionField &out);

+  void DiracOptAsmDhopSiteDag(
+      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
+      int sF, int sU, int Ls, int Ns, const FermionField &in,
+      FermionField &out);
+
  void DiracOptHandDhopSite(
      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf,
@@ -177,7 +194,9 @@ class WilsonKernels : public FermionOperator<Impl>, public WilsonKernelsStatic {

 public:
  WilsonKernels(const ImplParams &p = ImplParams());
-};
-}
+  };
+
+
+  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -73,12 +73,21 @@ static int signInit = setupSigns();
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
+
+#undef KERNEL_DAG
 template<>
 void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>

+#define KERNEL_DAG
+template<>
+void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@@ -89,14 +98,25 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+
+#undef KERNEL_DAG
 template<>
 void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>

+#define KERNEL_DAG
+template<>
+void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
 #endif

+
+
 template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -30,7 +30,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);
+#ifdef KERNEL_DAG
+    XP_PROJMEM(base);
+#else 
    XM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@@ -41,15 +45,22 @@
    MULT_2SPIN_DIR_PFXP(Xp,basep);
  }
  LOAD64(%r10,isigns);
+#ifdef KERNEL_DAG
+  XP_RECON;
+#else
  XM_RECON;
-
+#endif
  ////////////////////////////////
  // Yp
  ////////////////////////////////
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    YP_PROJMEM(base);
+#else
    YM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@@ -60,7 +71,11 @@
    MULT_2SPIN_DIR_PFYP(Yp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  YP_RECON_ACCUM;
+#else
  YM_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Zp
@@ -68,7 +83,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    ZP_PROJMEM(base);
+#else
    ZM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@@ -79,7 +98,11 @@
    MULT_2SPIN_DIR_PFZP(Zp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  ZP_RECON_ACCUM;
+#else
  ZM_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Tp
@@ -87,7 +110,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    TP_PROJMEM(base);
+#else
    TM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
@@ -98,7 +125,11 @@
    MULT_2SPIN_DIR_PFTP(Tp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  TP_RECON_ACCUM;
+#else
  TM_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Xm
@@ -107,7 +138,11 @@
  //  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    XM_PROJMEM(base);
+#else
    XP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@@ -118,7 +153,11 @@
    MULT_2SPIN_DIR_PFXM(Xm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  XM_RECON_ACCUM;
+#else
  XP_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Ym
@@ -126,7 +165,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    YM_PROJMEM(base);
+#else
    YP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@@ -137,7 +180,11 @@
    MULT_2SPIN_DIR_PFYM(Ym,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  YM_RECON_ACCUM;
+#else
  YP_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Zm
@@ -145,7 +192,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    ZM_PROJMEM(base);
+#else
    ZP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@@ -156,7 +207,11 @@
    MULT_2SPIN_DIR_PFZM(Zm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  ZM_RECON_ACCUM;
+#else
  ZP_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Tm
@@ -164,7 +219,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    TM_PROJMEM(base);
+#else
    TP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
@@ -175,7 +234,11 @@
    MULT_2SPIN_DIR_PFTM(Tm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  TM_RECON_ACCUM;
+#else
  TP_RECON_ACCUM;
+#endif

  basep= st.GetPFInfo(nent,plocal); nent++;
  SAVE_RESULT(base,basep);
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -839,46 +839,23 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //

-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								  int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+							       int ss,int sU,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
 								  int ss,int sU,const FermionField &in, FermionField &out);

-
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-
-
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-
+INSTANTIATE_THEM(WilsonImplF);
+INSTANTIATE_THEM(WilsonImplD);
+INSTANTIATE_THEM(ZWilsonImplF);
+INSTANTIATE_THEM(ZWilsonImplD);
+INSTANTIATE_THEM(GparityWilsonImplF);
+INSTANTIATE_THEM(GparityWilsonImplD);
+INSTANTIATE_THEM(DomainWallVec5dImplF);
+INSTANTIATE_THEM(DomainWallVec5dImplD);
+INSTANTIATE_THEM(ZDomainWallVec5dImplF);
+INSTANTIATE_THEM(ZDomainWallVec5dImplD);

 }}
--- a/lib/qcd/action/fermion/ZMobiusFermion.h
+++ b/lib/qcd/action/fermion/ZMobiusFermion.h
@@ -0,0 +1,79 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/ZMobiusFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_ZMOBIUS_FERMION_H
+#define  GRID_QCD_ZMOBIUS_FERMION_H
+
+#include <Grid/Grid.h>
+
+namespace Grid {
+
+  namespace QCD {
+    // CayleyFermion5D subclass whose coefficients are set from a caller-supplied complex gamma vector.
+    template<class Impl>
+    class ZMobiusFermion : public CayleyFermion5D<Impl>
+    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+    public:
+      // Empty definition of the Instantiatable hook (presumably pure virtual in a base class -- confirm).
+      virtual void   Instantiatable(void) {};
+      // Constructor: forwards _Umu, the four grids, _mass, _M5 and p to CayleyFermion5D, then sets coefficients from gamma, b and c.
+      ZMobiusFermion(GaugeField &_Umu,
+		     GridCartesian         &FiveDimGrid,
+		     GridRedBlackCartesian &FiveDimRedBlackGrid,
+		     GridCartesian         &FourDimGrid,
+		     GridRedBlackCartesian &FourDimRedBlackGrid,
+		     RealD _mass,RealD _M5,
+		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
+      
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)
+
+      {
+	// Only gamma[0..Ls-1] are read and no size check is made: the caller must supply at least Ls entries.
+	
+	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+	std::vector<Coeff_t> zgamma(this->Ls);
+	for(int s=0;s<this->Ls;s++){
+	  zgamma[s] = gamma[s];   // element-wise copy from ComplexD into the Impl's Coeff_t
+	}
+
+	// Call base setter; the first argument is fixed at 1.0
+	this->SetCoefficientsInternal(1.0,zgamma,b,c);
+      }
+
+    };
+
+  }
+}
+
+#endif
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -2,7 +2,7 @@

    Grid physics library, www.github.com/paboyle/Grid 

-    Source file: ./lib/simd/Grid_empty.h
+    Source file: ./lib/simd/Grid_generic.h

    Copyright (C) 2015

@@ -26,14 +26,6 @@ Author: neo <cossu@post.kek.jp>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_sse4.h
-  @brief Empty Optimization libraries for debugging
-
-  Using intrinsics
-*/
-// Time-stamp: <2015-06-09 14:28:02 neo>
-//----------------------------------------------------------------------

 namespace Grid {
 namespace Optimization {
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -38,8 +38,8 @@ directory
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES

-#ifdef EMPTY_SIMD
-#include "Grid_empty.h"
+#ifdef GENERIC_VEC
+#include "Grid_generic.h"
 #endif
 #ifdef SSE4
 #include "Grid_sse4.h"
@@ -388,6 +388,12 @@ class Grid_simd {

 };  // end of Grid_simd class definition

+// permute overloads for the plain scalar types: y is set to b and the perm argument is ignored.
+inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
+inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
+inline void permute(RealD &y,RealD b, int perm) {  y=b; }
+inline void permute(RealF &y,RealF b, int perm) {  y=b; }
+
 ////////////////////////////////////////////////////////////////////
 // General rotate
 ////////////////////////////////////////////////////////////////////
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,15 +67,13 @@ template <class scalar>
 struct AsinRealFunctor {
  scalar operator()(const scalar &a) const { return asin(real(a)); }
 };
-
 template <class scalar>
 struct LogRealFunctor {
  scalar operator()(const scalar &a) const { return log(real(a)); }
 };
-
 template <class scalar>
-struct ExpRealFunctor {
-  scalar operator()(const scalar &a) const { return exp(real(a)); }
+struct ExpFunctor {
+  scalar operator()(const scalar &a) const { return exp(a); }
 };
 template <class scalar>
 struct NotFunctor {
@@ -85,7 +83,6 @@ template <class scalar>
 struct AbsRealFunctor {
  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
 };
-
 template <class scalar>
 struct PowRealFunctor {
  double y;
@@ -135,7 +132,6 @@ template <class Scalar>
 inline Scalar rsqrt(const Scalar &r) {
  return (RSqrtRealFunctor<Scalar>(), r);
 }
-
 template <class S, class V>
 inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
  return SimdApply(CosRealFunctor<S>(), r);
@@ -162,7 +158,7 @@ inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
 }
 template <class S, class V>
 inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
-  return SimdApply(ExpRealFunctor<S>(), r);
+  return SimdApply(ExpFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
--- a/m4/ax_compiler_vendor.m4
+++ b/m4/ax_compiler_vendor.m4
@@ -0,0 +1,87 @@
+# ===========================================================================
+#    http://www.gnu.org/software/autoconf-archive/ax_compiler_vendor.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_COMPILER_VENDOR
+#
+# DESCRIPTION
+#
+#   Determine the vendor of the C/C++ compiler, e.g., gnu, intel, ibm, sun,
+#   hp, borland, comeau, dec, cray, kai, lcc, metrowerks, sgi, microsoft,
+#   watcom, etc. The vendor is returned in the cache variable
+#   $ax_cv_c_compiler_vendor for C and $ax_cv_cxx_compiler_vendor for C++.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
+#   Copyright (c) 2008 Matteo Frigo
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 15
+
+AC_DEFUN([AX_COMPILER_VENDOR],
+[AC_CACHE_CHECK([for _AC_LANG compiler vendor], ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor,
+  dnl When a vendor is added here please also add version support to ax_compiler_version.m4
+  [# Order matters: gnu is deliberately not tested first because some other compilers also define __GNUC__
+  vendors="intel:     __ICC,__ECC,__INTEL_COMPILER
+           ibm:       __xlc__,__xlC__,__IBMC__,__IBMCPP__
+           pathscale: __PATHCC__,__PATHSCALE__
+           clang:     __clang__
+           cray:      _CRAYC
+           fujitsu:   __FUJITSU
+           gnu:       __GNUC__
+           sun:       __SUNPRO_C,__SUNPRO_CC
+           hp:        __HP_cc,__HP_aCC
+           dec:       __DECC,__DECCXX,__DECC_VER,__DECCXX_VER
+           borland:   __BORLANDC__,__CODEGEARC__,__TURBOC__
+           comeau:    __COMO__
+           kai:       __KCC
+           lcc:       __LCC__
+           sgi:       __sgi,sgi
+           microsoft: _MSC_VER
+           metrowerks: __MWERKS__
+           watcom:    __WATCOMC__
+           portland:  __PGI
+	   tcc:       __TINYC__
+           unknown:   UNKNOWN"
+  for ventest in $vendors; do
+    case $ventest in
+      *:) vendor=$ventest; continue ;; # a word ending in a colon names the vendor for the macro list that follows
+      *)  vencpp="defined("`echo $ventest | sed 's/,/) || defined(/g'`")" ;;
+    esac
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[
+      #if !($vencpp)
+        thisisanerror;
+      #endif
+    ])], [break])
+  done
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor=`echo $vendor | cut -d: -f1`
+ ])
+])
--- a/m4/ax_compiler_version.m4
+++ b/m4/ax_compiler_version.m4
@@ -0,0 +1,492 @@
+# ===========================================================================
+#    http://www.gnu.org/software/autoconf-archive/ax_compiler_version.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_COMPILER_VERSION
+#
+# DESCRIPTION
+#
+#   This macro retrieves the compiler version and returns it in the cache
+#   variable $ax_cv_c_compiler_version for C and $ax_cv_cxx_compiler_version
+#   for C++.
+#
+#   Version is returned as epoch:major.minor.patchversion
+#
+#   Epoch is used in order to have an increasing version number in case of
+#   marketing change.
+#
+#   Epoch use: the Borland compilers use, in chronological order,
+#   0turboc for the Turbo C era,
+#
+#     1borlanc for Borland C++ before 5.5, 2cppbuilder for the C++ Builder era,
+#     3borlancpp for the return of Borland C++ (version 5.5 and later),
+#     4cppbuilder for C++ Builder releases named by year,
+#     and 5xe for the XE era.
+#
+#   An empty string is returned otherwise.
+#
+# LICENSE
+#
+#   Copyright (c) 2014 Bastien ROUCARIES <roucaries.bastien+autoconf@gmail.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 4
+
+# Intel: decode __INTEL_COMPILER as major = v/100, minor = (v%100)/10, patch = v%10.
+AC_DEFUN([_AX_COMPILER_VERSION_INTEL],
+  [ dnl each component is required: a failed computation is reported through AC_MSG_FAILURE
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    [__INTEL_COMPILER/100],,
+    AC_MSG_FAILURE([[[$0]] unknown intel compiler version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    [(__INTEL_COMPILER%100)/10],,
+    AC_MSG_FAILURE([[[$0]] unknown intel compiler version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    [(__INTEL_COMPILER%10)],,
+    AC_MSG_FAILURE([[[$0]] unknown intel compiler version]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# IBM: choose how to decode the version depending on whether __COMPILER_VER__ is defined.
+AC_DEFUN([_AX_COMPILER_VERSION_IBM],
+  [ dnl
+  dnl The probe below compiles only when __COMPILER_VER__ is undefined; the original note ties this to z/OS C/C++ versus XL C/C++
+  AC_COMPILE_IFELSE([
+    AC_LANG_PROGRAM([],
+      [
+        #if defined(__COMPILER_VER__)
+        choke me;
+        #endif
+      ])],
+    [dnl probe compiled: major and minor come from __xlC__ while patch and build come from __xlC_ver__
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+        [__xlC__/100],,
+      	AC_MSG_FAILURE([[[$0]] unknown IBM compiler major version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+        [__xlC__%100],,
+      	AC_MSG_FAILURE([[[$0]] unknown IBM compiler minor version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+        [__xlC_ver__/0x100],,
+      	AC_MSG_FAILURE([[[$0]] unknown IBM compiler patch version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_build,
+        [__xlC_ver__%0x100],,
+      	AC_MSG_FAILURE([[[$0]] unknown IBM compiler build version]))
+      ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_build"
+    ],
+    [dnl probe failed to compile: three decimal fields are taken from __xlC__ alone
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+        [__xlC__%1000],,
+      	AC_MSG_FAILURE([[[$0]] unknown IBM compiler patch version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+        [(__xlC__/10000)%10],,
+      	AC_MSG_FAILURE([[[$0]] unknown IBM compiler minor version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+        [(__xlC__/100000)%10],,
+      	AC_MSG_FAILURE([[[$0]] unknown IBM compiler major version]))
+      ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+    ])
+])
+
+# PathScale: version is __PATHCC__.__PATHCC_MINOR__.__PATHCC_PATCHLEVEL__; a failure on any part goes to AC_MSG_FAILURE.
+AC_DEFUN([_AX_COMPILER_VERSION_PATHSCALE],[
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    __PATHCC__,,
+    AC_MSG_FAILURE([[[$0]] unknown pathscale major]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    __PATHCC_MINOR__,,
+    AC_MSG_FAILURE([[[$0]] unknown pathscale minor]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    [__PATHCC_PATCHLEVEL__],,
+    AC_MSG_FAILURE([[[$0]] unknown pathscale patch level]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# Clang: version is __clang_major__.__clang_minor__.__clang_patchlevel__; major and minor are required, patch falls back to 0.
+AC_DEFUN([_AX_COMPILER_VERSION_CLANG],[
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    __clang_major__,,
+    AC_MSG_FAILURE([[[$0]] unknown clang major]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    __clang_minor__,,
+    AC_MSG_FAILURE([[[$0]] unknown clang minor]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    [__clang_patchlevel__],,_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch=0)
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# Cray C: version is _RELEASE._RELEASE_MINOR; no patch component is computed.
+AC_DEFUN([_AX_COMPILER_VERSION_CRAY],[
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    _RELEASE,,
+    AC_MSG_FAILURE([[[$0]] unknown crayc release]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    _RELEASE_MINOR,,
+    AC_MSG_FAILURE([[[$0]] unknown crayc minor]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor"
+  ])
+
+# Fujitsu: the cached version is the raw integer value of __FCC_VERSION; it is not split into components.
+AC_DEFUN([_AX_COMPILER_VERSION_FUJITSU],[
+  AC_COMPUTE_INT(ax_cv_[]_AC_LANG_ABBREV[]_compiler_version,
+                 __FCC_VERSION,,
+		 AC_MSG_FAILURE([[[$0]]unknown fujitsu release]))
+  ])
+
+# GNU: version is __GNUC__.__GNUC_MINOR__.__GNUC_PATCHLEVEL__; a failure on any part goes to AC_MSG_FAILURE.
+AC_DEFUN([_AX_COMPILER_VERSION_GNU],[
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    __GNUC__,,
+    AC_MSG_FAILURE([[[$0]] unknown gcc major]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    __GNUC_MINOR__,,
+    AC_MSG_FAILURE([[[$0]] unknown gcc minor]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    [__GNUC_PATCHLEVEL__],,
+    AC_MSG_FAILURE([[[$0]] unknown gcc patch level]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# Sun: decode __SUNPRO_CC, or __SUNPRO_C when that is not defined; values below 0x1000 use a different field layout.
+AC_DEFUN([_AX_COMPILER_VERSION_SUN],[
+  m4_define([_AX_COMPILER_VERSION_SUN_NUMBER],
+            [
+	     #if defined(__SUNPRO_CC)
+	     __SUNPRO_CC
+	     #else
+	     __SUNPRO_C
+	     #endif
+	    ])
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_until59,
+    !!(_AX_COMPILER_VERSION_SUN_NUMBER < 0x1000),,
+    AC_MSG_FAILURE([[[$0]] unknown sun release version]))
+  AS_IF([test "X$_ax_[]_AC_LANG_ABBREV[]_compiler_version_until59" = X1],
+    [dnl below 0x1000: patch = N % 0x10 then minor = N / 0x10 % 0x10 then major = N / 0x100
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+        _AX_COMPILER_VERSION_SUN_NUMBER % 0x10,,
+	AC_MSG_FAILURE([[[$0]] unknown sun patch version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+        (_AX_COMPILER_VERSION_SUN_NUMBER / 0x10) % 0x10,,
+        AC_MSG_FAILURE([[[$0]] unknown sun minor version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+        (_AX_COMPILER_VERSION_SUN_NUMBER / 0x100),,
+        AC_MSG_FAILURE([[[$0]] unknown sun major version]))
+    ],
+    [dnl 0x1000 and above: minor reads N / 0x100 % 0x100 and major reads N / 0x1000 so they share bits -- NOTE review: confirm the minor divisor
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+        _AX_COMPILER_VERSION_SUN_NUMBER % 0x10,,
+        AC_MSG_FAILURE([[[$0]] unknown sun patch version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+        (_AX_COMPILER_VERSION_SUN_NUMBER / 0x100) % 0x100,,
+        AC_MSG_FAILURE([[[$0]] unknown sun minor version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+        (_AX_COMPILER_VERSION_SUN_NUMBER / 0x1000),,
+        AC_MSG_FAILURE([[[$0]] unknown sun major version]))
+    ])
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+])
+
+AC_DEFUN([_AX_COMPILER_VERSION_HP],[dnl HP: decode __HP_cc or else __HP_aCC when the former is not defined
+  m4_define([_AX_COMPILER_VERSION_HP_NUMBER],
+            [
+	     #if defined(__HP_cc)
+	     __HP_cc
+	     #else
+	     __HP_aCC
+	     #endif
+	    ])
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_untilA0121,
+    !!(_AX_COMPILER_VERSION_HP_NUMBER <= 1),,
+    AC_MSG_FAILURE([[[$0]] unknown hp release version]))
+  AS_IF([test "X$_ax_[]_AC_LANG_ABBREV[]_compiler_version_untilA0121" = X1],
+    [dnl A macro value of 1 or less is not decoded: the fixed string 01.21.00 is reported instead.
+     dnl The original note describes that as the last version with this behavior and as very old.
+      ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="01.21.00"
+    ],
+    [dnl otherwise the decimal value splits as major = N/10000 % 100 then minor = N/100 % 100 then patch = N % 100
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+        (_AX_COMPILER_VERSION_HP_NUMBER % 100),,
+        AC_MSG_FAILURE([[[$0]] unknown hp release version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+        ((_AX_COMPILER_VERSION_HP_NUMBER / 100)%100),,
+        AC_MSG_FAILURE([[[$0]] unknown hp minor version]))
+      AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+        ((_AX_COMPILER_VERSION_HP_NUMBER / 10000)%100),,
+        AC_MSG_FAILURE([[[$0]] unknown hp major version]))
+      ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+    ])
+])
+
+AC_DEFUN([_AX_COMPILER_VERSION_DEC],[dnl DEC: decode __DECC_VER or else __DECCXX_VER as major = N/10000000 % 100 then minor = N/100000 % 100 then patch = N % 10000
+  m4_define([_AX_COMPILER_VERSION_DEC_NUMBER],
+            [
+	     #if defined(__DECC_VER)
+	     __DECC_VER
+	     #else
+	     __DECCXX_VER
+	     #endif
+	    ])
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    (_AX_COMPILER_VERSION_DEC_NUMBER % 10000),,
+    AC_MSG_FAILURE([[[$0]] unknown dec release version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    ((_AX_COMPILER_VERSION_DEC_NUMBER / 100000UL)%100),,
+    AC_MSG_FAILURE([[[$0]] unknown dec minor version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    ((_AX_COMPILER_VERSION_DEC_NUMBER / 10000000UL)%100),,
+    AC_MSG_FAILURE([[[$0]] unknown dec major version]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# Borland: Turbo C is decoded from __TURBOC__; otherwise __BORLANDC__ or __CODEGEARC__ is looked up in a table of known values.
+AC_DEFUN([_AX_COMPILER_VERSION_BORLAND],[dnl
+  m4_define([_AX_COMPILER_VERSION_TURBOC_NUMBER],
+            [
+	     #if defined(__TURBOC__)
+	     __TURBOC__
+	     #else
+	     choke me
+	     #endif
+	    ])
+  m4_define([_AX_COMPILER_VERSION_BORLANDC_NUMBER],
+            [
+	     #if defined(__BORLANDC__)
+	     __BORLANDC__
+	     #else
+	     __CODEGEARC__
+	     #endif
+	    ])
+ AC_COMPILE_IFELSE(
+   [AC_LANG_PROGRAM(,
+     _AX_COMPILER_VERSION_TURBOC_NUMBER)],
+   [dnl TURBOC: the probe compiles only when __TURBOC__ is defined
+     AC_COMPUTE_INT(
+       _ax_[]_AC_LANG_ABBREV[]_compiler_version_turboc_raw,
+       _AX_COMPILER_VERSION_TURBOC_NUMBER,,
+       AC_MSG_FAILURE([[[$0]] unknown turboc version]))
+     AS_IF(
+       [test $_ax_[]_AC_LANG_ABBREV[]_compiler_version_turboc_raw -lt 661 || test $_ax_[]_AC_LANG_ABBREV[]_compiler_version_turboc_raw -gt 1023],
+       [dnl raw value outside 661 to 1023: major is the high byte and minor the low byte
+        AC_COMPUTE_INT(
+	  _ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+	  _AX_COMPILER_VERSION_TURBOC_NUMBER % 0x100,,
+	  AC_MSG_FAILURE([[[$0]] unknown turboc minor version]))
+	AC_COMPUTE_INT(
+	  _ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+	  (_AX_COMPILER_VERSION_TURBOC_NUMBER/0x100)%0x100,,
+	  AC_MSG_FAILURE([[[$0]] unknown turboc major version]))
+	ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="0turboc:$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor"],
+      [dnl raw value inside 661 to 1023: only three values are known and anything else yields an empty version
+       AS_CASE([$_ax_[]_AC_LANG_ABBREV[]_compiler_version_turboc_raw],
+         [661],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="0turboc:1.00"],
+	 [662],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="0turboc:1.01"],
+         [663],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="0turboc:2.00"],
+	 [
+	 AC_MSG_WARN([[[$0]] unknown turboc version between 0x295 and 0x400 please report bug])
+	 ax_cv_[]_AC_LANG_ABBREV[]_compiler_version=""
+	 ])
+      ])
+    ],
+    # borlandc
+    [
+    AC_COMPUTE_INT(
+      _ax_[]_AC_LANG_ABBREV[]_compiler_version_borlandc_raw,
+      _AX_COMPILER_VERSION_BORLANDC_NUMBER,,
+      AC_MSG_FAILURE([[[$0]] unknown borlandc version]))
+    AS_CASE([$_ax_[]_AC_LANG_ABBREV[]_compiler_version_borlandc_raw],
+      dnl BORLANC++ before 5.5
+      [512] ,[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="1borlanc:2.00"],
+      [1024],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="1borlanc:3.00"],
+      [1040],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="1borlanc:3.1"],
+      [1106],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="1borlanc:4.0"],
+      [1280],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="1borlanc:5.0"],
+      [1312],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="1borlanc:5.02"],
+      dnl C++ Builder era
+      [1328],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="2cppbuilder:3.0"],
+      [1344],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="2cppbuilder:4.0"],
+      dnl BORLANC++ after 5.5
+      [1360],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="3borlancpp:5.5"],
+      [1361],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="3borlancpp:5.51"],
+      [1378],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="3borlancpp:5.6.4"],
+      dnl C++ Builder with year number
+      [1392],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="4cppbuilder:2006"],
+      [1424],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="4cppbuilder:2007"],
+      [1555],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="4cppbuilder:2009"],
+      [1569],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="4cppbuilder:2010"],
+      dnl XE version
+      [1584],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="5xe"],
+      [1600],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="5xe:2"],
+      [1616],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="5xe:3"],
+      [1632],[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="5xe:4"],
+      [
+      AC_MSG_WARN([[[$0]] Unknow borlanc compiler version $_ax_[]_AC_LANG_ABBREV[]_compiler_version_borlandc_raw please report bug])
+      ax_cv_[]_AC_LANG_ABBREV[]_compiler_version=""
+      ])
+    ])
+  ])
+
+# Comeau (COMO): decode __COMO_VERSION__ as major = v/100 % 10 and minor = v % 100; no patch component is computed.
+AC_DEFUN([_AX_COMPILER_VERSION_COMEAU],
+  [ dnl
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    [__COMO_VERSION__%100],,
+    AC_MSG_FAILURE([[[$0]] unknown comeau compiler minor version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    [(__COMO_VERSION__/100)%10],,
+    AC_MSG_FAILURE([[[$0]] unknown comeau compiler major version]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor"
+  ])
+
+# KAI: decode __KCC_VERSION as major = v/1000 % 10, minor = v/100 % 10, patch = v % 100.
+AC_DEFUN([_AX_COMPILER_VERSION_KAI],[
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    [__KCC_VERSION%100],,
+    AC_MSG_FAILURE([[[$0]] unknown kay compiler patch version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    [(__KCC_VERSION/100)%10],,
+    AC_MSG_FAILURE([[[$0]] unknown kay compiler minor version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    [(__KCC_VERSION/1000)%10],,
+    AC_MSG_FAILURE([[[$0]] unknown kay compiler major version]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+dnl LCC
+dnl LCC does not expose a version, so it has no helper macro and the dispatcher leaves its version empty.
+
+# SGI: decode _COMPILER_VERSION, or _SGI_COMPILER_VERSION when that is not defined, using one decimal digit each for major, minor and patch.
+AC_DEFUN([_AX_COMPILER_VERSION_SGI],[
+   m4_define([_AX_COMPILER_VERSION_SGI_NUMBER],
+            [
+	     #if defined(_COMPILER_VERSION)
+	     _COMPILER_VERSION
+	     #else
+	     _SGI_COMPILER_VERSION
+	     #endif
+	    ])
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    [_AX_COMPILER_VERSION_SGI_NUMBER%10],,
+    AC_MSG_FAILURE([[[$0]] unknown SGI compiler patch version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    [(_AX_COMPILER_VERSION_SGI_NUMBER/10)%10],,
+    AC_MSG_FAILURE([[[$0]] unknown SGI compiler minor version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    [(_AX_COMPILER_VERSION_SGI_NUMBER/100)%10],,
+    AC_MSG_FAILURE([[[$0]] unknown SGI compiler major version]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# Microsoft: major.minor come from _MSC_VER; patch and build default to 0 and are refined per major version from _MSC_FULL_VER and _MSC_BUILD.
+AC_DEFUN([_AX_COMPILER_VERSION_MICROSOFT],[
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    _MSC_VER%100,,
+    AC_MSG_FAILURE([[[$0]] unknown microsoft compiler minor version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    (_MSC_VER/100)%100,,
+    AC_MSG_FAILURE([[[$0]] unknown microsoft compiler major version]))
+  dnl defaults: the version specific branches below may override them
+  _ax_[]_AC_LANG_ABBREV[]_compiler_version_patch=0
+  _ax_[]_AC_LANG_ABBREV[]_compiler_version_build=0
+  # special case for version 6 which reports major 12: patch from _MSC_FULL_VER silently stays 0 on failure
+  AS_IF([test "X$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major" = "X12"],
+    [AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+       _MSC_FULL_VER%1000,,
+       _ax_[]_AC_LANG_ABBREV[]_compiler_version_patch=0)])
+  # for version 7 which reports major 13: patch from _MSC_FULL_VER is mandatory
+  AS_IF([test "X$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major" = "X13"],
+    [AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+       _MSC_FULL_VER%1000,,
+       AC_MSG_FAILURE([[[$0]] unknown microsoft compiler patch version]))
+    ])
+  # for major 14 and above: patch is _MSC_FULL_VER modulo 10000 and from major 15 on _MSC_BUILD supplies the build
+ AS_IF([test $_ax_[]_AC_LANG_ABBREV[]_compiler_version_major -ge 14],
+    [AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+       _MSC_FULL_VER%10000,,
+       AC_MSG_FAILURE([[[$0]] unknown microsoft compiler patch version]))
+    ])
+ AS_IF([test $_ax_[]_AC_LANG_ABBREV[]_compiler_version_major -ge 15],
+    [AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_build,
+       _MSC_BUILD,,
+       AC_MSG_FAILURE([[[$0]] unknown microsoft compiler build version]))
+    ])
+ ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_build"
+ ])
+
+# Metrowerks: decode __MWERKS__ as hex fields: major = v/0x1000 % 0x10, minor = v/0x100 % 0x10, patch = v % 0x100.
+AC_DEFUN([_AX_COMPILER_VERSION_METROWERKS],[dnl
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    __MWERKS__%0x100,,
+    AC_MSG_FAILURE([[[$0]] unknown metrowerks compiler patch version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    (__MWERKS__/0x100)%0x10,,
+    AC_MSG_FAILURE([[[$0]] unknown metrowerks compiler minor version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    (__MWERKS__/0x1000)%0x10,,
+    AC_MSG_FAILURE([[[$0]] unknown metrowerks compiler major version]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# Watcom: decode __WATCOMC__ as major = v/100 % 100 and minor = v % 100; no patch component is computed.
+AC_DEFUN([_AX_COMPILER_VERSION_WATCOM],[dnl
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    __WATCOMC__%100,,
+    AC_MSG_FAILURE([[[$0]] unknown watcom compiler minor version]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    (__WATCOMC__/100)%100,,
+    AC_MSG_FAILURE([[[$0]] unknown watcom compiler major version]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor"
+  ])
+
+# PGI (vendor key portland): version is __PGIC__.__PGIC_MINOR__.__PGIC_PATCHLEVEL__; a failure on any part goes to AC_MSG_FAILURE.
+AC_DEFUN([_AX_COMPILER_VERSION_PORTLAND],[
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_major,
+    __PGIC__,,
+    AC_MSG_FAILURE([[[$0]] unknown pgi major]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor,
+    __PGIC_MINOR__,,
+    AC_MSG_FAILURE([[[$0]] unknown pgi minor]))
+  AC_COMPUTE_INT(_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch,
+    [__PGIC_PATCHLEVEL__],,
+    AC_MSG_FAILURE([[[$0]] unknown pgi patch level]))
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version="$_ax_[]_AC_LANG_ABBREV[]_compiler_version_major.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_minor.$_ax_[]_AC_LANG_ABBREV[]_compiler_version_patch"
+  ])
+
+# tcc: the version is parsed with sed from the output of the literal command "tcc -v". NOTE(review): this does not go through the CC variable - confirm that is intended.
+AC_DEFUN([_AX_COMPILER_VERSION_TCC],[
+  ax_cv_[]_AC_LANG_ABBREV[]_compiler_version=[`tcc -v | $SED 's/^[ ]*tcc[ ]\+version[ ]\+\([0-9.]\+\).*/\1/g'`]
+  ])
+# Main entry point: cache the version string produced by the helper that matches the detected compiler vendor.
+AC_DEFUN([AX_COMPILER_VERSION],[dnl
+  AC_REQUIRE([AX_COMPILER_VENDOR])
+  AC_REQUIRE([AC_PROG_SED])
+  AC_CACHE_CHECK([for _AC_LANG compiler version],
+    ax_cv_[]_AC_LANG_ABBREV[]_compiler_version,
+    [ dnl a vendor with no helper listed below ends in the final default arm and gets an empty version
+      AS_CASE([$ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor],
+        [intel],[_AX_COMPILER_VERSION_INTEL],
+	[ibm],[_AX_COMPILER_VERSION_IBM],
+	[pathscale],[_AX_COMPILER_VERSION_PATHSCALE],
+	[clang],[_AX_COMPILER_VERSION_CLANG],
+	[cray],[_AX_COMPILER_VERSION_CRAY],
+	[fujitsu],[_AX_COMPILER_VERSION_FUJITSU],
+        [gnu],[_AX_COMPILER_VERSION_GNU],
+	[sun],[_AX_COMPILER_VERSION_SUN],
+	[hp],[_AX_COMPILER_VERSION_HP],
+	[dec],[_AX_COMPILER_VERSION_DEC],
+	[borland],[_AX_COMPILER_VERSION_BORLAND],
+	[comeau],[_AX_COMPILER_VERSION_COMEAU],
+	[kai],[_AX_COMPILER_VERSION_KAI],
+	[sgi],[_AX_COMPILER_VERSION_SGI],
+	[microsoft],[_AX_COMPILER_VERSION_MICROSOFT],
+	[metrowerks],[_AX_COMPILER_VERSION_METROWERKS],
+	[watcom],[_AX_COMPILER_VERSION_WATCOM],
+	[portland],[_AX_COMPILER_VERSION_PORTLAND],
+	[tcc],[_AX_COMPILER_VERSION_TCC],
+  	[ax_cv_[]_AC_LANG_ABBREV[]_compiler_version=""])
+    ])
+])
--- a/m4/ax_gcc_option.m4
+++ b/m4/ax_gcc_option.m4
@@ -0,0 +1,34 @@
+AC_DEFUN([AX_GCC_OPTION], [
+AC_REQUIRE([AC_PROG_CC])
+dnl Arguments: 1 is the option under test; 2 holds extra flags placed before it; 3 is an optional test program source.
+AC_MSG_CHECKING([if gcc accepts $1 option])
+dnl Argument 4 runs after a yes result and argument 5 after a no result; neither runs when the GCC variable is not yes.
+AS_IF([ test "x$GCC" = "xyes" ],[
+AS_IF([ test -z "$3" ],[
+ax_gcc_option_test="int main()
+{
+return 0;
+}"
+],[
+ax_gcc_option_test="$3"
+])
+
+# Dump the test program to file
+cat <<EOF > conftest.c
+$ax_gcc_option_test
+EOF
+
+# Dump back the file to the log, useful for debugging purposes
+AC_TRY_COMMAND(cat conftest.c 1>&AS_MESSAGE_LOG_FD)
+
+AS_IF([ AC_TRY_COMMAND($CC $2 $1 -c conftest.c 1>&AS_MESSAGE_LOG_FD) ],[
+AC_MSG_RESULT([yes])
+$4
+],[
+AC_MSG_RESULT([no])
+$5
+])
+],[
+AC_MSG_RESULT([no gcc available])
+])
+])
--- a/m4/ax_gcc_version.m4
+++ b/m4/ax_gcc_version.m4
@@ -0,0 +1,64 @@
+# ===========================================================================
+#      http://www.gnu.org/software/autoconf-archive/ax_gcc_version.html
+# ===========================================================================
+#
+# OBSOLETE MACRO
+#
+#   Use AX_COMPILER_VERSION instead
+#
+# SYNOPSIS
+#
+#   AX_GCC_VERSION
+#
+# DESCRIPTION
+#
+#   This macro retrieves the gcc version and returns it in the GCC_VERSION
+#   variable if available, an empty string otherwise.
+#
+# LICENSE
+#
+#   Copyright (c) 2009 Francesco Salvestrini <salvestrini@users.sourceforge.net>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 10
+
+AC_DEFUN([AX_GCC_VERSION], [dnl Obsolete wrapper: GCC_VERSION is derived from the C language results of the vendor and version checks
+  AC_OBSOLETE([$0], [;please use AX_COMPILER_VERSION instead])
+  AC_LANG_PUSH([C])
+  AC_REQUIRE([AX_COMPILER_VENDOR])
+  AC_REQUIRE([AX_COMPILER_VERSION])
+  AC_LANG_POP([C])
+  GCC_VERSION=""
+  ax_cv_gcc_version=""
+  AS_IF([test "X$ax_cv_c_compiler_vendor" = "Xgnu"],
+    [dnl only a gnu vendor result copies the cached C compiler version; otherwise GCC_VERSION stays empty
+    ax_cv_gcc_version=$ax_cv_c_compiler_version
+    GCC_VERSION=$ax_cv_gcc_version
+    ])
+  AC_SUBST([GCC_VERSION])
+])
--- a/m4/ax_gxx_version.m4
+++ b/m4/ax_gxx_version.m4
@@ -0,0 +1,67 @@
+# ===========================================================================
+#             http://autoconf-archive.cryp.to/ax_gxx_version.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_GXX_VERSION
+#
+# DESCRIPTION
+#
+#   This macro retrieves the g++ version and returns it in the GXX_VERSION
+#   variable if available, an empty string otherwise.
+#
+# LAST MODIFICATION
+#
+#   2008-04-12
+#
+# COPYLEFT
+#
+#   Copyright (c) 2008 Francesco Salvestrini <salvestrini@users.sourceforge.net>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Macro Archive. When you make and
+#   distribute a modified version of the Autoconf Macro, you may extend this
+#   special exception to the GPL to apply to your modified version as well.
+
+AC_DEFUN([AX_GXX_VERSION], [dnl Sets and substitutes GXX_VERSION; it stays empty unless the GXX variable is yes
+  GXX_VERSION=""
+  AX_GCC_OPTION([-dumpversion],[],[],[
+    ax_gcc_version_option=yes
+  ],[
+    ax_gcc_version_option=no
+  ])
+  AS_IF([test "x$GXX" = "xyes"],[
+    AS_IF([test "x$ax_gcc_version_option" != "xno"],[dnl read the flag the probe above sets and skip only on an explicit no
+      AC_CACHE_CHECK([gxx version],[ax_cv_gxx_version],[
+        ax_cv_gxx_version="`$CXX -dumpversion`"
+        AS_IF([test "x$ax_cv_gxx_version" = "x"],[
+          ax_cv_gxx_version=""
+        ])
+      ])
+      GXX_VERSION=$ax_cv_gxx_version
+    ])
+  ])
+  AC_SUBST([GXX_VERSION])
+])
--- a/m4/lx_find_mpi.m4
+++ b/m4/lx_find_mpi.m4
@@ -0,0 +1,203 @@
+#################################################################################################
+# Copyright (c) 2010, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory
+# Written by Todd Gamblin, tgamblin@llnl.gov.
+# LLNL-CODE-417602
+# All rights reserved.
+#
+# This file is part of Libra. For details, see http://github.com/tgamblin/libra.
+# Please also read the LICENSE file for further information.
+#
+# Redistribution and use in source and binary forms, with or without modification, are
+# permitted provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright notice, this list of
+#    conditions and the disclaimer below.
+#  * Redistributions in binary form must reproduce the above copyright notice, this list of
+#    conditions and the disclaimer (as noted below) in the documentation and/or other materials
+#    provided with the distribution.
+#  * Neither the name of the LLNS/LLNL nor the names of its contributors may be used to endorse
+#    or promote products derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
+# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+# LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#################################################################################################
+
+#
+# LX_FIND_MPI()
+#  ------------------------------------------------------------------------
+# This macro finds an MPI compiler and extracts includes and libraries from
+# it for use in automake projects.  The script exports the following variables:
+#
+# AC_DEFINE variables:
+#     HAVE_MPI         AC_DEFINE'd to 1 if we found MPI
+#
+# AC_SUBST variables:
+#     MPICC            Name of MPI compiler
+#     MPI_CFLAGS       Includes and defines for MPI C compilation
+#     MPI_CLDFLAGS     Libraries and library paths for linking MPI C programs
+#
+#     MPICXX           Name of MPI C++ compiler
+#     MPI_CXXFLAGS     Includes and defines for MPI C++ compilation
+#     MPI_CXXLDFLAGS   Libraries and library paths for linking MPI C++ programs
+#
+#     MPIF77           Name of MPI Fortran 77 compiler
+#     MPI_F77FLAGS     Includes and defines for MPI Fortran 77 compilation
+#     MPI_F77LDFLAGS   Libraries and library paths for linking MPI Fortran 77 programs
+#
+#     MPIFC            Name of MPI Fortran compiler
+#     MPI_FFLAGS       Includes and defines for MPI Fortran compilation
+#     MPI_FLDFLAGS     Libraries and library paths for linking MPI Fortran programs
+#
+# Shell variables output by this macro:
+#     have_C_mpi       'yes' if we found MPI for C, 'no' otherwise
+#     have_CXX_mpi     'yes' if we found MPI for C++, 'no' otherwise
+#     have_F77_mpi     'yes' if we found MPI for F77, 'no' otherwise
+#     have_F_mpi       'yes' if we found MPI for Fortran, 'no' otherwise
+#
+AC_DEFUN([LX_FIND_MPI],
+[
+     AC_LANG_CASE(
+     [C], [
+         AC_REQUIRE([AC_PROG_CC])
+         if test -z "$MPICC"; then  # no user override: search the known C wrapper names
+             LX_QUERY_MPI_COMPILER([MPICC], [mpicc mpiicc mpixlc mpipgcc], [C])
+         else
+             LX_QUERY_MPI_COMPILER([MPICC], [$MPICC], [C])
+         fi
+     ],
+     [C++], [
+         AC_REQUIRE([AC_PROG_CXX])
+         if test -z "$MPICXX"; then  # no user override: search the known C++ wrapper names
+             LX_QUERY_MPI_COMPILER([MPICXX], [mpicxx mpiCC mpic++ mpig++ mpiicpc mpipgCC mpixlC], [CXX])
+         else
+             LX_QUERY_MPI_COMPILER([MPICXX], [$MPICXX], [CXX])
+         fi
+     ],
+     [F77], [
+         AC_REQUIRE([AC_PROG_F77])
+         if test -z "$MPIF77"; then  # no user override: search the known Fortran 77 wrapper names
+             LX_QUERY_MPI_COMPILER([MPIF77], [mpif77 mpiifort mpixlf77 mpixlf77_r], [F77])
+         else
+             LX_QUERY_MPI_COMPILER([MPIF77], [$MPIF77], [F77])
+         fi
+     ],
+     [Fortran], [
+         AC_REQUIRE([AC_PROG_FC])
+         if test -z "$MPIFC"; then  # no user override: search the wrapper names grouped by vendor below
+             mpi_default_fc="mpif95 mpif90 mpigfortran mpif2003"
+             mpi_intel_fc="mpiifort"
+             mpi_xl_fc="mpixlf95 mpixlf95_r mpixlf90 mpixlf90_r mpixlf2003 mpixlf2003_r"
+             mpi_pg_fc="mpipgf95 mpipgf90"
+             LX_QUERY_MPI_COMPILER([MPIFC], [$mpi_default_fc $mpi_intel_fc $mpi_xl_fc $mpi_pg_fc], [F])
+         else
+             LX_QUERY_MPI_COMPILER([MPIFC], [$MPIFC], [F])
+         fi
+     ])
+])
+
+
+#
+# LX_QUERY_MPI_COMPILER([compiler-var-name], [compiler-names], [output-var-prefix])
+#  ------------------------------------------------------------------------
+# AC_SUBST variables:
+#     MPI_<prefix>FLAGS       Includes and defines for MPI compilation
+#     MPI_<prefix>LDFLAGS     Libraries and library paths for linking MPI programs
+#
+# Shell variables output by this macro:
+#     have_<prefix>_mpi       'yes' if a test program linked with the extracted flags, 'no' otherwise
+#
+AC_DEFUN([LX_QUERY_MPI_COMPILER],
+[
+     # Try to find a working MPI compiler from the supplied names
+     AC_PATH_PROGS($1, [$2], [not-found])
+
+     # Work out which option makes the wrapper print its compile and link
+     # lines.  Afterwards either lx_mpi_compile_line plus lx_mpi_link_line or
+     # lx_mpi_command_line holds the output of the first option that worked.
+     AC_MSG_CHECKING([whether $$1 responds to '-showme:compile'])
+     lx_mpi_compile_line=`$$1 -showme:compile 2>/dev/null`
+     if [[ "$?" -eq 0 ]]; then
+         AC_MSG_RESULT([yes])
+         lx_mpi_link_line=`$$1 -showme:link 2>/dev/null`
+     else
+         AC_MSG_RESULT([no])
+         AC_MSG_CHECKING([whether $$1 responds to '-showme'])
+         lx_mpi_command_line=`$$1 -showme 2>/dev/null`
+         if [[ "$?" -ne 0 ]]; then
+             AC_MSG_RESULT([no])
+             AC_MSG_CHECKING([whether $$1 responds to '-compile-info'])
+             lx_mpi_compile_line=`$$1 -compile-info 2>/dev/null`
+             if [[ "$?" -eq 0 ]]; then
+                 AC_MSG_RESULT([yes])
+                 lx_mpi_link_line=`$$1 -link-info 2>/dev/null`
+             else
+                 AC_MSG_RESULT([no])
+                 AC_MSG_CHECKING([whether $$1 responds to '-show'])
+                 lx_mpi_command_line=`$$1 -show 2>/dev/null`
+                 if [[ "$?" -eq 0 ]]; then
+                     AC_MSG_RESULT([yes])
+                 else
+                     AC_MSG_RESULT([no])
+                 fi
+             fi
+         else
+             AC_MSG_RESULT([yes])
+         fi
+     fi
+     # Wrappers with separate compile and link queries: merge both outputs into one command line.
+     if [[ ! -z "$lx_mpi_compile_line" -a ! -z "$lx_mpi_link_line" ]]; then
+         lx_mpi_command_line="$lx_mpi_compile_line $lx_mpi_link_line"
+     fi
+
+     if [[ ! -z "$lx_mpi_command_line" ]]; then
+         # Now extract the different parts of the MPI command line.  Do these separately in case we need to
+         # parse them all out in future versions of this macro.
+         lx_mpi_defines=`    echo "$lx_mpi_command_line" | grep -o -- '\(^\| \)-D\([[^\"[:space:]]]\+\|\"[[^\"[:space:]]]\+\"\)'`
+         lx_mpi_includes=`   echo "$lx_mpi_command_line" | grep -o -- '\(^\| \)-I\([[^\"[:space:]]]\+\|\"[[^\"[:space:]]]\+\"\)'`
+         lx_mpi_link_paths=` echo "$lx_mpi_command_line" | grep -o -- '\(^\| \)-L\([[^\"[:space:]]]\+\|\"[[^\"[:space:]]]\+\"\)'`
+         lx_mpi_libs=`       echo "$lx_mpi_command_line" | grep -o -- '\(^\| \)-l\([[^\"[:space:]]]\+\|\"[[^\"[:space:]]]\+\"\)'`
+         lx_mpi_link_args=`  echo "$lx_mpi_command_line" | grep -o -- '\(^\| \)-Wl,\([[^\"[:space:]]]\+\|\"[[^\"[:space:]]]\+\"\)'`
+         # Create variables and clean up newlines and multiple spaces.  The sed expressions
+         # stay within POSIX basic regex syntax: a space class instead of \t and a star instead of +.
+         MPI_$3FLAGS="$lx_mpi_defines $lx_mpi_includes"
+         MPI_$3LDFLAGS="$lx_mpi_link_paths $lx_mpi_libs $lx_mpi_link_args"
+         MPI_$3FLAGS=`  echo "$MPI_$3FLAGS"   | tr '\n' ' ' | sed 's/^[[[:space:]]]*//;s/[[[:space:]]]*$//' | sed 's/  */ /g'`
+         MPI_$3LDFLAGS=`echo "$MPI_$3LDFLAGS" | tr '\n' ' ' | sed 's/^[[[:space:]]]*//;s/[[[:space:]]]*$//' | sed 's/  */ /g'`
+         # Link a small MPI program using only the extracted flags; the saved values are restored below.
+         OLD_CPPFLAGS=$CPPFLAGS
+         OLD_LIBS=$LIBS
+         CPPFLAGS=$MPI_$3FLAGS
+         LIBS=$MPI_$3LDFLAGS
+
+         AC_TRY_LINK([#include <mpi.h>],
+                     [int rank, size;
+                      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+                      MPI_Comm_size(MPI_COMM_WORLD, &size);],
+                     [# Add a define for testing at compile time.
+                      AC_DEFINE([HAVE_MPI], [1], [Define to 1 if you have MPI libs and headers.])
+                      have_$3_mpi='yes'],
+                     [# zero out mpi flags so we don't link against the faulty library.
+                      MPI_$3FLAGS=""
+                      MPI_$3LDFLAGS=""
+                      have_$3_mpi='no'])
+
+         # AC_SUBST everything.
+         AC_SUBST($1)
+         AC_SUBST(MPI_$3FLAGS)
+         AC_SUBST(MPI_$3LDFLAGS)
+
+         LIBS=$OLD_LIBS
+         CPPFLAGS=$OLD_CPPFLAGS
+     else
+         echo Unable to find suitable MPI Compiler. Try setting $1.
+         have_$3_mpi='no'
+     fi
+])
--- a/prerequisites/Makefile.am
+++ b/prerequisites/Makefile.am
@@ -1,38 +0,0 @@
-FFTFLAGS=$(filter-out -std=c++11, $(CXXFLAGS) )
-
-EIGENVER=3.2.8
-EIGEN=eigen$(EIGENVER)
-EIGENTAR=$(EIGEN).tar.bz2
-EIGENURL=https://bitbucket.org/eigen/eigen/get/$(EIGENVER).tar.bz2
-
-FFTWVER=3.3.4
-FFTW=fftw-$(FFTWVER)
-FFTWTAR=fftw-$(FFTWVER).tar.gz
-FFTWURL=http://www.fftw.org/$(FFTWTAR)
-
-all: Eigen FFTW headerlist
-
-$(top_srcdir)/prerequisites/$(EIGENTAR): 
-	curl -v $(EIGENURL) -o $(top_srcdir)/prerequisites/$(EIGENTAR)
-
-$(top_srcdir)/prerequisites/$(FFTWTAR): 
-	curl -v $(FFTWURL) -o $(top_srcdir)/prerequisites/$(FFTWTAR)
-
-Eigen:  $(top_srcdir)/prerequisites/$(EIGENTAR)
-	tar xvf $(top_srcdir)/prerequisites/$(EIGENTAR)
-	- rm -rf  $(top_srcdir)/lib/Eigen
-	mv eigen-eigen*/Eigen .
-	echo EFILES=`find Eigen -type f -name '*.h' ` > $(top_srcdir)/lib/Eigen.inc
-	mv Eigen $(top_srcdir)/lib/
-	touch Eigen
-
-FFTW: $(top_srcdir)/prerequisites/$(FFTWTAR)
-	tar xvf $(top_srcdir)/prerequisites/$(FFTWTAR)
-	cd $(FFTW) &&	./configure --prefix=@abs_top_builddir@/prerequisites/fftwinstall CFLAGS="$(FFTFLAGS)" CC=$(CC) LDFLAGS="$(LDFLAGS)" && make all install
-	cp -pr fftwinstall/include/fftw3.h ../include/Grid/
-	cp -pr fftwinstall/lib/libfftw3.a  ../lib/
-	touch FFTW
-
-headerlist:
-	cd $(top_srcdir) && ./scripts/filelist
-	touch headerlist
--- a/prerequisites/eigen3.2.8.tar.bz2
+++ b/prerequisites/eigen3.2.8.tar.bz2
--- a/prerequisites/eigenIGENVER.tar.bz2
+++ b/prerequisites/eigenIGENVER.tar.bz2
--- a/prerequisites/fftw-3.3.4.tar.gz
+++ b/prerequisites/fftw-3.3.4.tar.gz
--- a/scripts/Make.inc
+++ b/scripts/Make.inc
@@ -1,4 +0,0 @@
-
-HFILES=
-
-CCFILES=
--- a/scripts/filelist
+++ b/scripts/filelist
@@ -1,63 +1,47 @@
-#!/bin/bash
+#!/usr/bin/env bash

 home=`pwd`
- 
+
+# library Make.inc
 cd $home/lib
-HFILES=`find . -type f -name '*.h' -not -path '*/Old/*'`
-HFILES="$HFILES Config.h"
+HFILES=`find . -type f -name '*.h' -not -path '*/Old/*' -not -path '*/Eigen/*'`
+HFILES="$HFILES"
 CCFILES=`find . -type f -name '*.cc' -not  -name '*ommunicator*.cc'`
-echo> Make.inc
-echo HFILES=$HFILES >> Make.inc
+echo HFILES=$HFILES > Make.inc
 echo >> Make.inc
 echo CCFILES=$CCFILES >> Make.inc

+# tests Make.inc
 cd $home/tests
-
 dirs=`find . -type d `
-
-for subdir in $dirs
-do
-
-pwd
-echo subdir is $subdir of $dirs
-
-cd $home/tests/$subdir
-
-TESTS=`ls T*.cc`
-TESTLIST=`echo ${TESTS} | sed s/.cc//g `
-
-echo > Make.inc
-echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
-echo >> Make.inc
-for f in $TESTS
-do
-BNAME=`basename $f .cc`
-echo >> Make.inc
-echo ${BNAME}_SOURCES=$f  >> Make.inc
-echo ${BNAME}_LDADD=-lGrid>> Make.inc
-echo >> Make.inc
-done
-
-
+for subdir in $dirs; do	# write a Make.inc into tests/ and into each directory found below it
+	cd $home/tests/$subdir
+	TESTS=`ls T*.cc`
+	TESTLIST=`echo ${TESTS} | sed -e 's/\.cc$//' -e 's/\.cc / /g'`	# drop only the literal .cc suffix of each name
+	PREF=`[ $subdir = '.' ] && echo noinst || echo EXTRA`	# tests/ itself gets noinst_PROGRAMS, other directories EXTRA_PROGRAMS
+	echo "tests: ${TESTLIST}" > Make.inc
+	echo ${PREF}_PROGRAMS = ${TESTLIST} >> Make.inc
+	echo >> Make.inc
+	for f in $TESTS; do	# one _SOURCES/_LDADD pair per test program
+		BNAME=`basename $f .cc`
+		echo ${BNAME}_SOURCES=$f  >> Make.inc
+		echo ${BNAME}_LDADD=-lGrid>> Make.inc
+		echo >> Make.inc
+	done
 done

+# benchmarks Make.inc
 cd $home/benchmarks
-
 echo> Make.inc
 TESTS=`ls B*.cc`
 TESTLIST=`echo ${TESTS} | sed s/.cc//g `
-
-echo > Make.inc
-echo bin_PROGRAMS = ${TESTLIST} >> Make.inc
-echo >> Make.inc
-
-for f in $TESTS
-do
-BNAME=`basename $f .cc`
-echo >> Make.inc
-echo ${BNAME}_SOURCES=$f  >> Make.inc
-echo ${BNAME}_LDADD=-lGrid>> Make.inc
+echo bin_PROGRAMS = ${TESTLIST} > Make.inc
 echo >> Make.inc
+for f in $TESTS; do
+	BNAME=`basename $f .cc`
+	echo ${BNAME}_SOURCES=$f  >> Make.inc
+	echo ${BNAME}_LDADD=-lGrid>> Make.inc
+	echo >> Make.inc
 done

 cd ..
--- a/scripts/update_eigen.sh
+++ b/scripts/update_eigen.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Replace lib/Eigen (relative to the current directory) with the Eigen/ tree of <archive> and regenerate lib/Eigen.inc.
+if (( $# != 1 )); then
+    echo "usage: `basename $0` <archive>" 1>&2
+    exit 1
+fi
+ARC=$1
+
+# The first archive member names the top-level directory; stop before deleting anything if it cannot be determined.
+ARCDIR=`tar -tf "${ARC}" | head -n1 | sed -e 's@/.*@@'`
+[ -n "${ARCDIR}" ] || { echo "error: cannot read archive ${ARC}" 1>&2; exit 1; }
+tar -xf "${ARC}" || { rm -rf "${ARCDIR}"; exit 1; }
+rm -rf lib/Eigen
+# --exclude is positional in GNU tar and must precede the member it filters, otherwise the *.txt files are copied too.
+(cd "${ARCDIR}" && tar -cf - --exclude='*.txt' Eigen 2>/dev/null) | tar -xf - -C lib/
+# Automake file list: one path per line, every line but the last ending in a continuation backslash.
+echo 'eigen_files =\' > lib/Eigen.inc
+(cd lib && find Eigen -type f -print) | sed 's/^/  /;$q;s/$/ \\/' >> lib/Eigen.inc
+rm -rf "${ARCDIR}"
--- a/scripts/update_fftw.sh
+++ b/scripts/update_fftw.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Refresh lib/fftw/fftw3.h (relative to the current directory) from the api/ directory of an FFTW source <archive>.
+if (( $# != 1 )); then
+    echo "usage: `basename $0` <archive>" 1>&2
+    exit 1
+fi
+ARC=$1
+
+# The first archive member names the top-level directory; stop before touching lib/fftw if it cannot be determined.
+ARCDIR=`tar -tf "${ARC}" | head -n1 | sed -e 's@/.*@@'`
+[ -n "${ARCDIR}" ] || { echo "error: cannot read archive ${ARC}" 1>&2; exit 1; }
+tar -xf "${ARC}" || { rm -rf "${ARCDIR}"; exit 1; }
+
+rm -rf lib/fftw
+mkdir lib/fftw
+cp "${ARCDIR}/api/fftw3.h" lib/fftw/; STATUS=$?
+rm -rf "${ARCDIR}"   # remove the unpacked tree whether or not the copy succeeded
+exit ${STATUS}
--- a/tests/IO/Make.inc
+++ b/tests/IO/Make.inc
@@ -1,11 +0,0 @@
-
-bin_PROGRAMS += Test_nersc_io Test_serialisation
-
-
-Test_nersc_io_SOURCES=Test_nersc_io.cc
-Test_nersc_io_LDADD=-lGrid
-
-
-Test_serialisation_SOURCES=Test_serialisation.cc
-Test_serialisation_LDADD=-lGrid
-
--- a/tests/IO/Makefile.am
+++ b/tests/IO/Makefile.am
@@ -1,19 +1 @@
-# additional include paths necessary to compile the C++ library
-
-bin_PROGRAMS =
-SUBDIRS =
-
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-if USE_LAPACK
-AM_CXXFLAGS += -DUSE_LAPACK
-if USE_LAPACK_LIB
-#if test "X${ac_LAPACK}X" != XyesX 
-AM_CXXFLAGS += -I$(ac_LAPACK)/include
-AM_LDFLAGS += -L$(ac_LAPACK)/lib
-#fi
-endif
-endif
-
 include Make.inc
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,19 +0,0 @@
-
-bin_PROGRAMS += Test_cshift Test_dwf_mixedcg_prec Test_simd Test_stencil
-
-
-Test_cshift_SOURCES=Test_cshift.cc
-Test_cshift_LDADD=-lGrid
-
-
-Test_dwf_mixedcg_prec_SOURCES=Test_dwf_mixedcg_prec.cc
-Test_dwf_mixedcg_prec_LDADD=-lGrid
-
-
-Test_simd_SOURCES=Test_simd.cc
-Test_simd_LDADD=-lGrid
-
-
-Test_stencil_SOURCES=Test_stencil.cc
-Test_stencil_LDADD=-lGrid
-
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -1,26 +1,7 @@
-# additional include paths necessary to compile the C++ library
-
-#SUBDIRS = core
-
-# Uncomment to enable complete test suite build
 SUBDIRS = core forces hmc solver debug	

 if BUILD_CHROMA_REGRESSION
  SUBDIRS+= qdpxx
 endif
-bin_PROGRAMS =
-
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-if USE_LAPACK
-AM_CXXFLAGS += -DUSE_LAPACK
-if USE_LAPACK_LIB
-#if test "X${ac_LAPACK}X" != XyesX 
-AM_CXXFLAGS += -I$(ac_LAPACK)/include
-AM_LDFLAGS += -L$(ac_LAPACK)/lib
-#fi
-endif
-endif

 include Make.inc
--- a/tests/core/Make.inc
+++ b/tests/core/Make.inc
@@ -1,83 +1,69 @@
-
-bin_PROGRAMS += Test_cf_coarsen_support Test_checker Test_contfrac_even_odd Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_even_odd Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpwilson_even_odd Test_lie_generators Test_main Test_quenched_update Test_RectPlaq Test_rng Test_rng_fixed Test_wilson_even_odd Test_wilson_tm_even_odd
-
+tests: Test_cf_coarsen_support Test_checker Test_contfrac_even_odd Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_even_odd Test_dwf_rb5d Test_fft Test_fftf Test_gamma Test_GaugeAction Test_gparity Test_gpwilson_even_odd Test_lie_generators Test_main Test_quenched_update Test_RectPlaq Test_rng Test_rng_fixed Test_wilson_even_odd Test_wilson_tm_even_odd
+EXTRA_PROGRAMS = Test_cf_coarsen_support Test_checker Test_contfrac_even_odd Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_even_odd Test_dwf_rb5d Test_fft Test_fftf Test_gamma Test_GaugeAction Test_gparity Test_gpwilson_even_odd Test_lie_generators Test_main Test_quenched_update Test_RectPlaq Test_rng Test_rng_fixed Test_wilson_even_odd Test_wilson_tm_even_odd

 Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
 Test_cf_coarsen_support_LDADD=-lGrid

-
 Test_checker_SOURCES=Test_checker.cc
 Test_checker_LDADD=-lGrid

-
 Test_contfrac_even_odd_SOURCES=Test_contfrac_even_odd.cc
 Test_contfrac_even_odd_LDADD=-lGrid

-
 Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 Test_cshift_red_black_LDADD=-lGrid

-
 Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
 Test_cshift_red_black_rotate_LDADD=-lGrid

-
 Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
 Test_cshift_rotate_LDADD=-lGrid

-
 Test_dwf_even_odd_SOURCES=Test_dwf_even_odd.cc
 Test_dwf_even_odd_LDADD=-lGrid

-
 Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
 Test_dwf_rb5d_LDADD=-lGrid

+Test_fft_SOURCES=Test_fft.cc
+Test_fft_LDADD=-lGrid
+
+Test_fftf_SOURCES=Test_fftf.cc
+Test_fftf_LDADD=-lGrid

 Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid

-
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid

-
 Test_gparity_SOURCES=Test_gparity.cc
 Test_gparity_LDADD=-lGrid

-
 Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 Test_gpwilson_even_odd_LDADD=-lGrid

-
 Test_lie_generators_SOURCES=Test_lie_generators.cc
 Test_lie_generators_LDADD=-lGrid

-
 Test_main_SOURCES=Test_main.cc
 Test_main_LDADD=-lGrid

-
 Test_quenched_update_SOURCES=Test_quenched_update.cc
 Test_quenched_update_LDADD=-lGrid

-
 Test_RectPlaq_SOURCES=Test_RectPlaq.cc
 Test_RectPlaq_LDADD=-lGrid

-
 Test_rng_SOURCES=Test_rng.cc
 Test_rng_LDADD=-lGrid

-
 Test_rng_fixed_SOURCES=Test_rng_fixed.cc
 Test_rng_fixed_LDADD=-lGrid

-
 Test_wilson_even_odd_SOURCES=Test_wilson_even_odd.cc
 Test_wilson_even_odd_LDADD=-lGrid

-
 Test_wilson_tm_even_odd_SOURCES=Test_wilson_tm_even_odd.cc
 Test_wilson_tm_even_odd_LDADD=-lGrid

--- a/tests/core/Makefile.am
+++ b/tests/core/Makefile.am
@@ -1,19 +1 @@
-# additional include paths necessary to compile the C++ library
-
-bin_PROGRAMS =
-SUBDIRS =
-
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-if USE_LAPACK
-AM_CXXFLAGS += -DUSE_LAPACK
-if USE_LAPACK_LIB
-#if test "X${ac_LAPACK}X" != XyesX 
-AM_CXXFLAGS += -I$(ac_LAPACK)/include
-AM_LDFLAGS += -L$(ac_LAPACK)/lib
-#fi
-endif
-endif
-
 include Make.inc
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -0,0 +1,111 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_fft.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+// Forward-transforms a single plane wave with FFT::FFT_dim and prints how far the result is from the expected field.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  // Product of the lattice extents; poked at site p below as the expected transform value.
+  int vol = 1;
+  for(int extent : latt_size){
+    vol = vol * extent;
+  }
+  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
+
+  LatticeComplexD       C(&Fine);
+  LatticeComplexD  Ctilde(&Fine);
+  LatticeComplexD    coor(&Fine);
+
+  LatticeSpinMatrixD    S(&Fine);
+  LatticeSpinMatrixD    Stilde(&Fine);
+
+  // Momentum of the input plane wave; also the site where the reference value is poked.
+  std::vector<int> p({1,2,3,2});
+  const int ndim = p.size();
+
+  ComplexD ci(0.0,1.0);
+
+  // C = exp(-i * sum_mu (2 pi / latt_size[mu]) * p[mu] * coor), coor being filled by LatticeCoordinate(coor,mu).
+  C=zero;
+  for(int mu=0;mu<ndim;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    LatticeCoordinate(coor,mu);
+    C = C - (TwoPiL * p[mu]) * coor;
+  }
+  C = exp(C*ci);
+
+  S=zero;
+  S = S+C;
+
+  FFT theFFT(&Fine);
+  // Forward-transform one dimension per pass, feeding each result into the next pass.
+  for(int mu=0;mu<ndim;mu++){
+    theFFT.FFT_dim(Ctilde,C,mu,FFT::forward);
+    C=Ctilde;
+    std::cout << theFFT.MFlops()<<std::endl;
+  }
+
+  TComplexD cVol;
+  cVol()()() = vol;
+
+  // Reference field: zero except cVol at site p.  Print norm2 of its difference from the transform.
+  C=zero;
+  pokeSite(cVol,C,p);
+  C=C-Ctilde;
+  std::cout << "diff scalar "<<norm2(C) << std::endl;
+
+  // Repeat the transform and the comparison for the spin-matrix field.
+  for(int mu=0;mu<ndim;mu++){
+    theFFT.FFT_dim(Stilde,S,mu,FFT::forward);
+    S=Stilde;
+    std::cout << theFFT.MFlops()<<std::endl;
+  }
+
+  SpinMatrixD Sp; 
+  Sp = zero; Sp = Sp+cVol;
+
+  S=zero;
+  pokeSite(Sp,S,p);
+  S= S-Stilde;
+  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
+
+  Grid_finalize();
+}
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@@ -0,0 +1,111 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_fftf.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+// Single-precision variant: forward-transforms a plane wave with FFT::FFT_dim and prints how far the result is from the expected field.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout( { vComplexF::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  // Product of the lattice extents; poked at site p below as the expected transform value.
+  int vol = 1;
+  for(int extent : latt_size){
+    vol = vol * extent;
+  }
+  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
+
+  LatticeComplexF       C(&Fine);
+  LatticeComplexF  Ctilde(&Fine);
+  LatticeComplexF    coor(&Fine);
+
+  LatticeSpinMatrixF    S(&Fine);
+  LatticeSpinMatrixF    Stilde(&Fine);
+
+  // Momentum of the input plane wave; also the site where the reference value is poked.
+  std::vector<int> p({1,2,3,2});
+  const int ndim = p.size();
+
+  ComplexF ci(0.0,1.0);
+
+  // C = exp(-i * sum_mu (2 pi / latt_size[mu]) * p[mu] * coor), coor being filled by LatticeCoordinate(coor,mu).
+  C=zero;
+  for(int mu=0;mu<ndim;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    LatticeCoordinate(coor,mu);
+    C = C - (TwoPiL * p[mu]) * coor;
+  }
+  C = exp(C*ci);
+
+  S=zero;
+  S = S+C;
+
+  FFT theFFT(&Fine);
+  // Forward-transform one dimension per pass, feeding each result into the next pass.
+  for(int mu=0;mu<ndim;mu++){
+    theFFT.FFT_dim(Ctilde,C,mu,FFT::forward);
+    C=Ctilde;
+    std::cout << theFFT.MFlops()<<std::endl;
+  }
+
+  TComplexF cVol;
+  cVol()()() = vol;
+
+  // Reference field: zero except cVol at site p.  Print norm2 of its difference from the transform.
+  C=zero;
+  pokeSite(cVol,C,p);
+  C=C-Ctilde;
+  std::cout << "diff scalar "<<norm2(C) << std::endl;
+
+  // Repeat the transform and the comparison for the spin-matrix field.
+  for(int mu=0;mu<ndim;mu++){
+    theFFT.FFT_dim(Stilde,S,mu,FFT::forward);
+    S=Stilde;
+    std::cout << theFFT.MFlops()<<std::endl;
+  }
+
+  SpinMatrixF Sp; 
+  Sp = zero; Sp = Sp+cVol;
+
+  S=zero;
+  pokeSite(Sp,S,p);
+  S= S-Stilde;
+  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
+
+  Grid_finalize();
+}
--- a/tests/debug/Make.inc
+++ b/tests/debug/Make.inc
@@ -1,35 +0,0 @@
-
-bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_even_odd_vec Test_cayley_ldop_cr Test_cheby Test_synthetic_lanczos 
-
-
-Test_cayley_cg_SOURCES=Test_cayley_cg.cc
-Test_cayley_cg_LDADD=-lGrid
-
-
-Test_cayley_coarsen_support_SOURCES=Test_cayley_coarsen_support.cc
-Test_cayley_coarsen_support_LDADD=-lGrid
-
-
-Test_cayley_even_odd_SOURCES=Test_cayley_even_odd.cc
-Test_cayley_even_odd_LDADD=-lGrid
-
-
-Test_cayley_even_odd_vec_SOURCES=Test_cayley_even_odd_vec.cc
-Test_cayley_even_odd_vec_LDADD=-lGrid
-
-
-Test_cayley_ldop_cr_SOURCES=Test_cayley_ldop_cr.cc
-Test_cayley_ldop_cr_LDADD=-lGrid
-
-
-Test_cheby_SOURCES=Test_cheby.cc
-Test_cheby_LDADD=-lGrid
-
-
-Test_synthetic_lanczos_SOURCES=Test_synthetic_lanczos.cc
-Test_synthetic_lanczos_LDADD=-lGrid
-
-
-Test_zmm_SOURCES=Test_zmm.cc
-Test_zmm_LDADD=-lGrid
-
--- a/tests/debug/Makefile.am
+++ b/tests/debug/Makefile.am
@@ -1,19 +1 @@
-# additional include paths necessary to compile the C++ library
-
-bin_PROGRAMS =
-SUBDIRS =
-
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-if USE_LAPACK
-AM_CXXFLAGS += -DUSE_LAPACK
-if USE_LAPACK_LIB
-#if test "X${ac_LAPACK}X" != XyesX 
-AM_CXXFLAGS += -I$(ac_LAPACK)/include
-AM_LDFLAGS += -L$(ac_LAPACK)/lib
-#fi
-endif
-endif
-
 include Make.inc
--- a/tests/debug/Test_cayley_even_odd_vec.cc
+++ b/tests/debug/Test_cayley_even_odd_vec.cc
@@ -44,6 +44,7 @@ struct scal {
  };

 typedef DomainWallFermion<DomainWallVec5dImplR>                      DomainWallVecFermionR;
+typedef ZMobiusFermion<ZDomainWallVec5dImplR>                        ZMobiusVecFermionR;
 typedef MobiusFermion<DomainWallVec5dImplR>                          MobiusVecFermionR;
 typedef MobiusZolotarevFermion<DomainWallVec5dImplR>                 MobiusZolotarevVecFermionR;
 typedef ScaledShamirFermion<DomainWallVec5dImplR>                    ScaledShamirVecFermionR;
@@ -117,6 +118,17 @@ int main (int argc, char ** argv)
  TestWhat<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);
  TestWhat<MobiusVecFermionR>(sDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5);

+
+  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
+  std::cout<<GridLogMessage <<"Z-MobiusFermion test"<<std::endl;
+  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
+  std::vector<ComplexD> gamma(Ls,std::complex<double>(1.0,0.0));
+  ZMobiusFermionR     zDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,gamma,b,c);
+  ZMobiusVecFermionR szDmob(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c);
+  TestMoo(zDmob,szDmob);
+  TestWhat<ZMobiusFermionR>(zDmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);
+  TestWhat<ZMobiusVecFermionR>(szDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5);
+
  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
  std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
--- a/tests/debug/Test_zmm.cc
+++ b/tests/debug/Test_zmm.cc
@@ -26,9 +26,15 @@ See the full license in the file "LICENSE" in the top level distribution directo
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
-#include <PerfCount.h>
+#include <Grid/PerfCount.h>
+
+#ifdef TEST_ZMM
+
+int main(int argc,char **argv)
+{
+  return 0;
+}

-int main(int argc, char **argv) { return 0; }
 #if 0
 #include <simd/Intel512wilson.h>
 using namespace Grid;
@@ -478,5 +484,12 @@ void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)

  return;
 }
-
+#endif
+#else
+int main(int argc, char **argv)
+{
+  std::cerr << "error: no ZMM test for the selected architecture" << std::endl;
+
+  return 1;
+}
 #endif
--- a/tests/forces/Make.inc
+++ b/tests/forces/Make.inc
@@ -1,47 +1,36 @@
-
-bin_PROGRAMS += Test_contfrac_force Test_dwf_force Test_dwf_gpforce Test_gpdwf_force Test_gp_rect_force Test_gpwilson_force Test_partfrac_force Test_rect_force Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
-
+tests: Test_contfrac_force Test_dwf_force Test_dwf_gpforce Test_gpdwf_force Test_gp_rect_force Test_gpwilson_force Test_partfrac_force Test_rect_force Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
+EXTRA_PROGRAMS = Test_contfrac_force Test_dwf_force Test_dwf_gpforce Test_gpdwf_force Test_gp_rect_force Test_gpwilson_force Test_partfrac_force Test_rect_force Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi

 Test_contfrac_force_SOURCES=Test_contfrac_force.cc
 Test_contfrac_force_LDADD=-lGrid

-
 Test_dwf_force_SOURCES=Test_dwf_force.cc
 Test_dwf_force_LDADD=-lGrid

-
 Test_dwf_gpforce_SOURCES=Test_dwf_gpforce.cc
 Test_dwf_gpforce_LDADD=-lGrid

-
 Test_gpdwf_force_SOURCES=Test_gpdwf_force.cc
 Test_gpdwf_force_LDADD=-lGrid

-
 Test_gp_rect_force_SOURCES=Test_gp_rect_force.cc
 Test_gp_rect_force_LDADD=-lGrid

-
 Test_gpwilson_force_SOURCES=Test_gpwilson_force.cc
 Test_gpwilson_force_LDADD=-lGrid

-
 Test_partfrac_force_SOURCES=Test_partfrac_force.cc
 Test_partfrac_force_LDADD=-lGrid

-
 Test_rect_force_SOURCES=Test_rect_force.cc
 Test_rect_force_LDADD=-lGrid

-
 Test_wilson_force_SOURCES=Test_wilson_force.cc
 Test_wilson_force_LDADD=-lGrid

-
 Test_wilson_force_phiMdagMphi_SOURCES=Test_wilson_force_phiMdagMphi.cc
 Test_wilson_force_phiMdagMphi_LDADD=-lGrid

-
 Test_wilson_force_phiMphi_SOURCES=Test_wilson_force_phiMphi.cc
 Test_wilson_force_phiMphi_LDADD=-lGrid

--- a/tests/forces/Makefile.am
+++ b/tests/forces/Makefile.am
@@ -1,19 +1 @@
-# additional include paths necessary to compile the C++ library
-
-bin_PROGRAMS =
-SUBDIRS =
-
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-if USE_LAPACK
-AM_CXXFLAGS += -DUSE_LAPACK
-if USE_LAPACK_LIB
-#if test "X${ac_LAPACK}X" != XyesX 
-AM_CXXFLAGS += -I$(ac_LAPACK)/include
-AM_LDFLAGS += -L$(ac_LAPACK)/lib
-#fi
-endif
-endif
-
 include Make.inc
--- a/tests/hmc/Make.inc
+++ b/tests/hmc/Make.inc
@@ -1,75 +1,57 @@
-
-bin_PROGRAMS += Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonAdjointFermionGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_multishift_sqrt Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio
-
+tests: Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonAdjointFermionGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_multishift_sqrt Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio
+EXTRA_PROGRAMS = Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonAdjointFermionGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_multishift_sqrt Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio

 Test_hmc_EODWFRatio_SOURCES=Test_hmc_EODWFRatio.cc
 Test_hmc_EODWFRatio_LDADD=-lGrid

-
 Test_hmc_EODWFRatio_Gparity_SOURCES=Test_hmc_EODWFRatio_Gparity.cc
 Test_hmc_EODWFRatio_Gparity_LDADD=-lGrid

-
 Test_hmc_EOWilsonFermionGauge_SOURCES=Test_hmc_EOWilsonFermionGauge.cc
 Test_hmc_EOWilsonFermionGauge_LDADD=-lGrid

-
 Test_hmc_EOWilsonRatio_SOURCES=Test_hmc_EOWilsonRatio.cc
 Test_hmc_EOWilsonRatio_LDADD=-lGrid

-
 Test_hmc_GparityIwasakiGauge_SOURCES=Test_hmc_GparityIwasakiGauge.cc
 Test_hmc_GparityIwasakiGauge_LDADD=-lGrid

-
 Test_hmc_GparityWilsonGauge_SOURCES=Test_hmc_GparityWilsonGauge.cc
 Test_hmc_GparityWilsonGauge_LDADD=-lGrid

-
 Test_hmc_IwasakiGauge_SOURCES=Test_hmc_IwasakiGauge.cc
 Test_hmc_IwasakiGauge_LDADD=-lGrid

-
 Test_hmc_RectGauge_SOURCES=Test_hmc_RectGauge.cc
 Test_hmc_RectGauge_LDADD=-lGrid

-
 Test_hmc_WilsonAdjointFermionGauge_SOURCES=Test_hmc_WilsonAdjointFermionGauge.cc
 Test_hmc_WilsonAdjointFermionGauge_LDADD=-lGrid

-
 Test_hmc_WilsonFermionGauge_SOURCES=Test_hmc_WilsonFermionGauge.cc
 Test_hmc_WilsonFermionGauge_LDADD=-lGrid

-
 Test_hmc_WilsonGauge_SOURCES=Test_hmc_WilsonGauge.cc
 Test_hmc_WilsonGauge_LDADD=-lGrid

-
 Test_hmc_WilsonRatio_SOURCES=Test_hmc_WilsonRatio.cc
 Test_hmc_WilsonRatio_LDADD=-lGrid

-
 Test_multishift_sqrt_SOURCES=Test_multishift_sqrt.cc
 Test_multishift_sqrt_LDADD=-lGrid

-
 Test_remez_SOURCES=Test_remez.cc
 Test_remez_LDADD=-lGrid

-
 Test_rhmc_EOWilson1p1_SOURCES=Test_rhmc_EOWilson1p1.cc
 Test_rhmc_EOWilson1p1_LDADD=-lGrid

-
 Test_rhmc_EOWilsonRatio_SOURCES=Test_rhmc_EOWilsonRatio.cc
 Test_rhmc_EOWilsonRatio_LDADD=-lGrid

-
 Test_rhmc_Wilson1p1_SOURCES=Test_rhmc_Wilson1p1.cc
 Test_rhmc_Wilson1p1_LDADD=-lGrid

-
 Test_rhmc_WilsonRatio_SOURCES=Test_rhmc_WilsonRatio.cc
 Test_rhmc_WilsonRatio_LDADD=-lGrid

--- a/tests/hmc/Makefile.am
+++ b/tests/hmc/Makefile.am
@@ -1,19 +1 @@
-# additional include paths necessary to compile the C++ library
-
-bin_PROGRAMS =
-SUBDIRS =
-
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-if USE_LAPACK
-AM_CXXFLAGS += -DUSE_LAPACK
-if USE_LAPACK_LIB
-#if test "X${ac_LAPACK}X" != XyesX 
-AM_CXXFLAGS += -I$(ac_LAPACK)/include
-AM_LDFLAGS += -L$(ac_LAPACK)/lib
-#fi
-endif
-endif
-
 include Make.inc
--- a/tests/qdpxx/Make.inc
+++ b/tests/qdpxx/Make.inc
@@ -1,11 +0,0 @@
-
-bin_PROGRAMS += Test_qdpxx_loops_staples Test_qdpxx_munprec
-
-
-Test_qdpxx_loops_staples_SOURCES=Test_qdpxx_loops_staples.cc
-Test_qdpxx_loops_staples_LDADD=-lGrid
-
-
-Test_qdpxx_munprec_SOURCES=Test_qdpxx_munprec.cc
-Test_qdpxx_munprec_LDADD=-lGrid
-
--- a/tests/qdpxx/Makefile.am
+++ b/tests/qdpxx/Makefile.am
@@ -1,6 +1,4 @@
-# additional include paths necessary to compile the C++ library
+AM_CXXFLAGS += `chroma-config --cxxflags`
+AM_LDFLAGS  += `chroma-config --ldflags` `chroma-config --libs`

-AM_CXXFLAGS = -I$(top_srcdir)/include `chroma-config --cxxflags`
-AM_LDFLAGS = -L$(top_builddir)/lib `chroma-config --ldflags` `chroma-config --libs`
-bin_PROGRAMS=
 include Make.inc
--- a/tests/solver/Make.inc
+++ b/tests/solver/Make.inc
@@ -1,55 +0,0 @@
-
-bin_PROGRAMS += Test_cf_cr_unprec Test_contfrac_cg Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_fpgcr Test_dwf_hdcr Test_dwf_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec
-
-
-Test_cf_cr_unprec_SOURCES=Test_cf_cr_unprec.cc
-Test_cf_cr_unprec_LDADD=-lGrid
-
-
-Test_contfrac_cg_SOURCES=Test_contfrac_cg.cc
-Test_contfrac_cg_LDADD=-lGrid
-
-
-Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
-Test_dwf_cg_prec_LDADD=-lGrid
-
-
-Test_dwf_cg_schur_SOURCES=Test_dwf_cg_schur.cc
-Test_dwf_cg_schur_LDADD=-lGrid
-
-
-Test_dwf_cg_unprec_SOURCES=Test_dwf_cg_unprec.cc
-Test_dwf_cg_unprec_LDADD=-lGrid
-
-
-Test_dwf_cr_unprec_SOURCES=Test_dwf_cr_unprec.cc
-Test_dwf_cr_unprec_LDADD=-lGrid
-
-
-Test_dwf_fpgcr_SOURCES=Test_dwf_fpgcr.cc
-Test_dwf_fpgcr_LDADD=-lGrid
-
-
-Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
-Test_dwf_hdcr_LDADD=-lGrid
-
-
-Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
-Test_dwf_lanczos_LDADD=-lGrid
-
-
-Test_wilson_cg_prec_SOURCES=Test_wilson_cg_prec.cc
-Test_wilson_cg_prec_LDADD=-lGrid
-
-
-Test_wilson_cg_schur_SOURCES=Test_wilson_cg_schur.cc
-Test_wilson_cg_schur_LDADD=-lGrid
-
-
-Test_wilson_cg_unprec_SOURCES=Test_wilson_cg_unprec.cc
-Test_wilson_cg_unprec_LDADD=-lGrid
-
-
-Test_wilson_cr_unprec_SOURCES=Test_wilson_cr_unprec.cc
-Test_wilson_cr_unprec_LDADD=-lGrid
-
--- a/tests/solver/Makefile.am
+++ b/tests/solver/Makefile.am
@@ -1,19 +1 @@
-# additional include paths necessary to compile the C++ library
-
-bin_PROGRAMS =
-SUBDIRS =
-
-AM_CXXFLAGS = -I$(top_srcdir)/include
-AM_LDFLAGS = -L$(top_builddir)/lib
-
-if USE_LAPACK
-AM_CXXFLAGS += -DUSE_LAPACK
-if USE_LAPACK_LIB
-#if test "X${ac_LAPACK}X" != XyesX 
-AM_CXXFLAGS += -I$(ac_LAPACK)/include
-AM_LDFLAGS += -L$(ac_LAPACK)/lib
-#fi
-endif
-endif
-
 include Make.inc