From df6c9f55d1af1c40607087f241c2a98edf021075 Mon Sep 17 00:00:00 2001
From: Richard Rollins <rprollins@users.noreply.github.com>
Date: Wed, 20 Jul 2016 17:38:56 +0100
Subject: [PATCH 01/29] Use common benchmark output format for dwf_sweep and
 zmm

---
 benchmarks/Benchmark_dwf_sweep.cc |  2 ++
 benchmarks/Benchmark_zmm.cc       | 16 ++++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc
index d52112ba..18d2f389 100644
--- a/benchmarks/Benchmark_dwf_sweep.cc
+++ b/benchmarks/Benchmark_dwf_sweep.cc
@@ -61,6 +61,8 @@ int main (int argc, char ** argv)
     QCD::WilsonKernelsStatic::AsmOpt=0;
   }
 
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
   std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
diff --git a/benchmarks/Benchmark_zmm.cc b/benchmarks/Benchmark_zmm.cc
index 6de5604b..c8ece12c 100644
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -40,14 +40,20 @@ int main(int argc,char **argv)
   std::ofstream os("zmm.dat");
 
   os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
+  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
+  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
+  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
   for(int L=4;L<=32;L+=4){
     for(int m=1;m<=2;m++){
       for(int Ls=8;Ls<=16;Ls+=8){
 	std::vector<int> grid({L,L,m*L,m*L});
+  std::cout << GridLogMessage <<"\t";
 	for(int i=0;i<4;i++) { 
 	  std::cout << grid[i]<<"x";
 	}
-	std::cout << Ls<<std::endl;
+	std::cout << Ls<<"\t\t";
 	bench(os,grid,Ls);
       }
     }
@@ -104,7 +110,6 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   RealD M5  =1.8;
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
-  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
   int ncall=50;
   double t0=usecond();
   for(int i=0;i<ncall;i++){
@@ -116,7 +121,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   double flops=1344*volume/2;
 
   mfc = flops*ncall/(t1-t0);
-  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+  std::cout<<mfc<<"\t\t";
 
   QCD::WilsonKernelsStatic::AsmOpt=1;
   t0=usecond();
@@ -125,7 +130,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   }
   t1=usecond();
   mfa = flops*ncall/(t1-t0);
-  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+  std::cout<<mfa<<"\t\t";
   /*
   int dag=DaggerNo;
   t0=usecond();
@@ -163,8 +168,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   //resulta = (-0.5) * resulta;
 
   diff = resulto-resulta;
-  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
-  std::cout<<std::endl;
+  std::cout<<norm2(diff)<<std::endl;
   return 0;
 }
 

From d1aaff65e874e3e016e660ed82176cf4e9ebffc0 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Thu, 4 Aug 2016 16:27:02 +0100
Subject: [PATCH 02/29] README update

---
 README.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a41564d4..2a0acad6 100644
--- a/README.md
+++ b/README.md
@@ -68,10 +68,18 @@ Now you can execute the `configure` script to generate makefiles (here from a bu
 
 ``` bash
 mkdir build; cd build
-../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi --prefix=<path>
+../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
 ```
 
-where `--enable-precision=` set the default precision (`single` or `double`), `--enable-simd=` set the SIMD type (see possible values below), `--enable-comms=` set the protocol used for communications (`none`, `mpi` or `shmem`), and `<path>` should be replaced by the prefix path where you want to install Grid. Other options are available, use `configure --help` to display them. Like with any other program using GNU autotool, the `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to customise the build.
+where `--enable-precision=` set the default precision (`single` or `double`),
+`--enable-simd=` set the SIMD type (see possible values below), `--enable-
+comms=` set the protocol used for communications (`none`, `mpi`, `mpi-auto` or
+`shmem`), and `<path>` should be replaced by the prefix path where you want to
+install Grid. The `mpi-auto` communication option set `configure` to determine
+automatically how to link to MPI. Other options are available, use `configure
+--help` to display them. Like with any other program using GNU autotool, the
+`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
+customise the build.
 
 Finally, you can build and install Grid:
 

From 32bc7a6ab893e79ba033fb29ca4618e59278b2c5 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 5 Aug 2016 10:36:00 +0100
Subject: [PATCH 03/29] MPI back out of change that hangs AVX2 for clang, gcc
 needs the -mfma flag.

---
 benchmarks/Benchmark_comms.cc          |  4 +--
 configure.ac                           |  2 +-
 lib/communicator/Communicator_base.h   |  9 ------
 lib/communicator/Communicator_mpi.cc   | 39 ++++++++------------------
 lib/communicator/Communicator_none.cc  | 13 ---------
 lib/communicator/Communicator_shmem.cc | 13 ---------
 6 files changed, 14 insertions(+), 66 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 7162a0f9..d0cd96c7 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -194,7 +194,7 @@ int main (int argc, char ** argv)
     }
   }  
 
-
+#if 0
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
@@ -315,7 +315,7 @@ int main (int argc, char ** argv)
     }
   }
 
-
+#endif
 
   Grid_finalize();
 }
diff --git a/configure.ac b/configure.ac
index 905a665b..89bb3568 100644
--- a/configure.ac
+++ b/configure.ac
@@ -108,7 +108,7 @@ case ${ax_cv_cxx_compiler_vendor} in
         SIMD_FLAGS='-mavx -mfma4';;
       AVX2)
         AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
-        SIMD_FLAGS='-mavx2';;
+        SIMD_FLAGS='-mavx2 -mfma';;
       AVX512|AVX512MIC|KNL)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index dd19c4f4..94d277e9 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -127,21 +127,12 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);
 
-    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-			    void *xmit,
-			    int xmit_to_rank,
-			    void *recv,
-			    int recv_from_rank,
-			    int bytes);
-
     void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
-
-    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
     void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 
     ////////////////////////////////////////////////////////////
diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc
index 6830c14a..dff9811a 100644
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -144,28 +144,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }
 
 // Basic Halo comms primitive
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-					       void *xmit,
-					       int dest,
-					       void *recv,
-					       int from,
-					       int bytes)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,dest,_processor,communicator,&rrq);
-  assert(ierr==0);
-  list.push_back(xrq);
-  list.push_back(rrq);
-}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  MPI_Startall(list.size(),&list[0]);
-}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -173,12 +151,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromBegin(reqs);
-  for(int i=0;i<reqs.size();i++){
-    list.push_back(reqs[i]);
-  }
+  MPI_Request xrq;
+  MPI_Request rrq;
+  int rank = _processor;
+  int ierr;
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  
+  assert(ierr==0);
+
+  list.push_back(xrq);
+  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc
index 2873462a..8601255a 100644
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -84,19 +84,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
   assert(0);
 }
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0);
-}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  assert(0);
-}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
   assert(0);
diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc
index c415b13d..091e266e 100644
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -268,10 +268,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }
 
 // Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  assert(0); //unimplemented
-}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -284,15 +280,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
   //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
   shmem_putmem(recv,xmit,bytes,dest);
 }
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0); // Unimplemented
-}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
   //  shmem_quiet();      // I'm done

From 5a68715be34f70594d0c08643afe0a2d0c4316be Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 5 Aug 2016 10:51:57 +0100
Subject: [PATCH 04/29] Richards sweep test

---
 benchmarks/Benchmark_wilson_sweep.cc | 117 +++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 benchmarks/Benchmark_wilson_sweep.cc

diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc
new file mode 100644
index 00000000..d7c712d3
--- /dev/null
+++ b/benchmarks/Benchmark_wilson_sweep.cc
@@ -0,0 +1,117 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./benchmarks/Benchmark_wilson.cc
+    Copyright (C) 2015
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Richard Rollins <rprollins@users.noreply.github.com>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+Gamma::GammaMatrix Gmu [] = {
+  Gamma::GammaX,
+  Gamma::GammaY,
+  Gamma::GammaZ,
+  Gamma::GammaT
+};
+
+bool overlapComms = false;
+
+void bench_wilson (
+		   LatticeFermion &    src,
+		   LatticeFermion & result,
+		   WilsonFermionR &     Dw,
+		   double const     volume,
+		   int const           dag );
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }
+  typename WilsonFermionR::ImplParams params;
+  params.overlapCommsCompute = overlapComms;
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  std::vector<int> seeds({1,2,3,4});
+  RealD mass = 0.1;
+
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+
+  int Lmax = 32;
+  int dmin = 0;
+  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
+  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
+  for (int L=8; L<=Lmax; L*=2)
+    {
+      std::vector<int> latt_size = std::vector<int>(4,L);
+      for(int d=4; d>dmin; d--)
+	{
+	  if ( d<=3 ) { latt_size[d] *= 2; }
+
+	  std::cout << GridLogMessage;
+	  std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator<int>( std::cout, std::string("x").c_str() ) );
+	  std::cout << latt_size.back() << "\t\t";
+
+	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
+	  GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
+
+	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
+	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
+	  LatticeFermion    src(&Grid); random(pRNG,src);
+	  LatticeFermion result(&Grid); result=zero;
+
+	  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
+
+	  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
+      
+	  bench_wilson(src,result,Dw,volume,DaggerNo);
+	  bench_wilson(src,result,Dw,volume,DaggerYes);
+	  std::cout << std::endl;
+	}
+    }
+
+  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
+  Grid_finalize();
+}
+
+void bench_wilson (
+		   LatticeFermion &    src,
+		   LatticeFermion & result,
+		   WilsonFermionR &     Dw,
+		   double const     volume,
+		   int const           dag )
+{
+  int ncall    = 1000;
+  double t0    = usecond();
+  for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
+  double t1    = usecond();
+  double flops = 1344 * volume * ncall;
+  std::cout << flops/(t1-t0) << "\t\t";
+}

From 836f93780c4faa579e4a28bcb8ea6bc3484db4c0 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 5 Aug 2016 13:41:52 +0100
Subject: [PATCH 05/29] first try at including MPI tests in Travis

---
 .travis.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 0733d037..c7c10d24 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,6 +23,8 @@ matrix:
             - libmpfr-dev
             - libgmp-dev
             - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
             - binutils-dev
       env: VERSION=-4.9
     - compiler: gcc
@@ -35,6 +37,8 @@ matrix:
             - libmpfr-dev
             - libgmp-dev
             - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
             - binutils-dev
       env: VERSION=-5
     - compiler: clang
@@ -47,6 +51,8 @@ matrix:
             - libmpfr-dev
             - libgmp-dev
             - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
             - binutils-dev
       env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
     - compiler: clang
@@ -59,6 +65,8 @@ matrix:
             - libmpfr-dev
             - libgmp-dev
             - libmpc-dev
+            - libopenmpi-dev
+            - openmpi-bin
             - binutils-dev
       env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
       
@@ -69,6 +77,7 @@ before_install:
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
     
 install:
@@ -92,3 +101,8 @@ script:
     - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1
+    - echo make clean
+    - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
+    - make -j4
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi

From 8dc2cfcedbf635fced78845d86aeec2a00fe0239 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 6 Aug 2016 00:28:28 +0100
Subject: [PATCH 06/29] Adding fftw header pulling

---
 bootstrap.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bootstrap.sh b/bootstrap.sh
index 5e9d4041..461eb121 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,11 +1,18 @@
 #!/usr/bin/env bash
 
 EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
+FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 
 echo "-- deploying Eigen source..."
 wget ${EIGEN_URL}
 ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
 rm `basename ${EIGEN_URL}`
+
+echo "-- copying fftw prototypes..."
+wget ${FFTW_URL}
+./scripts/update_fftw.sh `basename ${FFTW_URL}`
+rm `basename ${FFTW_URL}`
+
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'

From fc25d2295cdaaed7ce5e7d629ef2a30696b0d12e Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sat, 6 Aug 2016 00:28:52 +0100
Subject: [PATCH 07/29] fftw download

---
 scripts/update_fftw.sh | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100755 scripts/update_fftw.sh

diff --git a/scripts/update_fftw.sh b/scripts/update_fftw.sh
new file mode 100755
index 00000000..20e42080
--- /dev/null
+++ b/scripts/update_fftw.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+if (( $# != 1 )); then
+    echo "usage: `basename $0` <archive>" 1>&2
+    exit 1
+fi
+ARC=$1
+
+INITDIR=`pwd`
+rm -rf lib/fftw
+mkdir lib/fftw
+
+ARCDIR=`tar -tf ${ARC} | head -n1 | sed -e 's@/.*@@'`
+tar -xf ${ARC}
+cp ${ARCDIR}/api/fftw3.h lib/fftw/
+
+cd ${INITDIR}
+rm -rf ${ARCDIR}

From ec68e08dd26c1dab129e5650fa8d93c9a3e60394 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Sat, 6 Aug 2016 00:36:05 +0100
Subject: [PATCH 08/29] Travis MPI fix

---
 .travis.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c7c10d24..2fe52a68 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -102,7 +102,8 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1
     - echo make clean
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED'; fi
     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
     - make -j4
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

From 15218ec57fdff7e6b4af10a0c730bbddac5c2e06 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Sat, 6 Aug 2016 00:49:14 +0100
Subject: [PATCH 09/29] more Travis MPI fix

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 2fe52a68..73430b4d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -102,7 +102,7 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1
     - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED'; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
     - make -j4
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

From 573b8c602072f8daa7596969f19dda69252b41aa Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Sat, 6 Aug 2016 01:26:24 +0100
Subject: [PATCH 10/29] build system: -O3 is not overriden by env CXXFLAGS

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 89bb3568..cdcd9d58 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 ############### Checks for programs
 AC_LANG(C++)
-: ${CXXFLAGS="-O3"}
+CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
 AC_OPENMP
 AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"

From 90e70790f312bff33a397d78a5b133c7f3d7a136 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 15 Aug 2016 22:31:29 +0100
Subject: [PATCH 11/29] Feature for z-Mobius prep

---
 configure.ac                                  |  2 +-
 lib/qcd/action/Actions.h                      | 12 ++-
 lib/qcd/action/fermion/CayleyFermion5D.cc     | 58 ++++++++------
 lib/qcd/action/fermion/CayleyFermion5D.h      | 49 ++++++------
 .../action/fermion/CayleyFermion5Dcache.cc    | 14 ++--
 lib/qcd/action/fermion/CayleyFermion5Dssp.cc  | 12 +--
 lib/qcd/action/fermion/CayleyFermion5Dvec.cc  | 24 +++---
 lib/qcd/action/fermion/FermionOperatorImpl.h  | 25 +++++-
 lib/qcd/action/fermion/WilsonKernels.cc       | 23 +++---
 lib/qcd/action/fermion/WilsonKernels.h        |  4 +
 lib/qcd/action/fermion/WilsonKernelsAsm.cc    | 20 +++++
 lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 65 ++++++++++++++-
 lib/qcd/action/fermion/ZMobiusFermion.h       | 79 +++++++++++++++++++
 tests/debug/Test_cayley_even_odd_vec.cc       | 12 +++
 14 files changed, 312 insertions(+), 87 deletions(-)
 create mode 100644 lib/qcd/action/fermion/ZMobiusFermion.h

diff --git a/configure.ac b/configure.ac
index cdcd9d58..2e46a411 100644
--- a/configure.ac
+++ b/configure.ac
@@ -12,7 +12,7 @@ CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
 AC_OPENMP
 AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
-LT_INIT([disable-shared])
+LT_INIT
 
 ############### Checks for header files
 AC_CHECK_HEADERS(stdint.h)
diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h
index be08cdde..1ad3a584 100644
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -111,12 +111,16 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #define FermOp4dVecTemplateInstantiate(A) \
   template class A<WilsonImplF>;		\
   template class A<WilsonImplD>;		\
+  template class A<ZWilsonImplF>;		\
+  template class A<ZWilsonImplD>;		\
   template class A<GparityWilsonImplF>;		\
   template class A<GparityWilsonImplD>;		
 
 #define FermOp5dVecTemplateInstantiate(A) \
   template class A<DomainWallVec5dImplF>;	\
-  template class A<DomainWallVec5dImplD>;	
+  template class A<DomainWallVec5dImplD>;	\
+  template class A<ZDomainWallVec5dImplF>;	\
+  template class A<ZDomainWallVec5dImplD>;	
 
 #define FermOpTemplateInstantiate(A) \
  FermOp4dVecTemplateInstantiate(A) \
@@ -138,6 +142,7 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
 #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
@@ -176,6 +181,11 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
+
+typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
+typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
+typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
+
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc
index 066cede4..c4196043 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -54,18 +54,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<RealD> diag (Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0); upper[Ls-1]=mass;
-  std::vector<RealD> lower(Ls,-1.0); lower[0]   =mass;
+  std::vector<Coeff_t> diag (Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
   M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
   int Ls=this->Ls;
-  std::vector<RealD> diag = bs;
-  std::vector<RealD> upper= cs;
-  std::vector<RealD> lower= cs; 
+  std::vector<Coeff_t> diag = bs;
+  std::vector<Coeff_t> upper= cs;
+  std::vector<Coeff_t> lower= cs; 
   upper[Ls-1]=-mass*upper[Ls-1];
   lower[0]   =-mass*lower[0];
   M5D(psi,psi,Din,lower,diag,upper);
@@ -73,9 +73,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<RealD> diag = beo;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = beo;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
   for(int i=0;i<Ls;i++) {
     upper[i]=-ceo[i];
     lower[i]=-ceo[i];
@@ -88,9 +88,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
   for(int i=0;i<Ls;i++) {
     upper[i]=-cee[i];
     lower[i]=-cee[i];
@@ -104,9 +104,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
 
   for (int s=0;s<Ls;s++){
     // Assemble the 5d matrix
@@ -129,9 +129,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<RealD> diag(Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0);
-  std::vector<RealD> lower(Ls,-1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
   upper[Ls-1]=-mass*upper[Ls-1];
   lower[0]   =-mass*lower[0];
   M5Ddag(psi,chi,chi,lower,diag,upper);
@@ -141,9 +141,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
   int Ls=this->Ls;
-  std::vector<RealD> diag =bs;
-  std::vector<RealD> upper=cs;
-  std::vector<RealD> lower=cs;
+  std::vector<Coeff_t> diag =bs;
+  std::vector<Coeff_t> upper=cs;
+  std::vector<Coeff_t> lower=cs;
   upper[Ls-1]=-mass*upper[Ls-1];
   lower[0]   =-mass*lower[0];
   M5Ddag(psi,psi,Din,lower,diag,upper);
@@ -273,11 +273,21 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  SetCoefficientsZolotarev(1.0,zdata,b,c);
+  std::vector<Coeff_t> gamma(this->Ls);
+  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
+  SetCoefficientsInternal(1.0,gamma,b,c);
 }
 //Zolo
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  std::vector<Coeff_t> gamma(this->Ls);
+  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
+  SetCoefficientsInternal(zolo_hi,gamma,b,c);
+}
+//Zolo
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
   int Ls=this->Ls;
 
@@ -315,7 +325,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
   double bmc = b-c;
   for(int i=0; i < Ls; i++){
     as[i] = 1.0;
-    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
     bs[i] = 0.5*(bpc/omega[i] + bmc);
     cs[i] = 0.5*(bpc/omega[i] - bmc);
   }
@@ -377,7 +387,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
   }
 	
   { 
-    double delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
     for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
     dee[Ls-1] += delta_d;
   }  
diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h
index a5af4e73..53a93a05 100644
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -62,16 +62,16 @@ namespace Grid {
       void M5D(const FermionField &psi,
 	       const FermionField &phi, 
 	       FermionField &chi,
-	       std::vector<RealD> &lower,
-	       std::vector<RealD> &diag,
-	       std::vector<RealD> &upper);
+	       std::vector<Coeff_t> &lower,
+	       std::vector<Coeff_t> &diag,
+	       std::vector<Coeff_t> &upper);
 
       void M5Ddag(const FermionField &psi,
 		  const FermionField &phi, 
 		  FermionField &chi,
-		  std::vector<RealD> &lower,
-		  std::vector<RealD> &diag,
-		  std::vector<RealD> &upper);
+		  std::vector<Coeff_t> &lower,
+		  std::vector<Coeff_t> &diag,
+		  std::vector<Coeff_t> &upper);
       void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
 
       virtual void   Instantiatable(void)=0;
@@ -91,23 +91,23 @@ namespace Grid {
       RealD mass;
 
       // Cayley form Moebius (tanh and zolotarev)
-      std::vector<RealD> omega; 
-      std::vector<RealD> bs;    // S dependent coeffs
-      std::vector<RealD> cs;    
-      std::vector<RealD> as;    
+      std::vector<Coeff_t> omega; 
+      std::vector<Coeff_t> bs;    // S dependent coeffs
+      std::vector<Coeff_t> cs;    
+      std::vector<Coeff_t> as;    
       // For preconditioning Cayley form
-      std::vector<RealD> bee;    
-      std::vector<RealD> cee;    
-      std::vector<RealD> aee;    
-      std::vector<RealD> beo;    
-      std::vector<RealD> ceo;    
-      std::vector<RealD> aeo;    
+      std::vector<Coeff_t> bee;    
+      std::vector<Coeff_t> cee;    
+      std::vector<Coeff_t> aee;    
+      std::vector<Coeff_t> beo;    
+      std::vector<Coeff_t> ceo;    
+      std::vector<Coeff_t> aeo;    
       // LDU factorisation of the eeoo matrix
-      std::vector<RealD> lee;    
-      std::vector<RealD> leem;    
-      std::vector<RealD> uee;    
-      std::vector<RealD> ueem;    
-      std::vector<RealD> dee;    
+      std::vector<Coeff_t> lee;    
+      std::vector<Coeff_t> leem;    
+      std::vector<Coeff_t> uee;    
+      std::vector<Coeff_t> ueem;    
+      std::vector<Coeff_t> dee;    
 
       // Constructors
       CayleyFermion5D(GaugeField &_Umu,
@@ -117,20 +117,19 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
 
-
-
     protected:
       void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
       void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
+      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
     };
 
   }
 }
 #define INSTANTIATE_DPERP(A)\
 template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
 template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
 
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
index 4791e7a2..62e91dd4 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
   int Ls =this->Ls;
   GridBase *grid=psi._grid;
@@ -82,9 +82,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
   int Ls =this->Ls;
   GridBase *grid=psi._grid;
@@ -204,6 +204,8 @@ PARALLEL_FOR_LOOP
   INSTANTIATE_DPERP(WilsonImplD);
   INSTANTIATE_DPERP(GparityWilsonImplF);
   INSTANTIATE_DPERP(GparityWilsonImplD);
+  INSTANTIATE_DPERP(ZWilsonImplF);
+  INSTANTIATE_DPERP(ZWilsonImplD);
 #endif
 
 }}
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
index a2c96d87..ad7daddb 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
   int Ls=this->Ls;
   for(int s=0;s<Ls;s++){
@@ -65,9 +65,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
   int Ls=this->Ls;
   for(int s=0;s<Ls;s++){
diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
index 26c0999e..f6569923 100644
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -53,9 +53,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
   GridBase *grid=psi._grid;
   int Ls   = this->Ls;
@@ -121,9 +121,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
   GridBase *grid=psi._grid;
   int Ls   = this->Ls;
@@ -194,8 +194,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 
   chi.checkerboard=psi.checkerboard;
   
-  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
   
   for(int s=0;s<Ls;s++){
     Pplus(s,s) = bee[s];
@@ -212,8 +212,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
   Pplus (0,Ls-1) = mass*cee[0];
   Pminus(Ls-1,0) = mass*cee[Ls-1];
   
-  Eigen::MatrixXd PplusMat ;
-  Eigen::MatrixXd PminusMat;
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
   
   if ( inv ) {
     PplusMat =Pplus.inverse();
@@ -298,8 +298,12 @@ PARALLEL_FOR_LOOP
 
 INSTANTIATE_DPERP(DomainWallVec5dImplD);
 INSTANTIATE_DPERP(DomainWallVec5dImplF);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
 
 template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 
 }}
diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 4126d185..b756936b 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -100,7 +100,8 @@ namespace Grid {
     typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
     typedef typename Impl::Compressor               Compressor;		\
     typedef typename Impl::StencilImpl             StencilImpl;	  \
-    typedef typename Impl::ImplParams ImplParams;
+    typedef typename Impl::ImplParams ImplParams; \
+    typedef typename Impl::Coeff_t       Coeff_t;
 
 #define INHERIT_IMPL_TYPES(Base) \
     INHERIT_GIMPL_TYPES(Base)\
@@ -109,12 +110,14 @@ namespace Grid {
     ///////
     // Single flavour four spinors with colour index
     ///////
-    template<class S,int Nrepresentation=Nc>
+    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
     class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S, Nrepresentation> > { 
     public:
 
+
       const bool LsVectorised=false;
 
+      typedef _Coeff_t Coeff_t;
       typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
 
       INHERIT_GIMPL_TYPES(Gimpl);
@@ -192,12 +195,13 @@ PARALLEL_FOR_LOOP
     ///////
     // Single flavour four spinors with colour index, 5d redblack
     ///////
-    template<class S,int Nrepresentation=Nc>
+    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
     class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
     public:
     
       const bool LsVectorised=true;
 
+      typedef _Coeff_t Coeff_t;
       typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
 
       INHERIT_GIMPL_TYPES(Gimpl);
@@ -287,12 +291,13 @@ PARALLEL_FOR_LOOP
     // Flavour doubled spinors; is Gparity the only? what about C*?
     ////////////////////////////////////////////////////////////////////////////////////////
 
-    template<class S,int Nrepresentation>
+    template<class S,int Nrepresentation,class _Coeff_t = RealD>
     class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
     public:
 
       const bool LsVectorised=false;
 
+      typedef _Coeff_t Coeff_t;
       typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
 
       INHERIT_GIMPL_TYPES(Gimpl);
@@ -483,6 +488,18 @@ PARALLEL_FOR_LOOP
     typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
     typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
 
+    typedef WilsonImpl<vComplex ,Nc,ComplexD>  ZWilsonImplR; // Real.. whichever prec
+    typedef WilsonImpl<vComplexF,Nc,ComplexD> ZWilsonImplF; // Float
+    typedef WilsonImpl<vComplexD,Nc,ComplexD> ZWilsonImplD; // Double
+
+    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
+
+    typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD>  ZDomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
+
     typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
     typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
     typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc
index d644f6ef..a12a2b16 100644
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -68,16 +68,21 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
-  // No asm implementation yet.
-  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-  //  else
-  for(int site=0;site<Ns;site++) {
-    for(int s=0;s<Ls;s++) {
-      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-      sF++;
+#ifdef AVX512
+  if ( AsmOpt ) {
+    WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+  } else {
+#else
+  {  
+#endif
+    for(int site=0;site<Ns;site++) {
+      for(int s=0;s<Ls;s++) {
+	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+	else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+	sF++;
+      }
+      sU++;
     }
-    sU++;
   }
 }
 
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index 231fa293..b679d3f9 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -79,6 +79,10 @@ namespace Grid {
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 
+     void DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
+
 
      void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index 83871b7b..b443ccf9 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -73,12 +73,21 @@ static int signInit = setupSigns();
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
+
+#undef KERNEL_DAG
 template<>
 void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 
+#define KERNEL_DAG
+template<>
+void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@@ -89,14 +98,25 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+
+#undef KERNEL_DAG
 template<>
 void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 
+#define KERNEL_DAG
+template<>
+void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+
 #endif
 
+
+
 template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
index 4f3ef861..d236a774 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -30,7 +30,11 @@
   basep = st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);
+#ifdef KERNEL_DAG
+    XP_PROJMEM(base);
+#else 
     XM_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR3,perm);
   } else { 
     LOAD_CHI(base);
@@ -41,15 +45,22 @@
     MULT_2SPIN_DIR_PFXP(Xp,basep);
   }
   LOAD64(%r10,isigns);
+#ifdef KERNEL_DAG
+  XP_RECON;
+#else
   XM_RECON;
-
+#endif
   ////////////////////////////////
   // Yp
   ////////////////////////////////
   basep = st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    YP_PROJMEM(base);
+#else
     YM_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR2,perm);
   } else { 
     LOAD_CHI(base);
@@ -60,7 +71,11 @@
     MULT_2SPIN_DIR_PFYP(Yp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  YP_RECON_ACCUM;
+#else
   YM_RECON_ACCUM;
+#endif
 
   ////////////////////////////////
   // Zp
@@ -68,7 +83,11 @@
   basep = st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    ZP_PROJMEM(base);
+#else
     ZM_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR1,perm);
   } else { 
     LOAD_CHI(base);
@@ -79,7 +98,11 @@
     MULT_2SPIN_DIR_PFZP(Zp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  ZP_RECON_ACCUM;
+#else
   ZM_RECON_ACCUM;
+#endif
 
   ////////////////////////////////
   // Tp
@@ -87,7 +110,11 @@
   basep = st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    TP_PROJMEM(base);
+#else
     TM_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR0,perm);
   } else { 
     LOAD_CHI(base);
@@ -98,7 +125,11 @@
     MULT_2SPIN_DIR_PFTP(Tp,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  TP_RECON_ACCUM;
+#else
   TM_RECON_ACCUM;
+#endif
 
   ////////////////////////////////
   // Xm
@@ -107,7 +138,11 @@
   //  basep= st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    XM_PROJMEM(base);
+#else
     XP_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR3,perm);
   } else { 
     LOAD_CHI(base);
@@ -118,7 +153,11 @@
     MULT_2SPIN_DIR_PFXM(Xm,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  XM_RECON_ACCUM;
+#else
   XP_RECON_ACCUM;
+#endif
 
   ////////////////////////////////
   // Ym
@@ -126,7 +165,11 @@
   basep= st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    YM_PROJMEM(base);
+#else
     YP_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR2,perm);
   } else { 
     LOAD_CHI(base);
@@ -137,7 +180,11 @@
     MULT_2SPIN_DIR_PFYM(Ym,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  YM_RECON_ACCUM;
+#else
   YP_RECON_ACCUM;
+#endif
 
   ////////////////////////////////
   // Zm
@@ -145,7 +192,11 @@
   basep= st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    ZM_PROJMEM(base);
+#else
     ZP_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR1,perm);
   } else { 
     LOAD_CHI(base);
@@ -156,7 +207,11 @@
     MULT_2SPIN_DIR_PFZM(Zm,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  ZM_RECON_ACCUM;
+#else
   ZP_RECON_ACCUM;
+#endif
 
   ////////////////////////////////
   // Tm
@@ -164,7 +219,11 @@
   basep= st.GetPFInfo(nent,plocal); nent++;
   if ( local ) {
     LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    TM_PROJMEM(base);
+#else
     TP_PROJMEM(base);
+#endif
     MAYBEPERM(PERMUTE_DIR0,perm);
   } else { 
     LOAD_CHI(base);
@@ -175,7 +234,11 @@
     MULT_2SPIN_DIR_PFTM(Tm,basep);
   }
   LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  TM_RECON_ACCUM;
+#else
   TP_RECON_ACCUM;
+#endif
 
   basep= st.GetPFInfo(nent,plocal); nent++;
   SAVE_RESULT(base,basep);
diff --git a/lib/qcd/action/fermion/ZMobiusFermion.h b/lib/qcd/action/fermion/ZMobiusFermion.h
new file mode 100644
index 00000000..d0e00657
--- /dev/null
+++ b/lib/qcd/action/fermion/ZMobiusFermion.h
@@ -0,0 +1,79 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/MobiusFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_ZMOBIUS_FERMION_H
+#define  GRID_QCD_ZMOBIUS_FERMION_H
+
+#include <Grid/Grid.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    template<class Impl>
+    class ZMobiusFermion : public CayleyFermion5D<Impl>
+    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+    public:
+
+      virtual void   Instantiatable(void) {};
+      // Constructors
+      ZMobiusFermion(GaugeField &_Umu,
+		     GridCartesian         &FiveDimGrid,
+		     GridRedBlackCartesian &FiveDimRedBlackGrid,
+		     GridCartesian         &FourDimGrid,
+		     GridRedBlackCartesian &FourDimRedBlackGrid,
+		     RealD _mass,RealD _M5,
+		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
+      
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)
+
+      {
+	RealD eps = 1.0;
+	
+	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+	std::vector<Coeff_t> zgamma(this->Ls);
+	for(int s=0;s<this->Ls;s++){
+	  zgamma[s] = gamma[s];
+	}
+
+	// Call base setter
+	this->SetCoefficientsInternal(1.0,zgamma,b,c);
+      }
+
+    };
+
+  }
+}
+
+#endif
diff --git a/tests/debug/Test_cayley_even_odd_vec.cc b/tests/debug/Test_cayley_even_odd_vec.cc
index 68315b88..f8600782 100644
--- a/tests/debug/Test_cayley_even_odd_vec.cc
+++ b/tests/debug/Test_cayley_even_odd_vec.cc
@@ -44,6 +44,7 @@ struct scal {
   };
 
 typedef DomainWallFermion<DomainWallVec5dImplR>                      DomainWallVecFermionR;
+typedef ZMobiusFermion<ZDomainWallVec5dImplR>                        ZMobiusVecFermionR;
 typedef MobiusFermion<DomainWallVec5dImplR>                          MobiusVecFermionR;
 typedef MobiusZolotarevFermion<DomainWallVec5dImplR>                 MobiusZolotarevVecFermionR;
 typedef ScaledShamirFermion<DomainWallVec5dImplR>                    ScaledShamirVecFermionR;
@@ -117,6 +118,17 @@ int main (int argc, char ** argv)
   TestWhat<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);
   TestWhat<MobiusVecFermionR>(sDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5);
 
+
+  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
+  std::cout<<GridLogMessage <<"Z-MobiusFermion test"<<std::endl;
+  std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
+  std::vector<ComplexD> gamma(Ls,std::complex<double>(1.0,0.0));
+  ZMobiusFermionR     zDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,gamma,b,c);
+  ZMobiusVecFermionR szDmob(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c);
+  TestMoo(zDmob,szDmob);
+  TestWhat<ZMobiusFermionR>(zDmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);
+  TestWhat<ZMobiusVecFermionR>(szDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5);
+
   std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
   std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
   std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;

From 4ab7dbfd57e5aa79f7f8079ff44f6ceedf731b20 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 15 Aug 2016 23:00:40 +0100
Subject: [PATCH 12/29] Instantiate

---
 lib/qcd/action/fermion/FermionOperatorImpl.h |  4 +-
 lib/qcd/action/fermion/WilsonKernelsHand.cc  | 55 ++++++--------------
 2 files changed, 18 insertions(+), 41 deletions(-)

diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index b756936b..6ca72b93 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -488,7 +488,7 @@ PARALLEL_FOR_LOOP
     typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
     typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
 
-    typedef WilsonImpl<vComplex ,Nc,ComplexD>  ZWilsonImplR; // Real.. whichever prec
+    typedef WilsonImpl<vComplex ,Nc,ComplexD> ZWilsonImplR; // Real.. whichever prec
     typedef WilsonImpl<vComplexF,Nc,ComplexD> ZWilsonImplF; // Float
     typedef WilsonImpl<vComplexD,Nc,ComplexD> ZWilsonImplD; // Double
 
@@ -496,7 +496,7 @@ PARALLEL_FOR_LOOP
     typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
     typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
 
-    typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD>  ZDomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
     typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
     typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
 
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index d073e677..11b22209 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -839,46 +839,23 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //
 
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								  int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+							       int ss,int sU,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
 								  int ss,int sU,const FermionField &in, FermionField &out);
 
-
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-
-
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-
+INSTANTIATE_THEM(WilsonImplF);
+INSTANTIATE_THEM(WilsonImplD);
+INSTANTIATE_THEM(ZWilsonImplF);
+INSTANTIATE_THEM(ZWilsonImplD);
+INSTANTIATE_THEM(GparityWilsonImplF);
+INSTANTIATE_THEM(GparityWilsonImplD);
+INSTANTIATE_THEM(DomainWallVec5dImplF);
+INSTANTIATE_THEM(DomainWallVec5dImplD);
+INSTANTIATE_THEM(ZDomainWallVec5dImplF);
+INSTANTIATE_THEM(ZDomainWallVec5dImplD);
 
 }}

From 17097a93ece0df40fbc09877245ac6298d198f92 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 17 Aug 2016 01:33:55 +0100
Subject: [PATCH 13/29] FFTW test ran over 4 mpi processes.

---
 lib/FFT.h                      | 198 +++++++++++++++++++++++++++++++++
 lib/Grid.h                     |   4 +-
 lib/lattice/Lattice_transfer.h |  75 ++++++++++++-
 lib/simd/Grid_vector_types.h   |   6 +
 lib/simd/Grid_vector_unops.h   |  10 +-
 lib/tensors/Tensor_unary.h     |   1 +
 tests/core/Test_fft.cc         |  87 +++++++++++++++
 7 files changed, 372 insertions(+), 9 deletions(-)
 create mode 100644 lib/FFT.h
 create mode 100644 tests/core/Test_fft.cc

diff --git a/lib/FFT.h b/lib/FFT.h
new file mode 100644
index 00000000..262e525b
--- /dev/null
+++ b/lib/FFT.h
@@ -0,0 +1,198 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Cshift.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#include <Grid/fftw/fftw3.h>
+
+namespace Grid {
+
+  
+  class FFT { 
+  private:
+
+    GridCartesian *vgrid;
+    GridCartesian *sgrid;
+
+    int Nd;
+    std::vector<int> dimensions;
+    std::vector<int> processors;
+    std::vector<int> processor_coor;
+
+  public:
+
+    static const int forward=FFTW_FORWARD;
+    static const int backward=FFTW_BACKWARD;
+
+    FFT ( GridCartesian * grid ) : 
+      vgrid(grid),
+      Nd(grid->_ndimension),
+      dimensions(grid->_fdimensions),
+      processors(grid->_processors),
+      processor_coor(grid->_processor_coor)
+    {
+      std::vector<int> layout(Nd,1);
+      sgrid = new GridCartesian(dimensions,layout,processors);
+    };
+
+    ~FFT ( void)  { 
+      delete sgrid; 
+    }
+    
+    template<class vobj>
+    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
+
+      conformable(result._grid,vgrid);
+      conformable(source._grid,vgrid);
+
+      int L = vgrid->_ldimensions[dim];
+      int G = vgrid->_fdimensions[dim];
+
+      std::vector<int> layout(Nd,1);
+      std::vector<int> pencil_gd(vgrid->_fdimensions);
+      std::vector<int> pencil_ld(processors);
+
+      pencil_gd[dim] = G*processors[dim];    
+      pencil_ld[dim] = G*processors[dim];    
+
+      // Pencil global vol LxLxGxLxL per node
+      GridCartesian pencil_g(pencil_gd,layout,processors);
+      GridCartesian pencil_l(pencil_ld,layout,processors);
+
+      // Construct pencils
+      typedef typename vobj::scalar_object sobj;
+      Lattice<vobj> ssource(vgrid); ssource =source;
+      Lattice<sobj> pgsource(&pencil_g);
+      Lattice<sobj> pgresult(&pencil_g);
+      Lattice<sobj> plsource(&pencil_l);
+      Lattice<sobj> plresult(&pencil_l);
+
+      {
+
+	assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD));
+	assert(sizeof(fftw_complex)==sizeof(ComplexD));
+	assert(sizeof(fftw_complex)==sizeof(ComplexD));
+
+	int Ncomp = sizeof(sobj)/sizeof(fftw_complex);
+
+	std::cout << "Ncomp = "<<Ncomp<<std::endl;
+
+	int rank = 1;   /* not 2: we are computing 1d transforms */
+	int n[] = {G}; /* 1d transforms of length G */
+	int howmany = Ncomp;
+	int odist,idist,istride,ostride;
+	idist   = odist   = 1;
+	istride = ostride = Ncomp; /* distance between two elements in the same column */
+	int *inembed = n, *onembed = n;
+	
+	fftw_complex *in = (fftw_complex *)&plsource._odata[0];
+	fftw_complex *out= (fftw_complex *)&plresult._odata[0];
+	
+	int sign = FFTW_FORWARD;
+	if (inverse) sign = FFTW_BACKWARD;
+
+	fftw_plan p = fftw_plan_many_dft(rank,n,howmany,
+					 in,inembed,
+					 istride,idist,
+					 out,onembed,
+					 ostride, odist,
+					 sign,FFTW_ESTIMATE);
+
+
+	// Barrel shift and collect global pencil
+	for(int p=0;p<processors[dim];p++) { 
+	  
+	      
+	  for(int idx=0;idx<sgrid->lSites();idx++) { 
+
+	    std::vector<int> lcoor(Nd);
+    	    sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+	    sobj s;
+
+	    peekLocalSite(s,ssource,lcoor);
+
+	    lcoor[dim]+=p*L;
+	   
+	    pokeLocalSite(s,pgsource,lcoor);
+	  }
+
+	  ssource = Cshift(ssource,dim,L);
+	}
+
+	std::cout << " pgsource pencil " << pgsource<<std::endl ;
+	
+	// Loop over orthog coords
+	for(int idx=0;idx<sgrid->lSites();idx++) { 
+
+	  std::vector<int> pcoor(Nd,0);
+	  std::vector<int> lcoor(Nd);
+	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
+	  
+	    // Project to local pencil array
+	    for(int l=0;l<G;l++){
+	      sobj s;
+	      pcoor[dim]=l;
+	      lcoor[dim]=l;
+	      peekLocalSite(s,pgsource,lcoor);
+	      pokeLocalSite(s,plsource,pcoor);
+	    }
+
+
+	    if ( idx==0) {
+	      std::cout << " plsource pencil " << pgsource<<std::endl ;
+	    }
+
+	    // FFT the pencil
+	    fftw_execute(p);
+
+	    // Extract the result
+	    for(int l=0;l<L;l++){
+	      sobj s;
+	      int p = processor_coor[dim];
+	      lcoor[dim] = l;
+	      pcoor[dim] = l+L*p;
+	      peekLocalSite(s,plresult,pcoor);
+	      pokeLocalSite(s,result,lcoor);
+	    }
+
+	  }
+	}
+	  
+	fftw_destroy_plan(p);
+      }
+    }
+
+  };
+
+
+}
+
+#endif
diff --git a/lib/Grid.h b/lib/Grid.h
index 9de8470d..486ee4d3 100644
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -68,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Simd.h>
 #include <Grid/Threads.h>
 #include <Grid/Lexicographic.h>
+#include <Grid/Init.h>
 #include <Grid/Communicator.h> 
 #include <Grid/Cartesian.h>    
 #include <Grid/Tensors.h>      
@@ -78,7 +79,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/parallelIO/NerscIO.h>
-#include <Grid/Init.h>
+
+#include <Grid/FFT.h>
 
 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h
index 2fa72014..cc4617de 100644
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
     assert(ig->_ldimensions[d] == og->_ldimensions[d]);
   }
 
-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
   for(int idx=0;idx<ig->lSites();idx++){
     std::vector<int> lcoor(ni);
     ig->LocalIndexToLocalCoor(idx,lcoor);
@@ -446,6 +446,79 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
 
 }
 
+
+template<class vobj>
+void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) { 
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,lowDim,lcoor);
+      pokeLocalSite(s,higherDim,hcoor);
+    }
+  }
+}
+
+
+template<class vobj>
+void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) { 
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,higherDim,hcoor);
+      pokeLocalSite(s,lowDim,lcoor);
+    }
+  }
+}
+
+
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 2f2b70c8..30576fb6 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -388,6 +388,12 @@ class Grid_simd {
 
 };  // end of Grid_simd class definition
 
+
+inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
+inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
+inline void permute(RealD &y,RealD b, int perm) {  y=b; }
+inline void permute(RealF &y,RealF b, int perm) {  y=b; }
+
 ////////////////////////////////////////////////////////////////////
 // General rotate
 ////////////////////////////////////////////////////////////////////
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index a67f4e7d..2afac190 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,15 +67,13 @@ template <class scalar>
 struct AsinRealFunctor {
   scalar operator()(const scalar &a) const { return asin(real(a)); }
 };
-
 template <class scalar>
 struct LogRealFunctor {
   scalar operator()(const scalar &a) const { return log(real(a)); }
 };
-
 template <class scalar>
-struct ExpRealFunctor {
-  scalar operator()(const scalar &a) const { return exp(real(a)); }
+struct ExpFunctor {
+  scalar operator()(const scalar &a) const { return exp(a); }
 };
 template <class scalar>
 struct NotFunctor {
@@ -85,7 +83,6 @@ template <class scalar>
 struct AbsRealFunctor {
   scalar operator()(const scalar &a) const { return std::abs(real(a)); }
 };
-
 template <class scalar>
 struct PowRealFunctor {
   double y;
@@ -135,7 +132,6 @@ template <class Scalar>
 inline Scalar rsqrt(const Scalar &r) {
   return (RSqrtRealFunctor<Scalar>(), r);
 }
-
 template <class S, class V>
 inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
   return SimdApply(CosRealFunctor<S>(), r);
@@ -162,7 +158,7 @@ inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
 }
 template <class S, class V>
 inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
-  return SimdApply(ExpRealFunctor<S>(), r);
+  return SimdApply(ExpFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {
diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h
index dd05a4a7..92a968df 100644
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -36,6 +36,7 @@ template<class obj> inline auto func(const iScalar<obj> &z) -> iScalar<obj>\
 {\
     iScalar<obj> ret;\
     ret._internal = func( (z._internal));\
+    std::cout << "Unary "<<#func<<" " << z._internal <<" -> "<< ret._internal <<" "<< typeid(obj).name() <<std::endl; \
     return ret;\
 }\
 template<class obj,int N> inline auto func(const iVector<obj,N> &z) -> iVector<obj,N>\
diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc
new file mode 100644
index 00000000..deed6f0a
--- /dev/null
+++ b/tests/core/Test_fft.cc
@@ -0,0 +1,87 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_cshift.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
+
+  LatticeComplexD     one(&Fine);
+  LatticeComplexD      zz(&Fine);
+  LatticeComplexD       C(&Fine);
+  LatticeComplexD  Ctilde(&Fine);
+  LatticeComplexD    coor(&Fine);
+  
+  std::vector<RealD> p({1.0,2.0,3.0,2.0});
+
+  one = ComplexD(1.0,0.0);
+  zz  = ComplexD(0.0,0.0);
+
+  ComplexD ci(0.0,1.0);
+
+  C=zero;
+  for(int mu=0;mu<4;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    LatticeCoordinate(coor,mu);
+    C = C - TwoPiL * p[mu] * coor;
+  }
+
+  std::cout << GridLogMessage<< " C " << C<<std::endl;
+
+  C = C*ci;
+  std::cout << GridLogMessage<< " C " << C<<std::endl;
+
+  C = exp(C);
+  std::cout << GridLogMessage<< " C " << C<<std::endl;
+
+  FFT theFFT(&Fine);
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);
+  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
+
+  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);
+  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
+  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);
+  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
+  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);
+  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
+
+  Grid_finalize();
+}

From e423a09974a05ff94f0dc95a175ee7b73b700148 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Thu, 18 Aug 2016 02:23:21 +0100
Subject: [PATCH 14/29] FFT improved and test_FFT passing under MPI 8
 processes, 8^4 for LatticeComplexD and LatticeSpinMatrixD

---
 lib/FFT.h                  | 18 ++----------
 lib/tensors/Tensor_unary.h |  1 -
 tests/core/Test_fft.cc     | 57 ++++++++++++++++++++++++++------------
 3 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index 262e525b..f9470472 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -93,23 +93,20 @@ namespace Grid {
       Lattice<sobj> plresult(&pencil_l);
 
       {
-
 	assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD));
 	assert(sizeof(fftw_complex)==sizeof(ComplexD));
 	assert(sizeof(fftw_complex)==sizeof(ComplexD));
 
 	int Ncomp = sizeof(sobj)/sizeof(fftw_complex);
 
-	std::cout << "Ncomp = "<<Ncomp<<std::endl;
-
-	int rank = 1;   /* not 2: we are computing 1d transforms */
+	int rank = 1;  /* not 2: we are computing 1d transforms */
 	int n[] = {G}; /* 1d transforms of length G */
 	int howmany = Ncomp;
 	int odist,idist,istride,ostride;
 	idist   = odist   = 1;
 	istride = ostride = Ncomp; /* distance between two elements in the same column */
 	int *inembed = n, *onembed = n;
-	
+
 	fftw_complex *in = (fftw_complex *)&plsource._odata[0];
 	fftw_complex *out= (fftw_complex *)&plresult._odata[0];
 	
@@ -123,11 +120,9 @@ namespace Grid {
 					 ostride, odist,
 					 sign,FFTW_ESTIMATE);
 
-
 	// Barrel shift and collect global pencil
 	for(int p=0;p<processors[dim];p++) { 
-	  
-	      
+
 	  for(int idx=0;idx<sgrid->lSites();idx++) { 
 
 	    std::vector<int> lcoor(Nd);
@@ -144,8 +139,6 @@ namespace Grid {
 
 	  ssource = Cshift(ssource,dim,L);
 	}
-
-	std::cout << " pgsource pencil " << pgsource<<std::endl ;
 	
 	// Loop over orthog coords
 	for(int idx=0;idx<sgrid->lSites();idx++) { 
@@ -165,11 +158,6 @@ namespace Grid {
 	      pokeLocalSite(s,plsource,pcoor);
 	    }
 
-
-	    if ( idx==0) {
-	      std::cout << " plsource pencil " << pgsource<<std::endl ;
-	    }
-
 	    // FFT the pencil
 	    fftw_execute(p);
 
diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h
index 92a968df..dd05a4a7 100644
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -36,7 +36,6 @@ template<class obj> inline auto func(const iScalar<obj> &z) -> iScalar<obj>\
 {\
     iScalar<obj> ret;\
     ret._internal = func( (z._internal));\
-    std::cout << "Unary "<<#func<<" " << z._internal <<" -> "<< ret._internal <<" "<< typeid(obj).name() <<std::endl; \
     return ret;\
 }\
 template<class obj,int N> inline auto func(const iVector<obj,N> &z) -> iVector<obj,N>\
diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc
index deed6f0a..ed57800c 100644
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -39,6 +39,10 @@ int main (int argc, char ** argv)
   std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
   std::vector<int> mpi_layout  = GridDefaultMpi();
 
+  int vol = 1;
+  for(int d=0;d<latt_size.size();d++){
+    vol = vol * latt_size[d];
+  }
   GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
 
   LatticeComplexD     one(&Fine);
@@ -46,8 +50,11 @@ int main (int argc, char ** argv)
   LatticeComplexD       C(&Fine);
   LatticeComplexD  Ctilde(&Fine);
   LatticeComplexD    coor(&Fine);
+
+  LatticeSpinMatrixD    S(&Fine);
+  LatticeSpinMatrixD    Stilde(&Fine);
   
-  std::vector<RealD> p({1.0,2.0,3.0,2.0});
+  std::vector<int> p({1,2,3,2});
 
   one = ComplexD(1.0,0.0);
   zz  = ComplexD(0.0,0.0);
@@ -58,30 +65,44 @@ int main (int argc, char ** argv)
   for(int mu=0;mu<4;mu++){
     RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
     LatticeCoordinate(coor,mu);
-    C = C - TwoPiL * p[mu] * coor;
+    C = C - (TwoPiL * p[mu]) * coor;
   }
 
-  std::cout << GridLogMessage<< " C " << C<<std::endl;
+  C = exp(C*ci);
 
-  C = C*ci;
-  std::cout << GridLogMessage<< " C " << C<<std::endl;
-
-  C = exp(C);
-  std::cout << GridLogMessage<< " C " << C<<std::endl;
+  S=zero;
+  S = S+C;
 
   FFT theFFT(&Fine);
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);
-  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
 
-  C=Ctilde;
-  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);
-  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
-  C=Ctilde;
-  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);
-  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
-  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde;
   theFFT.FFT_dim(Ctilde,C,3,FFT::forward);
-  std::cout << GridLogMessage<< "FT[C] " << Ctilde<<std::endl;
+
+  //  C=zero;
+  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
+  TComplexD cVol;
+  cVol()()() = vol;
+
+  C=zero;
+  pokeSite(cVol,C,p);
+  C=C-Ctilde;
+  std::cout << "diff scalar "<<norm2(C) << std::endl;
+
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);
+
+  SpinMatrixD Sp; 
+  Sp = zero; Sp = Sp+cVol;
+
+  S=zero;
+  pokeSite(Sp,S,p);
+
+  S= S-Stilde;
+  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
 
   Grid_finalize();
 }

From 29c4ef41de1680ce03ec455521c37068e29ccc70 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 22 Aug 2016 16:21:01 +0100
Subject: [PATCH 15/29] Adding a test for libfftw3

---
 configure.ac | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 89bb3568..0ed501f9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -43,6 +43,7 @@ AC_ARG_WITH([mpfr],
 AC_ARG_ENABLE([lapack],
     [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
     [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+
 case ${ac_LAPACK} in
     no)
         ;;
@@ -54,6 +55,17 @@ case ${ac_LAPACK} in
         AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 esac
 
+AC_CHECK_LIB([fftw3],[fft_init_threads],
+	[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
+        [ac_fftw=no])
+case ${ac_fftw} in
+    no)
+        ;;
+    yes)
+        AM_LDFLAGS="-lfftw3 $AM_LDFLAGS"
+esac
+
+
 ################ Get compiler informations
 AC_LANG([C++])
 AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
@@ -77,7 +89,7 @@ AC_CHECK_LIB([gmp],[__gmpf_init],
                  [have_mpfr=true]
                  [LIBS="$LIBS -lmpfr"],
                  [AC_MSG_ERROR([MPFR library not found])])]
-             [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
+   	     [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
              [have_gmp=true]
              [LIBS="$LIBS -lgmp"],
              [AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])])

From 73ce476890b303d7f452dc33e91dc93f212167f7 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 22 Aug 2016 16:24:21 +0100
Subject: [PATCH 16/29] Include fftw headers

---
 lib/fftw/fftw3.h | 412 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 412 insertions(+)
 create mode 100644 lib/fftw/fftw3.h

diff --git a/lib/fftw/fftw3.h b/lib/fftw/fftw3.h
new file mode 100644
index 00000000..1bf34396
--- /dev/null
+++ b/lib/fftw/fftw3.h
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2003, 2007-14 Matteo Frigo
+ * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
+ *
+ * The following statement of license applies *only* to this header file,
+ * and *not* to the other files distributed with FFTW or derived therefrom:
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/***************************** NOTE TO USERS *********************************
+ *
+ *                 THIS IS A HEADER FILE, NOT A MANUAL
+ *
+ *    If you want to know how to use FFTW, please read the manual,
+ *    online at http://www.fftw.org/doc/ and also included with FFTW.
+ *    For a quick start, see the manual's tutorial section.
+ *
+ *   (Reading header files to learn how to use a library is a habit
+ *    stemming from code lacking a proper manual.  Arguably, it's a
+ *    *bad* habit in most cases, because header files can contain
+ *    interfaces that are not part of the public, stable API.)
+ *
+ ****************************************************************************/
+
+#ifndef FFTW3_H
+#define FFTW3_H
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+/* If <complex.h> is included, use the C99 complex type.  Otherwise
+   define a type bit-compatible with C99 complex */
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
+#else
+#  define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
+#endif
+
+#define FFTW_CONCAT(prefix, name) prefix ## name
+#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
+#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
+#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
+#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
+
+/* IMPORTANT: for Windows compilers, you should add a line
+        #define FFTW_DLL
+   here and in kernel/ifftw.h if you are compiling/using FFTW as a
+   DLL, in order to do the proper importing/exporting, or
+   alternatively compile with -DFFTW_DLL or the equivalent
+   command-line flag.  This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically. */
+#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
+   /* annoying Windows syntax for shared-library declarations */
+#  if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
+#    define FFTW_EXTERN extern __declspec(dllexport) 
+#  else /* user is calling FFTW; import symbol */
+#    define FFTW_EXTERN extern __declspec(dllimport) 
+#  endif
+#else
+#  define FFTW_EXTERN extern
+#endif
+
+enum fftw_r2r_kind_do_not_use_me {
+     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
+     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
+     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
+};
+
+struct fftw_iodim_do_not_use_me {
+     int n;                     /* dimension size */
+     int is;			/* input stride */
+     int os;			/* output stride */
+};
+
+#include <stddef.h> /* for ptrdiff_t */
+struct fftw_iodim64_do_not_use_me {
+     ptrdiff_t n;                     /* dimension size */
+     ptrdiff_t is;			/* input stride */
+     ptrdiff_t os;			/* output stride */
+};
+
+typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
+typedef int (*fftw_read_char_func_do_not_use_me)(void *);
+
+/*
+  huge second-order macro that defines prototypes for all API
+  functions.  We expand this macro for each supported precision
+ 
+  X: name-mangling macro
+  R: real data type
+  C: complex data type
+*/
+
+#define FFTW_DEFINE_API(X, R, C)					   \
+									   \
+FFTW_DEFINE_COMPLEX(R, C);						   \
+									   \
+typedef struct X(plan_s) *X(plan);					   \
+									   \
+typedef struct fftw_iodim_do_not_use_me X(iodim);			   \
+typedef struct fftw_iodim64_do_not_use_me X(iodim64);			   \
+									   \
+typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind);			   \
+									   \
+typedef fftw_write_char_func_do_not_use_me X(write_char_func);		   \
+typedef fftw_read_char_func_do_not_use_me X(read_char_func);		   \
+									   \
+FFTW_EXTERN void X(execute)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n,			   \
+		    C *in, C *out, int sign, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign,	   \
+		       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1,			   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2,		   \
+		       C *in, C *out, int sign, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         C *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         C *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         int sign, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims,	   \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
+			 int howmany_rank,				   \
+			 const X(iodim) *howmany_dims,			   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 C *in, C *out,					   \
+			 int sign, unsigned flags);			   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank,			   \
+                         const X(iodim64) *dims,			   \
+			 int howmany_rank,				   \
+			 const X(iodim64) *howmany_dims,		   \
+			 R *ri, R *ii, R *ro, R *io,			   \
+			 unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out);	   \
+FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii,	   \
+                                      R *ro, R *io);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n,	   \
+                             int howmany,				   \
+                             R *in, const int *inembed,			   \
+                             int istride, int idist,			   \
+                             C *out, const int *onembed,		   \
+                             int ostride, int odist,			   \
+                             unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n,		   \
+                        R *in, C *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1,			   \
+			   R *in, C *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   R *in, C *out, unsigned flags);		   \
+									   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n,	   \
+			     int howmany,				   \
+			     C *in, const int *inembed,			   \
+			     int istride, int idist,			   \
+			     R *out, const int *onembed,		   \
+			     int ostride, int odist,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n,		   \
+                        C *in, R *out, unsigned flags);			   \
+									   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1,			   \
+			   C *in, R *out, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1,			   \
+			   int n2,					   \
+			   C *in, R *out, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims,   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)(				   \
+                             int rank, const X(iodim) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, C *out,				   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank,			   \
+                             const X(iodim64) *dims,			   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     C *in, R *out,				   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *in, R *ro, R *io,			   \
+			     unsigned flags);				   \
+FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)(			   \
+                             int rank, const X(iodim64) *dims,		   \
+			     int howmany_rank,				   \
+			     const X(iodim64) *howmany_dims,		   \
+			     R *ri, R *ii, R *out,			   \
+			     unsigned flags);				   \
+									   \
+FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out);	   \
+FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p,		   \
+                                          R *in, R *ro, R *io);		   \
+FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p,		   \
+                                          R *ri, R *ii, R *out);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n,		   \
+                         int howmany,					   \
+                         R *in, const int *inembed,			   \
+                         int istride, int idist,			   \
+                         R *out, const int *onembed,			   \
+                         int ostride, int odist,			   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,	   \
+                    const X(r2r_kind) *kind, unsigned flags);		   \
+									   \
+FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out,		   \
+                       X(r2r_kind) kind, unsigned flags);		   \
+FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out,	   \
+                       X(r2r_kind) kind0, X(r2r_kind) kind1,		   \
+                       unsigned flags);					   \
+FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2,		   \
+                       R *in, R *out, X(r2r_kind) kind0,		   \
+                       X(r2r_kind) kind1, X(r2r_kind) kind2,		   \
+                       unsigned flags);					   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims,	   \
+                         int howmany_rank,				   \
+                         const X(iodim) *howmany_dims,			   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims,   \
+                         int howmany_rank,				   \
+                         const X(iodim64) *howmany_dims,		   \
+                         R *in, R *out,					   \
+                         const X(r2r_kind) *kind, unsigned flags);	   \
+									   \
+FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out);	   \
+									   \
+FFTW_EXTERN void X(destroy_plan)(X(plan) p);				   \
+FFTW_EXTERN void X(forget_wisdom)(void);				   \
+FFTW_EXTERN void X(cleanup)(void);					   \
+									   \
+FFTW_EXTERN void X(set_timelimit)(double t);				   \
+									   \
+FFTW_EXTERN void X(plan_with_nthreads)(int nthreads);			   \
+FFTW_EXTERN int X(init_threads)(void);					   \
+FFTW_EXTERN void X(cleanup_threads)(void);				   \
+									   \
+FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename);	   \
+FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file);		   \
+FFTW_EXTERN char *X(export_wisdom_to_string)(void);			   \
+FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char,   	   \
+                                  void *data);				   \
+FFTW_EXTERN int X(import_system_wisdom)(void);				   \
+FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename);	   \
+FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file);		   \
+FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string);	   \
+FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
+									   \
+FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file);	   \
+FFTW_EXTERN void X(print_plan)(const X(plan) p);			   \
+FFTW_EXTERN char *X(sprint_plan)(const X(plan) p);			   \
+									   \
+FFTW_EXTERN void *X(malloc)(size_t n);					   \
+FFTW_EXTERN R *X(alloc_real)(size_t n);					   \
+FFTW_EXTERN C *X(alloc_complex)(size_t n);				   \
+FFTW_EXTERN void X(free)(void *p);					   \
+									   \
+FFTW_EXTERN void X(flops)(const X(plan) p,				   \
+                          double *add, double *mul, double *fmas);	   \
+FFTW_EXTERN double X(estimate_cost)(const X(plan) p);			   \
+FFTW_EXTERN double X(cost)(const X(plan) p);				   \
+									   \
+FFTW_EXTERN int X(alignment_of)(R *p);                                     \
+FFTW_EXTERN const char X(version)[];                                       \
+FFTW_EXTERN const char X(cc)[];						   \
+FFTW_EXTERN const char X(codelet_optim)[];
+
+
+/* end of FFTW_DEFINE_API macro */
+
+FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
+FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
+
+/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
+   for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
+#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
+ && !(defined(__ICC) || defined(__INTEL_COMPILER)) \
+ && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
+#  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
+/* note: __float128 is a typedef, which is not supported with the _Complex
+         keyword in gcc, so instead we use this ugly __attribute__ version.
+         However, we can't simply pass the __attribute__ version to
+         FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
+         types.  Hence redefining FFTW_DEFINE_COMPLEX.  Ugh. */
+#    undef FFTW_DEFINE_COMPLEX
+#    define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
+#  endif
+FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
+#endif
+
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+
+#define FFTW_NO_TIMELIMIT (-1.0)
+
+/* documented flags */
+#define FFTW_MEASURE (0U)
+#define FFTW_DESTROY_INPUT (1U << 0)
+#define FFTW_UNALIGNED (1U << 1)
+#define FFTW_CONSERVE_MEMORY (1U << 2)
+#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
+#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
+#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
+#define FFTW_ESTIMATE (1U << 6)
+#define FFTW_WISDOM_ONLY (1U << 21)
+
+/* undocumented beyond-guru flags */
+#define FFTW_ESTIMATE_PATIENT (1U << 7)
+#define FFTW_BELIEVE_PCOST (1U << 8)
+#define FFTW_NO_DFT_R2HC (1U << 9)
+#define FFTW_NO_NONTHREADED (1U << 10)
+#define FFTW_NO_BUFFERING (1U << 11)
+#define FFTW_NO_INDIRECT_OP (1U << 12)
+#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
+#define FFTW_NO_RANK_SPLITS (1U << 14)
+#define FFTW_NO_VRANK_SPLITS (1U << 15)
+#define FFTW_NO_VRECURSE (1U << 16)
+#define FFTW_NO_SIMD (1U << 17)
+#define FFTW_NO_SLOW (1U << 18)
+#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
+#define FFTW_ALLOW_PRUNING (1U << 20)
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* FFTW3_H */

From 356e7940fda0dc48f8a6d21e0138ca6e116699d2 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Mon, 22 Aug 2016 16:24:49 +0100
Subject: [PATCH 17/29] fftw can be switched off

---
 lib/FFT.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/lib/FFT.h b/lib/FFT.h
index f9470472..cf012e79 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -113,12 +113,17 @@ namespace Grid {
 	int sign = FFTW_FORWARD;
 	if (inverse) sign = FFTW_BACKWARD;
 
+#ifdef HAVE_FFTW
 	fftw_plan p = fftw_plan_many_dft(rank,n,howmany,
 					 in,inembed,
 					 istride,idist,
 					 out,onembed,
 					 ostride, odist,
 					 sign,FFTW_ESTIMATE);
+#else 
+	fftw_plan p ;
+	assert(0);
+#endif
 
 	// Barrel shift and collect global pencil
 	for(int p=0;p<processors[dim];p++) { 
@@ -159,7 +164,9 @@ namespace Grid {
 	    }
 
 	    // FFT the pencil
+#ifdef HAVE_FFTW
 	    fftw_execute(p);
+#endif
 
 	    // Extract the result
 	    for(int l=0;l<L;l++){

From 4d11a6f5f23b8c4558166533d90e2b1d35aacd72 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 23 Aug 2016 14:41:44 +0100
Subject: [PATCH 18/29] first commit for QPX intrinsics

---
 configure.ac        |   5 +-
 lib/simd/Grid_qpx.h | 376 +++++++++++++++++++++++---------------------
 2 files changed, 198 insertions(+), 183 deletions(-)

diff --git a/configure.ac b/configure.ac
index b14f9748..9c56971e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -125,11 +125,14 @@ case ${ax_cv_cxx_compiler_vendor} in
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
       IMCI|KNC)
-        AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
+        AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
         SIMD_FLAGS='';;
       GEN)
         AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
         SIMD_FLAGS='';;
+      QPX|BGQ)
+        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
+        SIMD_FLAGS='';;
       *)
         AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
     esac;;
diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index 10ac3841..6ac83290 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -1,300 +1,312 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/simd/Grid_qpx.h
-
-    Copyright (C) 2015
-
-Author: neo <cossu@post.kek.jp>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_qpx.h
-  @brief Optimization libraries for QPX instructions set for BG/Q
-
-  Using intrinsics
-*/
-// Time-stamp: <2015-05-27 11:30:21 neo>
-//----------------------------------------------------------------------
-
-// lot of undefined functions
+/*******************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/simd/Grid_qpx.h
+ 
+ Copyright (C) 2016
+ 
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ ******************************************************************************/
 
+namespace Grid {
 namespace Optimization {
+  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
+  {
+    stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
+    return stream;
+  };
   
   struct Vsplat{
     //Complex float
-    inline float operator()(float a, float b){
-      return {a,b,a,b};
+    inline vector4double operator()(float a, float b){
+      return (vector4double){(double)a, (double)b, (double)a, (double)b};
     }
     // Real float
-    inline float operator()(float a){
-      return {a,a,a,a};
+    inline vector4double operator()(float a){
+      return (vector4double){(double)a, (double)a, (double)a, (double)a};
     }
     //Complex double
     inline vector4double operator()(double a, double b){
-      return {a,b,a,b};
+      return (vector4double){a, b, a, b};
     }
     //Real double
     inline vector4double operator()(double a){
-      return {a,a,a,a};
+      return (vector4double){a, a, a, a};
     }
     //Integer
     inline int operator()(Integer a){
-#error
+      return a;
     }
   };
-
+  
   struct Vstore{
-    //Float 
-    inline void operator()(float a, float* F){
-      assert(0);
+    //Float
+    inline void operator()(vector4double a, float *f){
+      vec_st(a, 0, f);
     }
     //Double
-    inline void operator()(vector4double a, double* D){
-      assert(0);
+    inline void operator()(vector4double a, double *d){
+      vec_st(a, 0, d);
     }
     //Integer
-    inline void operator()(int a, Integer* I){
-      assert(0);
+    inline void operator()(int a, Integer *i){
+      i[0] = a;
     }
-
   };
-
-
+  
   struct Vstream{
     //Float
-    inline void operator()(float * a, float b){
-      assert(0);
+    inline void operator()(float *f, vector4double a){
+      vec_st(a, 0, f);
     }
     //Double
-    inline void operator()(double * a, vector4double b){
-      assert(0);
+    inline void operator()(double *d, vector4double a){
+      vec_st(a, 0, d);
     }
 
-
   };
-
-
-
+  
   struct Vset{
-    // Complex float 
-    inline float operator()(Grid::ComplexF *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+    // Complex float
+    inline vector4double operator()(Grid::ComplexF *a){
+      return vec_ld(0, (float *)a);
     }
-    // Complex double 
+    // Complex double
     inline vector4double operator()(Grid::ComplexD *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+      return vec_ld(0, (double *)a);
     }
-    // Real float 
-    inline float operator()(float *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+    // Real float
+    inline vector4double operator()(float *a){
+      return vec_ld(0, a);
     }
     // Real double
     inline vector4double operator()(double *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+      return vec_ld(0, a);
     }
     // Integer
     inline int operator()(Integer *a){
-#error
+      return a[0];
     }
-
-
+    
+    
   };
-
+  
   template <typename Out_type, typename In_type>
-    struct Reduce{
-      //Need templated class to overload output type
-      //General form must generate error if compiled
-      inline Out_type operator()(In_type in){
-	printf("Error, using wrong Reduce function\n");
-	exit(1);
-	return 0;
-      }
-    };
-
-
- 
-
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+  
   /////////////////////////////////////////////////////
   // Arithmetic operations
   /////////////////////////////////////////////////////
   struct Sum{
-    //Complex/Real float
-    inline float operator()(float a, float b){
-#error
-    }
     //Complex/Real double
     inline vector4double operator()(vector4double a, vector4double b){
-      return vec_add(a,b);
+      return vec_add(a, b);
     }
     //Integer
     inline int operator()(int a, int b){
-#error
+      return a + b;
     }
   };
-
+  
   struct Sub{
-    //Complex/Real float
-    inline float operator()(float a, float b){
-#error
-    }
     //Complex/Real double
     inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_sub(a, b);
     }
     //Integer
-    inline floati operator()(int a, int b){
-#error
+    inline int operator()(int a, int b){
+      return a - b;
     }
   };
-
-
+  
   struct MultComplex{
-    // Complex float
-    inline float operator()(float a, float b){
-#error
-    }
     // Complex double
     inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_xxnpmadd(a, b, vec_xmul(b, a));
     }
   };
-
+  
   struct Mult{
-    // Real float
-    inline float operator()(float a, float b){
-#error
-    }
     // Real double
     inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_mul(a, b);
     }
     // Integer
     inline int operator()(int a, int b){
-#error
+      return a*b;
     }
   };
-
-
+  
   struct Conj{
-    // Complex single
-    inline float operator()(float in){
-      assert(0);
-    }
     // Complex double
-    inline vector4double operator()(vector4double in){
-      assert(0);
+    inline vector4double operator()(vector4double v){
+      return vec_mul(v, (vector4double){1., -1., 1., -1.});
     }
-    // do not define for integer input
   };
-
+  
   struct TimesMinusI{
-    //Complex single
-    inline float operator()(float in, float ret){
-      assert(0);
-    }
     //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
-      assert(0);
+    inline vector4double operator()(vector4double v, vector4double ret){
+      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
+                               (vector4double){0., 0., 0., 0.});
     }
-
-
   };
-
+  
   struct TimesI{
-    //Complex single
-    inline float operator()(float in, float ret){
-  
-    }
     //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
-  
+    inline vector4double operator()(vector4double v, vector4double ret){
+      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
+                              (vector4double){0., 0., 0., 0.});
     }
-
-
   };
 
-
+  struct Permute{
+    static inline vector4double Permute0(vector4double v){ //0123 -> 2301
+      return vec_perm(v, v, vec_gpci(02301));
+    };
+    static inline vector4double Permute1(vector4double v){ //0123 -> 1032
+      return vec_perm(v, v, vec_gpci(01032));
+    };
+    static inline vector4double Permute2(vector4double v){
+      return v;
+    };
+    static inline vector4double Permute3(vector4double v){
+      return v;
+    };
+  };
   
-
-
-  //////////////////////////////////////////////
-  // Some Template specialization
+  struct Rotate{
+    static inline vector4double rotate(vector4double v, int n){
+      switch(n){
+        case 0:
+          return v;
+          break;
+        case 1:
+          return vec_perm(v, v, vec_gpci(01230));
+          break;
+        case 2:
+          return vec_perm(v, v, vec_gpci(02301));
+          break;
+        case 3:
+          return vec_perm(v, v, vec_gpci(03012));
+          break;
+        default: assert(0);
+      }
+    }
+  };
   
   //Complex float Reduce
   template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
-    assert(0);
+  inline Grid::ComplexF
+  Reduce<Grid::ComplexF, vector4double>::operator()(vector4double v) { //2 complex
+    vector4double v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    
+    return Grid::ComplexF((float)vec_extract(v1, 0), (float)vec_extract(v1, 1));
   }
   //Real float Reduce
   template<>
-    inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
-    assert(0);
+  inline Grid::RealF
+  Reduce<Grid::RealF, vector4double>::operator()(vector4double v){ //4 floats
+    vector4double v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = vec_add(v1, v2);
+    
+    return (float)vec_extract(v1, 0);
   }
   
   
   //Complex double Reduce
   template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
-    assert(0);
+  inline Grid::ComplexD
+  Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
+    vector4double v1;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    
+    return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
   }
   
   //Real double Reduce
   template<>
-    inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
-    assert(0);
-  }
+  inline Grid::RealD
+  Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
+    vector4double v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = vec_add(v1, v2);
 
+    return vec_extract(v1, 0);
+  }
+  
   //Integer Reduce
   template<>
-    inline Integer Reduce<Integer, floati>::operator()(float in){
+  inline Integer Reduce<Integer, int>::operator()(int in){
+    // FIXME unimplemented
+    printf("Reduce : Missing integer implementation -> FIX\n");
     assert(0);
   }
-  
-  
 }
 
-//////////////////////////////////////////////////////////////////////////////////////
-// Here assign types 
-namespace Grid {
-  typedef float SIMD_Ftype  __attribute__ ((vector_size (16)));         // Single precision type
-  typedef vector4double SIMD_Dtype; // Double precision type
-  typedef int SIMD_Itype;           // Integer type
+////////////////////////////////////////////////////////////////////////////////
+// Here assign types
 
-  inline void v_prefetch0(int size, const char *ptr){};
+typedef vector4double SIMD_Ftype;  // Single precision type
+typedef vector4double SIMD_Dtype; // Double precision type
+typedef int SIMD_Itype; // Integer type
 
-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+// prefetch utilities
+inline void v_prefetch0(int size, const char *ptr){};
+inline void prefetch_HINT_T0(const char *ptr){};
 
 
-  // Arithmetic operations
-  typedef Optimization::Sum         SumSIMD;
-  typedef Optimization::Sub         SubSIMD;
-  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
-  typedef Optimization::Conj        ConjSIMD;
-  typedef Optimization::TimesMinusI TimesMinusISIMD;
-  typedef Optimization::TimesI      TimesISIMD;
+// Function name aliases
+typedef Optimization::Vsplat   VsplatSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
+typedef Optimization::Vset     VsetSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
+template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 
+// Arithmetic operations
+typedef Optimization::Sum         SumSIMD;
+typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Mult        MultSIMD;
+typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::Conj        ConjSIMD;
+typedef Optimization::TimesMinusI TimesMinusISIMD;
+typedef Optimization::TimesI      TimesISIMD;
+  
 }

From ff6da364e807b965ba49df55fd5dba58723d0a51 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 24 Aug 2016 15:05:00 +0100
Subject: [PATCH 19/29] FFT double and single precision gives good performance
 now in multithreaded code.

---
 configure.ac            |  47 +++--
 lib/FFT.h               | 199 +++++++++++++------
 lib/Init.cc             |   2 +-
 lib/fftw/fftw3.h        | 412 ----------------------------------------
 tests/core/Test_fft.cc  |  19 +-
 tests/core/Test_fftf.cc | 111 +++++++++++
 6 files changed, 298 insertions(+), 492 deletions(-)
 delete mode 100644 lib/fftw/fftw3.h
 create mode 100644 tests/core/Test_fftf.cc

diff --git a/configure.ac b/configure.ac
index b14f9748..98489162 100644
--- a/configure.ac
+++ b/configure.ac
@@ -10,8 +10,19 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 AC_LANG(C++)
 CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
+
+############ openmp  ###############
 AC_OPENMP
+
+ac_openmp=no
+
+if test "${OPENMP_CXXFLAGS}X" != "X"; then
+ac_openmp=yes
 AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
+AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
+fi
+
+############ libtool ###############
 LT_INIT
 
 ############### Checks for header files
@@ -29,7 +40,7 @@ AC_TYPE_SIZE_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T
 
-############### Options
+############### GMP and MPFR #################
 AC_ARG_WITH([gmp],
     [AS_HELP_STRING([--with-gmp=prefix],
     [try this for a non-standard install prefix of the GMP library])],
@@ -40,6 +51,8 @@ AC_ARG_WITH([mpfr],
     [try this for a non-standard install prefix of the MPFR library])],
     [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
     [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
+
+################## lapack ####################
 AC_ARG_ENABLE([lapack],
     [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], 
     [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
@@ -55,14 +68,27 @@ case ${ac_LAPACK} in
         AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
 esac
 
-AC_CHECK_LIB([fftw3],[fft_init_threads],
+################## FFTW3 ####################
+AC_ARG_WITH([fftw],    
+            [AS_HELP_STRING([--with-fftw=prefix],
+            [try this for a non-standard install prefix of the FFTW3 library])],
+            [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
+            [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
+
+#
+# What about the MKL library replacement for fftw3 ?  How do we know if fftw_execute
+# can be found in MKL? 
+#
+AC_CHECK_LIB([fftw3],[fftw_execute],
 	[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
         [ac_fftw=no])
+
 case ${ac_fftw} in
     no)
+        echo WARNING libfftw3 not found FFT routines will not work
         ;;
     yes)
-        AM_LDFLAGS="-lfftw3 $AM_LDFLAGS"
+        AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f"
 esac
 
 
@@ -304,19 +330,18 @@ Summary of configuration for $PACKAGE v$VERSION
 - compiler version              : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
 - SIMD                          : ${ac_SIMD}
-- communications type           : ${ac_COMMS}
-- default precision             : ${ac_PRECISION}
+- Threading                     : ${ac_openmp} 
+- Communications type           : ${ac_COMMS}
+- Default precision             : ${ac_PRECISION}
 - RNG choice                    : ${ac_RNG} 
 - GMP                           : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 - LAPACK                        : ${ac_LAPACK}
+- FFTW                          : ${ac_fftw}
 - build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 - graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
-- CXXFLAGS: 
-`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
-- LDFLAGS:
-`echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
-- LIBS:
-`echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
+- CXXFLAGS:  "${AM_CXXFLAGS} ${CXXFLAGS}"
+- LDFLAGS:   "${AM_LDFLAGS} ${LDFLAGS}"
+- LIBS:      "${LIBS} "
 -------------------------------------------------------
 "
diff --git a/lib/FFT.h b/lib/FFT.h
index cf012e79..17c8a309 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -1,3 +1,4 @@
+
     /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
@@ -28,11 +29,70 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_
 
-#include <Grid/fftw/fftw3.h>
-
+#ifdef HAVE_FFTW	
+#include <fftw3.h>
+#endif
 namespace Grid {
 
-  
+  template<class scalar> struct FFTW { };
+
+#ifdef HAVE_FFTW	
+  template<> struct FFTW<ComplexD> {
+  public:
+
+    typedef fftw_complex FFTW_scalar;
+    typedef fftw_plan    FFTW_plan;
+
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+					FFTW_scalar *in, const int *inembed,		
+					int istride, int idist,		
+					FFTW_scalar *out, const int *onembed,		
+					int ostride, int odist,		
+					int sign, unsigned flags) {
+      return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+    }	  
+    
+    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+      ::fftw_flops(p,add,mul,fmas);
+    }
+
+    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+      ::fftw_execute_dft(p,in,out);
+    }
+    inline static void fftw_destroy_plan(const FFTW_plan p) {
+      ::fftw_destroy_plan(p);
+    }
+  };
+
+  template<> struct FFTW<ComplexF> {
+  public:
+
+    typedef fftwf_complex FFTW_scalar;
+    typedef fftwf_plan    FFTW_plan;
+
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+					FFTW_scalar *in, const int *inembed,		
+					int istride, int idist,		
+					FFTW_scalar *out, const int *onembed,		
+					int ostride, int odist,		
+					int sign, unsigned flags) {
+      return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+    }	  
+    
+    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+      ::fftwf_flops(p,add,mul,fmas);
+    }
+
+    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+      ::fftwf_execute_dft(p,in,out);
+    }
+    inline static void fftw_destroy_plan(const FFTW_plan p) {
+      ::fftwf_destroy_plan(p);
+    }
+  };
+
+#endif
+
   class FFT { 
   private:
 
@@ -40,6 +100,10 @@ namespace Grid {
     GridCartesian *sgrid;
 
     int Nd;
+    double flops;
+    double flops_call;
+    uint64_t usec;
+
     std::vector<int> dimensions;
     std::vector<int> processors;
     std::vector<int> processor_coor;
@@ -49,6 +113,9 @@ namespace Grid {
     static const int forward=FFTW_FORWARD;
     static const int backward=FFTW_BACKWARD;
 
+    double Flops(void) {return flops;}
+    double MFlops(void) {return flops/usec;}
+
     FFT ( GridCartesian * grid ) : 
       vgrid(grid),
       Nd(grid->_ndimension),
@@ -56,6 +123,8 @@ namespace Grid {
       processors(grid->_processors),
       processor_coor(grid->_processor_coor)
     {
+      flops=0;
+      usec =0;
       std::vector<int> layout(Nd,1);
       sgrid = new GridCartesian(dimensions,layout,processors);
     };
@@ -75,55 +144,62 @@ namespace Grid {
 
       std::vector<int> layout(Nd,1);
       std::vector<int> pencil_gd(vgrid->_fdimensions);
-      std::vector<int> pencil_ld(processors);
 
       pencil_gd[dim] = G*processors[dim];    
-      pencil_ld[dim] = G*processors[dim];    
 
       // Pencil global vol LxLxGxLxL per node
       GridCartesian pencil_g(pencil_gd,layout,processors);
-      GridCartesian pencil_l(pencil_ld,layout,processors);
 
       // Construct pencils
       typedef typename vobj::scalar_object sobj;
+      typedef typename sobj::scalar_type   scalar;
+
       Lattice<vobj> ssource(vgrid); ssource =source;
       Lattice<sobj> pgsource(&pencil_g);
-      Lattice<sobj> pgresult(&pencil_g);
-      Lattice<sobj> plsource(&pencil_l);
-      Lattice<sobj> plresult(&pencil_l);
+      Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
+
+#ifndef HAVE_FFTW	
+      assert(0);
+#else 
+      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
+      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
 
       {
-	assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD));
-	assert(sizeof(fftw_complex)==sizeof(ComplexD));
-	assert(sizeof(fftw_complex)==sizeof(ComplexD));
+	int Ncomp = sizeof(sobj)/sizeof(scalar);
+	int Nlow  = 1;
+	for(int d=0;d<dim;d++){
+	  Nlow*=vgrid->_ldimensions[d];
+	}
 
-	int Ncomp = sizeof(sobj)/sizeof(fftw_complex);
-
-	int rank = 1;  /* not 2: we are computing 1d transforms */
+	int rank = 1;  /* 1d transforms */
 	int n[] = {G}; /* 1d transforms of length G */
 	int howmany = Ncomp;
 	int odist,idist,istride,ostride;
-	idist   = odist   = 1;
-	istride = ostride = Ncomp; /* distance between two elements in the same column */
+	idist   = odist   = 1;          /* Distance between consecutive FT's */
+	istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
 	int *inembed = n, *onembed = n;
 
-	fftw_complex *in = (fftw_complex *)&plsource._odata[0];
-	fftw_complex *out= (fftw_complex *)&plresult._odata[0];
 	
 	int sign = FFTW_FORWARD;
 	if (inverse) sign = FFTW_BACKWARD;
 
-#ifdef HAVE_FFTW
-	fftw_plan p = fftw_plan_many_dft(rank,n,howmany,
-					 in,inembed,
-					 istride,idist,
-					 out,onembed,
-					 ostride, odist,
-					 sign,FFTW_ESTIMATE);
-#else 
-	fftw_plan p ;
-	assert(0);
-#endif
+	FFTW_plan p;
+	{
+	  FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
+	  FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
+	  p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
+					       in,inembed,
+					       istride,idist,
+					       out,onembed,
+					       ostride, odist,
+					       sign,FFTW_ESTIMATE);
+	}
+
+	double add,mul,fma;
+	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
+	flops_call = add+mul+2.0*fma;
+	std::cout << "FFT Flops per call "<<flops_call<<std::endl;
+	GridStopWatch timer;
 
 	// Barrel shift and collect global pencil
 	for(int p=0;p<processors[dim];p++) { 
@@ -146,43 +222,46 @@ namespace Grid {
 	}
 	
 	// Loop over orthog coords
-	for(int idx=0;idx<sgrid->lSites();idx++) { 
+	int NN=pencil_g.lSites();
+
+	GridStopWatch Timer;
+	Timer.Start();
+
+PARALLEL_FOR_LOOP
+	for(int idx=0;idx<NN;idx++) { 
 
-	  std::vector<int> pcoor(Nd,0);
 	  std::vector<int> lcoor(Nd);
-	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
+	  pencil_g.LocalIndexToLocalCoor(idx,lcoor);
 
 	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
-	  
-	    // Project to local pencil array
-	    for(int l=0;l<G;l++){
-	      sobj s;
-	      pcoor[dim]=l;
-	      lcoor[dim]=l;
-	      peekLocalSite(s,pgsource,lcoor);
-	      pokeLocalSite(s,plsource,pcoor);
-	    }
-
-	    // FFT the pencil
-#ifdef HAVE_FFTW
-	    fftw_execute(p);
-#endif
-
-	    // Extract the result
-	    for(int l=0;l<L;l++){
-	      sobj s;
-	      int p = processor_coor[dim];
-	      lcoor[dim] = l;
-	      pcoor[dim] = l+L*p;
-	      peekLocalSite(s,plresult,pcoor);
-	      pokeLocalSite(s,result,lcoor);
-	    }
-
+	    FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
+	    FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
+	    FFTW<scalar>::fftw_execute_dft(p,in,out);
 	  }
 	}
-	  
-	fftw_destroy_plan(p);
+
+        Timer.Stop();
+	usec += Timer.useconds();
+	flops+= flops_call*NN;
+
+
+        int pc = processor_coor[dim];
+        for(int idx=0;idx<sgrid->lSites();idx++) { 
+	  std::vector<int> lcoor(Nd);
+	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
+	  std::vector<int> gcoor = lcoor;
+	  // extract the result
+	  sobj s;
+	  gcoor[dim] = lcoor[dim]+L*pc;
+	  peekLocalSite(s,pgresult,gcoor);
+	  pokeLocalSite(s,result,lcoor);
+	}
+      	  
+	FFTW<scalar>::fftw_destroy_plan(p);
       }
+#endif
+
+
     }
 
   };
diff --git a/lib/Init.cc b/lib/Init.cc
index 6543435c..34e503a4 100644
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
     assert(ompthreads.size()==1);
     GridThread::SetThreads(ompthreads[0]);
   }
+
   if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
     std::vector<int> cores(0);
     arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
     GridLogConfigure(logstreams);
   }
 
-
   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
     Grid_debug_handler_init();
   }
diff --git a/lib/fftw/fftw3.h b/lib/fftw/fftw3.h
deleted file mode 100644
index 1bf34396..00000000
--- a/lib/fftw/fftw3.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (c) 2003, 2007-14 Matteo Frigo
- * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
- *
- * The following statement of license applies *only* to this header file,
- * and *not* to the other files distributed with FFTW or derived therefrom:
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
- * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
- * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/***************************** NOTE TO USERS *********************************
- *
- *                 THIS IS A HEADER FILE, NOT A MANUAL
- *
- *    If you want to know how to use FFTW, please read the manual,
- *    online at http://www.fftw.org/doc/ and also included with FFTW.
- *    For a quick start, see the manual's tutorial section.
- *
- *   (Reading header files to learn how to use a library is a habit
- *    stemming from code lacking a proper manual.  Arguably, it's a
- *    *bad* habit in most cases, because header files can contain
- *    interfaces that are not part of the public, stable API.)
- *
- ****************************************************************************/
-
-#ifndef FFTW3_H
-#define FFTW3_H
-
-#include <stdio.h>
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif /* __cplusplus */
-
-/* If <complex.h> is included, use the C99 complex type.  Otherwise
-   define a type bit-compatible with C99 complex */
-#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
-#  define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
-#else
-#  define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
-#endif
-
-#define FFTW_CONCAT(prefix, name) prefix ## name
-#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
-#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
-#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
-#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
-
-/* IMPORTANT: for Windows compilers, you should add a line
-        #define FFTW_DLL
-   here and in kernel/ifftw.h if you are compiling/using FFTW as a
-   DLL, in order to do the proper importing/exporting, or
-   alternatively compile with -DFFTW_DLL or the equivalent
-   command-line flag.  This is not necessary under MinGW/Cygwin, where
-   libtool does the imports/exports automatically. */
-#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
-   /* annoying Windows syntax for shared-library declarations */
-#  if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
-#    define FFTW_EXTERN extern __declspec(dllexport) 
-#  else /* user is calling FFTW; import symbol */
-#    define FFTW_EXTERN extern __declspec(dllimport) 
-#  endif
-#else
-#  define FFTW_EXTERN extern
-#endif
-
-enum fftw_r2r_kind_do_not_use_me {
-     FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
-     FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
-     FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
-};
-
-struct fftw_iodim_do_not_use_me {
-     int n;                     /* dimension size */
-     int is;			/* input stride */
-     int os;			/* output stride */
-};
-
-#include <stddef.h> /* for ptrdiff_t */
-struct fftw_iodim64_do_not_use_me {
-     ptrdiff_t n;                     /* dimension size */
-     ptrdiff_t is;			/* input stride */
-     ptrdiff_t os;			/* output stride */
-};
-
-typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
-typedef int (*fftw_read_char_func_do_not_use_me)(void *);
-
-/*
-  huge second-order macro that defines prototypes for all API
-  functions.  We expand this macro for each supported precision
- 
-  X: name-mangling macro
-  R: real data type
-  C: complex data type
-*/
-
-#define FFTW_DEFINE_API(X, R, C)					   \
-									   \
-FFTW_DEFINE_COMPLEX(R, C);						   \
-									   \
-typedef struct X(plan_s) *X(plan);					   \
-									   \
-typedef struct fftw_iodim_do_not_use_me X(iodim);			   \
-typedef struct fftw_iodim64_do_not_use_me X(iodim64);			   \
-									   \
-typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind);			   \
-									   \
-typedef fftw_write_char_func_do_not_use_me X(write_char_func);		   \
-typedef fftw_read_char_func_do_not_use_me X(read_char_func);		   \
-									   \
-FFTW_EXTERN void X(execute)(const X(plan) p);				   \
-									   \
-FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n,			   \
-		    C *in, C *out, int sign, unsigned flags);		   \
-									   \
-FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign,	   \
-		       unsigned flags);					   \
-FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1,			   \
-		       C *in, C *out, int sign, unsigned flags);	   \
-FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2,		   \
-		       C *in, C *out, int sign, unsigned flags);	   \
-									   \
-FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n,		   \
-                         int howmany,					   \
-                         C *in, const int *inembed,			   \
-                         int istride, int idist,			   \
-                         C *out, const int *onembed,			   \
-                         int ostride, int odist,			   \
-                         int sign, unsigned flags);			   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims,	   \
-			 int howmany_rank,				   \
-			 const X(iodim) *howmany_dims,			   \
-			 C *in, C *out,					   \
-			 int sign, unsigned flags);			   \
-FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
-			 int howmany_rank,				   \
-			 const X(iodim) *howmany_dims,			   \
-			 R *ri, R *ii, R *ro, R *io,			   \
-			 unsigned flags);				   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank,			   \
-                         const X(iodim64) *dims,			   \
-			 int howmany_rank,				   \
-			 const X(iodim64) *howmany_dims,		   \
-			 C *in, C *out,					   \
-			 int sign, unsigned flags);			   \
-FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank,			   \
-                         const X(iodim64) *dims,			   \
-			 int howmany_rank,				   \
-			 const X(iodim64) *howmany_dims,		   \
-			 R *ri, R *ii, R *ro, R *io,			   \
-			 unsigned flags);				   \
-									   \
-FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out);	   \
-FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii,	   \
-                                      R *ro, R *io);			   \
-									   \
-FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n,	   \
-                             int howmany,				   \
-                             R *in, const int *inembed,			   \
-                             int istride, int idist,			   \
-                             C *out, const int *onembed,		   \
-                             int ostride, int odist,			   \
-                             unsigned flags);				   \
-									   \
-FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n,		   \
-                        R *in, C *out, unsigned flags);			   \
-									   \
-FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
-FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1,			   \
-			   R *in, C *out, unsigned flags);		   \
-FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1,			   \
-			   int n2,					   \
-			   R *in, C *out, unsigned flags);		   \
-									   \
-									   \
-FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n,	   \
-			     int howmany,				   \
-			     C *in, const int *inembed,			   \
-			     int istride, int idist,			   \
-			     R *out, const int *onembed,		   \
-			     int ostride, int odist,			   \
-			     unsigned flags);				   \
-									   \
-FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n,		   \
-                        C *in, R *out, unsigned flags);			   \
-									   \
-FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
-FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1,			   \
-			   C *in, R *out, unsigned flags);		   \
-FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1,			   \
-			   int n2,					   \
-			   C *in, R *out, unsigned flags);		   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims,   \
-			     int howmany_rank,				   \
-			     const X(iodim) *howmany_dims,		   \
-			     R *in, C *out,				   \
-			     unsigned flags);				   \
-FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims,   \
-			     int howmany_rank,				   \
-			     const X(iodim) *howmany_dims,		   \
-			     C *in, R *out,				   \
-			     unsigned flags);				   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)(				   \
-                             int rank, const X(iodim) *dims,		   \
-			     int howmany_rank,				   \
-			     const X(iodim) *howmany_dims,		   \
-			     R *in, R *ro, R *io,			   \
-			     unsigned flags);				   \
-FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)(				   \
-                             int rank, const X(iodim) *dims,		   \
-			     int howmany_rank,				   \
-			     const X(iodim) *howmany_dims,		   \
-			     R *ri, R *ii, R *out,			   \
-			     unsigned flags);				   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank,			   \
-                             const X(iodim64) *dims,			   \
-			     int howmany_rank,				   \
-			     const X(iodim64) *howmany_dims,		   \
-			     R *in, C *out,				   \
-			     unsigned flags);				   \
-FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank,			   \
-                             const X(iodim64) *dims,			   \
-			     int howmany_rank,				   \
-			     const X(iodim64) *howmany_dims,		   \
-			     C *in, R *out,				   \
-			     unsigned flags);				   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)(			   \
-                             int rank, const X(iodim64) *dims,		   \
-			     int howmany_rank,				   \
-			     const X(iodim64) *howmany_dims,		   \
-			     R *in, R *ro, R *io,			   \
-			     unsigned flags);				   \
-FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)(			   \
-                             int rank, const X(iodim64) *dims,		   \
-			     int howmany_rank,				   \
-			     const X(iodim64) *howmany_dims,		   \
-			     R *ri, R *ii, R *out,			   \
-			     unsigned flags);				   \
-									   \
-FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out);	   \
-FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out);	   \
-									   \
-FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p,		   \
-                                          R *in, R *ro, R *io);		   \
-FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p,		   \
-                                          R *ri, R *ii, R *out);	   \
-									   \
-FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n,		   \
-                         int howmany,					   \
-                         R *in, const int *inembed,			   \
-                         int istride, int idist,			   \
-                         R *out, const int *onembed,			   \
-                         int ostride, int odist,			   \
-                         const X(r2r_kind) *kind, unsigned flags);	   \
-									   \
-FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out,	   \
-                    const X(r2r_kind) *kind, unsigned flags);		   \
-									   \
-FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out,		   \
-                       X(r2r_kind) kind, unsigned flags);		   \
-FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out,	   \
-                       X(r2r_kind) kind0, X(r2r_kind) kind1,		   \
-                       unsigned flags);					   \
-FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2,		   \
-                       R *in, R *out, X(r2r_kind) kind0,		   \
-                       X(r2r_kind) kind1, X(r2r_kind) kind2,		   \
-                       unsigned flags);					   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims,	   \
-                         int howmany_rank,				   \
-                         const X(iodim) *howmany_dims,			   \
-                         R *in, R *out,					   \
-                         const X(r2r_kind) *kind, unsigned flags);	   \
-									   \
-FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims,   \
-                         int howmany_rank,				   \
-                         const X(iodim64) *howmany_dims,		   \
-                         R *in, R *out,					   \
-                         const X(r2r_kind) *kind, unsigned flags);	   \
-									   \
-FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out);	   \
-									   \
-FFTW_EXTERN void X(destroy_plan)(X(plan) p);				   \
-FFTW_EXTERN void X(forget_wisdom)(void);				   \
-FFTW_EXTERN void X(cleanup)(void);					   \
-									   \
-FFTW_EXTERN void X(set_timelimit)(double t);				   \
-									   \
-FFTW_EXTERN void X(plan_with_nthreads)(int nthreads);			   \
-FFTW_EXTERN int X(init_threads)(void);					   \
-FFTW_EXTERN void X(cleanup_threads)(void);				   \
-									   \
-FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename);	   \
-FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file);		   \
-FFTW_EXTERN char *X(export_wisdom_to_string)(void);			   \
-FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char,   	   \
-                                  void *data);				   \
-FFTW_EXTERN int X(import_system_wisdom)(void);				   \
-FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename);	   \
-FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file);		   \
-FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string);	   \
-FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
-									   \
-FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file);	   \
-FFTW_EXTERN void X(print_plan)(const X(plan) p);			   \
-FFTW_EXTERN char *X(sprint_plan)(const X(plan) p);			   \
-									   \
-FFTW_EXTERN void *X(malloc)(size_t n);					   \
-FFTW_EXTERN R *X(alloc_real)(size_t n);					   \
-FFTW_EXTERN C *X(alloc_complex)(size_t n);				   \
-FFTW_EXTERN void X(free)(void *p);					   \
-									   \
-FFTW_EXTERN void X(flops)(const X(plan) p,				   \
-                          double *add, double *mul, double *fmas);	   \
-FFTW_EXTERN double X(estimate_cost)(const X(plan) p);			   \
-FFTW_EXTERN double X(cost)(const X(plan) p);				   \
-									   \
-FFTW_EXTERN int X(alignment_of)(R *p);                                     \
-FFTW_EXTERN const char X(version)[];                                       \
-FFTW_EXTERN const char X(cc)[];						   \
-FFTW_EXTERN const char X(codelet_optim)[];
-
-
-/* end of FFTW_DEFINE_API macro */
-
-FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
-FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
-FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
-
-/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
-   for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
-#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
- && !(defined(__ICC) || defined(__INTEL_COMPILER)) \
- && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
-#  if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
-/* note: __float128 is a typedef, which is not supported with the _Complex
-         keyword in gcc, so instead we use this ugly __attribute__ version.
-         However, we can't simply pass the __attribute__ version to
-         FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
-         types.  Hence redefining FFTW_DEFINE_COMPLEX.  Ugh. */
-#    undef FFTW_DEFINE_COMPLEX
-#    define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
-#  endif
-FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
-#endif
-
-#define FFTW_FORWARD (-1)
-#define FFTW_BACKWARD (+1)
-
-#define FFTW_NO_TIMELIMIT (-1.0)
-
-/* documented flags */
-#define FFTW_MEASURE (0U)
-#define FFTW_DESTROY_INPUT (1U << 0)
-#define FFTW_UNALIGNED (1U << 1)
-#define FFTW_CONSERVE_MEMORY (1U << 2)
-#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
-#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
-#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
-#define FFTW_ESTIMATE (1U << 6)
-#define FFTW_WISDOM_ONLY (1U << 21)
-
-/* undocumented beyond-guru flags */
-#define FFTW_ESTIMATE_PATIENT (1U << 7)
-#define FFTW_BELIEVE_PCOST (1U << 8)
-#define FFTW_NO_DFT_R2HC (1U << 9)
-#define FFTW_NO_NONTHREADED (1U << 10)
-#define FFTW_NO_BUFFERING (1U << 11)
-#define FFTW_NO_INDIRECT_OP (1U << 12)
-#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
-#define FFTW_NO_RANK_SPLITS (1U << 14)
-#define FFTW_NO_VRANK_SPLITS (1U << 15)
-#define FFTW_NO_VRECURSE (1U << 16)
-#define FFTW_NO_SIMD (1U << 17)
-#define FFTW_NO_SLOW (1U << 18)
-#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
-#define FFTW_ALLOW_PRUNING (1U << 20)
-
-#ifdef __cplusplus
-}  /* extern "C" */
-#endif /* __cplusplus */
-
-#endif /* FFTW3_H */
diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc
index ed57800c..2fdaeed2 100644
--- a/tests/core/Test_fft.cc
+++ b/tests/core/Test_fft.cc
@@ -35,6 +35,9 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
   std::vector<int> latt_size   = GridDefaultLatt();
   std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
   std::vector<int> mpi_layout  = GridDefaultMpi();
@@ -75,10 +78,10 @@ int main (int argc, char ** argv)
 
   FFT theFFT(&Fine);
 
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde;
-  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde;
-  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde;
-  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
 
   //  C=zero;
   //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@@ -90,10 +93,10 @@ int main (int argc, char ** argv)
   C=C-Ctilde;
   std::cout << "diff scalar "<<norm2(C) << std::endl;
 
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
 
   SpinMatrixD Sp; 
   Sp = zero; Sp = Sp+cVol;
diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc
new file mode 100644
index 00000000..54b1ddb3
--- /dev/null
+++ b/tests/core/Test_fftf.cc
@@ -0,0 +1,111 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_cshift.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout( { vComplexF::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  int vol = 1;
+  for(int d=0;d<latt_size.size();d++){
+    vol = vol * latt_size[d];
+  }
+  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
+
+  LatticeComplexF     one(&Fine);
+  LatticeComplexF      zz(&Fine);
+  LatticeComplexF       C(&Fine);
+  LatticeComplexF  Ctilde(&Fine);
+  LatticeComplexF    coor(&Fine);
+
+  LatticeSpinMatrixF    S(&Fine);
+  LatticeSpinMatrixF    Stilde(&Fine);
+  
+  std::vector<int> p({1,2,3,2});
+
+  one = ComplexF(1.0,0.0);
+  zz  = ComplexF(0.0,0.0);
+
+  ComplexF ci(0.0,1.0);
+
+  C=zero;
+  for(int mu=0;mu<4;mu++){
+    RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+    LatticeCoordinate(coor,mu);
+    C = C - (TwoPiL * p[mu]) * coor;
+  }
+
+  C = exp(C*ci);
+
+  S=zero;
+  S = S+C;
+
+  FFT theFFT(&Fine);
+
+  theFFT.FFT_dim(Ctilde,C,0,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,1,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,2,FFT::forward);  C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,C,3,FFT::forward);  std::cout << theFFT.MFlops()<<std::endl;
+
+  //  C=zero;
+  //  Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
+  TComplexF cVol;
+  cVol()()() = vol;
+
+  C=zero;
+  pokeSite(cVol,C,p);
+  C=C-Ctilde;
+  std::cout << "diff scalar "<<norm2(C) << std::endl;
+
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
+
+  SpinMatrixF Sp; 
+  Sp = zero; Sp = Sp+cVol;
+
+  S=zero;
+  pokeSite(Sp,S,p);
+
+  S= S-Stilde;
+  std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
+
+  Grid_finalize();
+}

From 744691097f303f1d21fd7046376caa207e453e2d Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 24 Aug 2016 15:05:56 +0100
Subject: [PATCH 20/29] Printing

---
 lib/FFT.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index 17c8a309..b25e8198 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -198,7 +198,7 @@ namespace Grid {
 	double add,mul,fma;
 	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
 	flops_call = add+mul+2.0*fma;
-	std::cout << "FFT Flops per call "<<flops_call<<std::endl;
+
 	GridStopWatch timer;
 
 	// Barrel shift and collect global pencil

From bfac5195b8fb9520e81140bf7eaba633a906380a Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 24 Aug 2016 16:38:36 +0100
Subject: [PATCH 21/29] tidy up

---
 lib/FFT.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/FFT.h b/lib/FFT.h
index b25e8198..0c79c527 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -244,7 +244,6 @@ PARALLEL_FOR_LOOP
 	usec += Timer.useconds();
 	flops+= flops_call*NN;
 
-
         int pc = processor_coor[dim];
         for(int idx=0;idx<sgrid->lSites();idx++) { 
 	  std::vector<int> lcoor(Nd);

From 8c89391c02207bb9dbcbf198f8474a6d203aeb8b Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 24 Aug 2016 16:41:47 +0100
Subject: [PATCH 22/29] FFTW unresolved fixed when no fftw3.h

---
 lib/FFT.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/FFT.h b/lib/FFT.h
index 0c79c527..4cda6483 100644
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -91,6 +91,11 @@ namespace Grid {
     }
   };
 
+#endif
+
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
 #endif
 
   class FFT { 

From aa20cc8b52e90bbe0bbc960e2ea8e49b7f0baa5b Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 9 Sep 2016 02:53:22 -0700
Subject: [PATCH 23/29] Fixing compilation error with AVX512 flag

---
 bootstrap.sh                               |  2 +-
 lib/qcd/action/fermion/WilsonKernels.h     | 19 ++++++++++++-
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 32 +++++++++++++++++++++-
 3 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/bootstrap.sh b/bootstrap.sh
index 461eb121..f847b7ab 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -4,7 +4,7 @@ EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
 FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 
 echo "-- deploying Eigen source..."
-wget ${EIGEN_URL}
+wget ${EIGEN_URL} --no-check-certificate
 ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
 rm `basename ${EIGEN_URL}`
 
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index b679d3f9..b551319b 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -96,7 +96,24 @@ namespace Grid {
      WilsonKernels(const ImplParams &p= ImplParams());
      
     };
-
+    
+    
+    template<class Impl>
+      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+      {
+	assert(0);
+      }
+    template<class Impl>
+      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+      {
+	assert(0);
+      }
+    
+    
   }
 }
 #endif
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index b443ccf9..ce592540 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -38,6 +38,7 @@ namespace QCD {
   ///////////////////////////////////////////////////////////
   // Default to no assembler implementation
   ///////////////////////////////////////////////////////////
+  /*
 template<class Impl>
 void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
@@ -45,6 +46,14 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & l
 {
   assert(0);
 }
+template<class Impl>
+void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0);
+}
+  */
 
 #if defined(AVX512) 
 
@@ -116,7 +125,7 @@ void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st
 #endif
 
 
-
+/*
 template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
@@ -136,5 +145,26 @@ template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilIm
 template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+
+template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+
+template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
+*/
 }}
 

From f76f281e58e5e703ea0cd9806b1f4c729014a785 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Fri, 9 Sep 2016 11:34:25 +0100
Subject: [PATCH 24/29] Cleaning files after fix

---
 lib/parallelIO/BinaryIO.h                  | 166 ++++++++++-----------
 lib/qcd/action/fermion/WilsonKernels.h     |  31 ++--
 lib/qcd/action/fermion/WilsonKernelsAsm.cc | 159 ++++++--------------
 3 files changed, 147 insertions(+), 209 deletions(-)

diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
index 184209dc..5eddb57d 100644
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -194,22 +194,22 @@ class BinaryIO {
 
       std::vector<int> site({x,y,z,t});
 
-      if ( grid->IsBoss() ) {
-	fin.read((char *)&file_object,sizeof(file_object));
-	bytes += sizeof(file_object);
-	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
+      if (grid->IsBoss()) {
+        fin.read((char *)&file_object, sizeof(file_object));
+        bytes += sizeof(file_object);
+        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
 
-	munge(file_object,munged,csum);
+        munge(file_object, munged, csum);
       }
       // The boss who read the file has their value poked
       pokeSite(munged,Umu,site);
     }}}}
     timer.Stop();
     std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
 
     return csum;
   }
@@ -254,20 +254,20 @@ class BinaryIO {
 
       
       if ( grid->IsBoss() ) {
-	
-	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+  
+  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
 
-	// NB could gather an xstrip as an optimisation.
-	fout.write((char *)&file_object,sizeof(file_object));
-	bytes+=sizeof(file_object);
+  // NB could gather an xstrip as an optimisation.
+  fout.write((char *)&file_object,sizeof(file_object));
+  bytes+=sizeof(file_object);
       }
     }}}}
     timer.Stop();
     std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 
     return csum;
   }
@@ -305,15 +305,15 @@ class BinaryIO {
       int l_idx=parallel.generator_idx(o_idx,i_idx);
 
       if( rank == grid->ThisRank() ){
-	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
-	parallel.GetState(saved,l_idx);
+  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
+  parallel.GetState(saved,l_idx);
       }
 
       grid->Broadcast(rank,(void *)&saved[0],bytes);
 
       if ( grid->IsBoss() ) {
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-	fout.write((char *)&saved[0],bytes);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  fout.write((char *)&saved[0],bytes);
       }
 
     }
@@ -355,14 +355,14 @@ class BinaryIO {
       int l_idx=parallel.generator_idx(o_idx,i_idx);
 
       if ( grid->IsBoss() ) {
-	fin.read((char *)&saved[0],bytes);
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  fin.read((char *)&saved[0],bytes);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
       }
 
       grid->Broadcast(0,(void *)&saved[0],bytes);
 
       if( rank == grid->ThisRank() ){
-	parallel.SetState(saved,l_idx);
+  parallel.SetState(saved,l_idx);
       }
 
     }
@@ -415,15 +415,15 @@ class BinaryIO {
 
       if ( d == 0 ) parallel[d] = 0;
       if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  range[d] = grid->_ldimensions[d];
+  start[d] = grid->_processor_coor[d]*range[d];
+  ioproc[d]= grid->_processor_coor[d];
       } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
+  range[d] = grid->_gdimensions[d];
+  start[d] = 0;
+  ioproc[d]= 0;
 
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
       }
       slice_vol = slice_vol * range[d];
     }
@@ -434,9 +434,9 @@ class BinaryIO {
       std::cout<< std::dec ;
       std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
       for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+  std::cout<< range[d];
+  if( d< grid->_ndimension-1 ) 
+    std::cout<< " x ";
       }
       std::cout << std::endl;
     }
@@ -463,7 +463,7 @@ class BinaryIO {
 
       // need to implement these loops in Nd independent way with a lexico conversion
     for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
       std::vector<int> tsite(nd); // temporary mixed up site
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
@@ -472,8 +472,8 @@ class BinaryIO {
       Lexicographic::CoorFromIndex(tsite,tlex,range);
 
       for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  gsite[d] = tsite[d]+start[d];               // global site
       }
 
       /////////////////////////
@@ -487,29 +487,29 @@ class BinaryIO {
       // iorank reads from the seek
       ////////////////////////////////
       if (myrank == iorank) {
-	
-	fin.seekg(offset+g_idx*sizeof(fileObj));
-	fin.read((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
-	
-	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
-	
-	munge(fileObj,siteObj,csum);
+  
+  fin.seekg(offset+g_idx*sizeof(fileObj));
+  fin.read((char *)&fileObj,sizeof(fileObj));
+  bytes+=sizeof(fileObj);
+  
+  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+  
+  munge(fileObj,siteObj,csum);
 
-      }	
+      } 
 
       // Possibly do transport through pt2pt 
       if ( rank != iorank ) { 
-	if ( (myrank == rank) || (myrank==iorank) ) {
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
-	}
+  if ( (myrank == rank) || (myrank==iorank) ) {
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+  }
       }
       // Poke at destination
       if ( myrank == rank ) {
-	  pokeLocalSite(siteObj,Umu,lsite);
+    pokeLocalSite(siteObj,Umu,lsite);
       }
       grid->Barrier(); // necessary?
     }
@@ -520,7 +520,7 @@ class BinaryIO {
 
     timer.Stop();
     std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
     
     return csum;
   }
@@ -558,15 +558,15 @@ class BinaryIO {
       if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
 
       if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  range[d] = grid->_ldimensions[d];
+  start[d] = grid->_processor_coor[d]*range[d];
+  ioproc[d]= grid->_processor_coor[d];
       } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
+  range[d] = grid->_gdimensions[d];
+  start[d] = 0;
+  ioproc[d]= 0;
 
-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
       }
 
       slice_vol = slice_vol * range[d];
@@ -577,9 +577,9 @@ class BinaryIO {
       grid->GlobalSum(tmp);
       std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
       for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+  std::cout<< range[d];
+  if( d< grid->_ndimension-1 ) 
+    std::cout<< " x ";
       }
       std::cout << std::endl;
     }
@@ -610,7 +610,7 @@ class BinaryIO {
     // should aggregate a whole chunk and then write.
     // need to implement these loops in Nd independent way with a lexico conversion
     for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
       std::vector<int> tsite(nd); // temporary mixed up site
       std::vector<int> gsite(nd);
       std::vector<int> lsite(nd);
@@ -619,8 +619,8 @@ class BinaryIO {
       Lexicographic::CoorFromIndex(tsite,tlex,range);
 
       for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  gsite[d] = tsite[d]+start[d];               // global site
       }
 
 
@@ -640,26 +640,26 @@ class BinaryIO {
 
       // Pair of nodes may need to do pt2pt send
       if ( rank != iorank ) { // comms is necessary
-	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	  // Send to IOrank 
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
-	}
+  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+    // Send to IOrank 
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+  }
       }
 
       grid->Barrier(); // necessary?
 
       if (myrank == iorank) {
-	
-	munge(siteObj,fileObj,csum);
+  
+  munge(siteObj,fileObj,csum);
 
-	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
-	
-	fout.seekp(offset+g_idx*sizeof(fileObj));
-	fout.write((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+  
+  fout.seekp(offset+g_idx*sizeof(fileObj));
+  fout.write((char *)&fileObj,sizeof(fileObj));
+  bytes+=sizeof(fileObj);
       }
     }
 
@@ -668,7 +668,7 @@ class BinaryIO {
 
     timer.Stop();
     std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
 
     return csum;
   }
diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h
index b551319b..d796e235 100644
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -97,23 +97,24 @@ namespace Grid {
      
     };
     
-    
+    ///////////////////////////////////////////////////////////
+    // Default to no assembler implementation
+    ///////////////////////////////////////////////////////////
     template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-      {
-	assert(0);
-      }
+    void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    {
+      assert(0);
+    }
     template<class Impl>
-      void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-      {
-	assert(0);
-      }
-    
-    
+    void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    {
+      assert(0);
+    }
+        
   }
 }
 #endif
diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
index ce592540..d2cb4285 100644
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
 
@@ -26,77 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
     See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 
 #include <Grid.h>
 
 namespace Grid {
-namespace QCD {
-
-
-  ///////////////////////////////////////////////////////////
-  // Default to no assembler implementation
-  ///////////////////////////////////////////////////////////
-  /*
-template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-  */
-
+  namespace QCD {
+    
 #if defined(AVX512) 
-
-
-  ///////////////////////////////////////////////////////////
-  // If we are AVX512 specialise the single precision routine
-  ///////////////////////////////////////////////////////////
-
+    
+    
+    ///////////////////////////////////////////////////////////
+    // If we are AVX512 specialise the single precision routine
+    ///////////////////////////////////////////////////////////
+    
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-
-static Vector<vComplexF> signs;
-
-int setupSigns(void ){
-  Vector<vComplexF> bother(2);
-  signs = bother;
-  vrsign(signs[0]);
-  visign(signs[1]);
-  return 1;
-}
-static int signInit = setupSigns();
-
+    
+    static Vector<vComplexF> signs;
+    
+    int setupSigns(void ){
+      Vector<vComplexF> bother(2);
+      signs = bother;
+      vrsign(signs[0]);
+      visign(signs[1]);
+      return 1;
+    }
+    static int signInit = setupSigns();
+  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
-
+  
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
-
+  
 #undef KERNEL_DAG
-template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+      
 #define KERNEL_DAG
-template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@@ -107,64 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebesgue
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-
+				    
 #undef KERNEL_DAG
-template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #define KERNEL_DAG
-template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    template<>
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
 #endif
-
-
-/*
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-*/
-}}
+  }
+}
 

From 5df5d52d417decf8d9a229a674a9d4a082149767 Mon Sep 17 00:00:00 2001
From: Guido Cossu <guido.cossu@ed.ac.uk>
Date: Mon, 12 Sep 2016 17:17:20 +0100
Subject: [PATCH 25/29] Fix for the Intel compiler

---
 lib/qcd/QCD.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/qcd/QCD.h b/lib/qcd/QCD.h
index ec11c837..0e9d3c17 100644
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -55,11 +55,14 @@ namespace QCD {
     //////////////////////////////////////////////////////////////////////////////
     // QCD iMatrix types
     // Index conventions:                            Lorentz x Spin x Colour
+    // note: static const int or constexpr will work for type deductions
+    //       with the intel compiler (up to version 17)
     //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 2;
-    static const int SpinIndex   = 1;
-    static const int LorentzIndex= 0;
+    #define ColourIndex  2
+    #define SpinIndex    1
+    #define LorentzIndex 0
 
+  
     // Also should make these a named enum type
     static const int DaggerNo=0;
     static const int DaggerYes=1;

From 2e74520821e4b0549d83ffcf61effd5334a589a2 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 16 Sep 2016 15:25:49 +0100
Subject: [PATCH 26/29] removed libtool use (BG/Q compatibility)

---
 configure.ac    | 4 +---
 lib/Makefile.am | 6 +++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/configure.ac b/configure.ac
index d064345b..a987eabb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -10,6 +10,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 AC_LANG(C++)
 CXXFLAGS="-O3 $CXXFLAGS"
 AC_PROG_CXX
+AC_PROG_RANLIB
 
 ############ openmp  ###############
 AC_OPENMP
@@ -22,9 +23,6 @@ AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
 AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
 fi
 
-############ libtool ###############
-LT_INIT
-
 ############### Checks for header files
 AC_CHECK_HEADERS(stdint.h)
 AC_CHECK_HEADERS(mm_malloc.h)
diff --git a/lib/Makefile.am b/lib/Makefile.am
index a7ef229a..bc752af5 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -17,8 +17,8 @@ endif
 include Make.inc
 include Eigen.inc
 
-lib_LTLIBRARIES = libGrid.la
+lib_LIBRARIES = libGrid.a
 
-libGrid_la_SOURCES             = $(CCFILES) $(extra_sources)
-libGrid_ladir                  = $(pkgincludedir)
+libGrid_a_SOURCES              = $(CCFILES) $(extra_sources)
+libGrid_adir                   = $(pkgincludedir)
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h

From 0724f7af75764f7617dc945a7382ff6d289c1cf4 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <a.portelli@soton.ac.uk>
Date: Mon, 19 Sep 2016 18:09:12 +0100
Subject: [PATCH 27/29] QPX single precision implementation

---
 lib/Stencil.h       |   2 +-
 lib/simd/Grid_qpx.h | 151 ++++++++++++++++++++++++++++++++++++++------
 tests/Test_simd.cc  |   3 +-
 3 files changed, 132 insertions(+), 24 deletions(-)

diff --git a/lib/Stencil.h b/lib/Stencil.h
index 47c1b977..79479879 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -265,7 +265,7 @@
 	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
        }
        inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
+	 //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 	 local = _entries[ent]._is_local;
 	 perm  = _entries[ent]._permute;
 	 if (perm)  ptype = _permute_type[point]; 
diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h
index 6ac83290..07933f52 100644
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -27,20 +27,31 @@
 
 namespace Grid {
 namespace Optimization {
+  typedef struct 
+  {
+    float v0,v1,v2,v3;
+  } vector4float;
+
   inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
   {
     stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
     return stream;
   };
+
+  inline std::ostream & operator<<(std::ostream& stream, const vector4float a)
+  {
+    stream << "{"<< a.v0 <<","<< a.v1 <<","<< a.v2 <<","<< a.v3 <<"}";
+    return stream;
+  };
   
   struct Vsplat{
     //Complex float
-    inline vector4double operator()(float a, float b){
-      return (vector4double){(double)a, (double)b, (double)a, (double)b};
+    inline vector4float operator()(float a, float b){
+      return (vector4float){a, b, a, b};
     }
     // Real float
-    inline vector4double operator()(float a){
-      return (vector4double){(double)a, (double)a, (double)a, (double)a};
+    inline vector4float operator()(float a){
+      return (vector4float){a, a, a, a};
     }
     //Complex double
     inline vector4double operator()(double a, double b){
@@ -61,6 +72,18 @@ namespace Optimization {
     inline void operator()(vector4double a, float *f){
       vec_st(a, 0, f);
     }
+
+    inline void operator()(vector4double a, vector4float &f){
+      vec_st(a, 0, (float *)(&f));
+    }
+
+    inline void operator()(vector4float a, float *f){
+      f[0] = a.v0;
+      f[1] = a.v1;
+      f[2] = a.v2;
+      f[3] = a.v3;
+    }
+
     //Double
     inline void operator()(vector4double a, double *d){
       vec_st(a, 0, d);
@@ -76,6 +99,18 @@ namespace Optimization {
     inline void operator()(float *f, vector4double a){
       vec_st(a, 0, f);
     }
+
+    inline void operator()(vector4float f, vector4double a){
+      vec_st(a, 0, (float *)(&f));
+    }
+
+    inline void operator()(float *f, vector4float a){
+      f[0] = a.v0;
+      f[1] = a.v1;
+      f[2] = a.v2;
+      f[3] = a.v3;
+    }
+
     //Double
     inline void operator()(double *d, vector4double a){
       vec_st(a, 0, d);
@@ -85,17 +120,23 @@ namespace Optimization {
   
   struct Vset{
     // Complex float
-    inline vector4double operator()(Grid::ComplexF *a){
-      return vec_ld(0, (float *)a);
+    inline vector4float operator()(Grid::ComplexF *a){
+      return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
     }
     // Complex double
     inline vector4double operator()(Grid::ComplexD *a){
       return vec_ld(0, (double *)a);
     }
+
     // Real float
-    inline vector4double operator()(float *a){
-      return vec_ld(0, a);
+    inline vector4float operator()(float *a){
+      return (vector4float){a[0], a[1], a[2], a[3]};
     }
+
+    inline vector4double operator()(vector4float a){
+      return vec_ld(0, (float *)(&a));
+    }
+
     // Real double
     inline vector4double operator()(double *a){
       return vec_ld(0, a);
@@ -122,11 +163,42 @@ namespace Optimization {
   /////////////////////////////////////////////////////
   // Arithmetic operations
   /////////////////////////////////////////////////////
+  #define FLOAT_WRAP_2(fn, pref)\
+  pref vector4float fn(vector4float a, vector4float b)\
+  {\
+    vector4double ad, bd, rd;\
+    vector4float  r;\
+    \
+    ad = Vset()(a);\
+    bd = Vset()(b);\
+    rd = fn(ad, bd);\
+    Vstore()(rd, r);\
+    \
+    return r;\
+  }
+
+  #define FLOAT_WRAP_1(fn, pref)\
+  pref vector4float fn(vector4float a)\
+  {\
+    vector4double ad, rd;\
+    vector4float  r;\
+    \
+    ad = Vset()(a);\
+    rd = fn(ad);\
+    Vstore()(rd, r);\
+    \
+    return r;\
+  }
+
   struct Sum{
     //Complex/Real double
     inline vector4double operator()(vector4double a, vector4double b){
       return vec_add(a, b);
     }
+
+    //Complex/Real float
+    FLOAT_WRAP_2(operator(), inline)
+
     //Integer
     inline int operator()(int a, int b){
       return a + b;
@@ -138,6 +210,10 @@ namespace Optimization {
     inline vector4double operator()(vector4double a, vector4double b){
       return vec_sub(a, b);
     }
+
+    //Complex/Real float
+    FLOAT_WRAP_2(operator(), inline)
+
     //Integer
     inline int operator()(int a, int b){
       return a - b;
@@ -149,6 +225,9 @@ namespace Optimization {
     inline vector4double operator()(vector4double a, vector4double b){
       return vec_xxnpmadd(a, b, vec_xmul(b, a));
     }
+
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
   };
   
   struct Mult{
@@ -156,6 +235,10 @@ namespace Optimization {
     inline vector4double operator()(vector4double a, vector4double b){
       return vec_mul(a, b);
     }
+
+    // Real float
+    FLOAT_WRAP_2(operator(), inline)
+
     // Integer
     inline int operator()(int a, int b){
       return a*b;
@@ -167,6 +250,9 @@ namespace Optimization {
     inline vector4double operator()(vector4double v){
       return vec_mul(v, (vector4double){1., -1., 1., -1.});
     }
+
+    // Complex float
+    FLOAT_WRAP_1(operator(), inline)
   };
   
   struct TimesMinusI{
@@ -175,6 +261,9 @@ namespace Optimization {
       return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
                                (vector4double){0., 0., 0., 0.});
     }
+
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
   };
   
   struct TimesI{
@@ -183,9 +272,13 @@ namespace Optimization {
       return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
                               (vector4double){0., 0., 0., 0.});
     }
+
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
   };
 
   struct Permute{
+    //Complex double
     static inline vector4double Permute0(vector4double v){ //0123 -> 2301
       return vec_perm(v, v, vec_gpci(02301));
     };
@@ -198,6 +291,12 @@ namespace Optimization {
     static inline vector4double Permute3(vector4double v){
       return v;
     };
+
+    // Complex float
+    FLOAT_WRAP_1(Permute0, static inline)
+    FLOAT_WRAP_1(Permute1, static inline)
+    FLOAT_WRAP_1(Permute2, static inline)
+    FLOAT_WRAP_1(Permute3, static inline)
   };
   
   struct Rotate{
@@ -218,31 +317,42 @@ namespace Optimization {
         default: assert(0);
       }
     }
+
+    static inline vector4float rotate(vector4float v, int n){
+      vector4double vd, rd;
+      vector4float  r;
+
+      vd = Vset()(v);
+      rd = rotate(vd, n);
+      Vstore()(rd, r);
+
+      return r;
+    }
   };
   
   //Complex float Reduce
   template<>
   inline Grid::ComplexF
-  Reduce<Grid::ComplexF, vector4double>::operator()(vector4double v) { //2 complex
-    vector4double v1,v2;
+  Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
+    vector4float v1,v2;
     
     v1 = Optimization::Permute::Permute0(v);
-    v1 = vec_add(v1, v);
+    v1 = Optimization::Sum()(v1, v);
     
-    return Grid::ComplexF((float)vec_extract(v1, 0), (float)vec_extract(v1, 1));
+    return Grid::ComplexF(v1.v0, v1.v1);
   }
   //Real float Reduce
   template<>
   inline Grid::RealF
-  Reduce<Grid::RealF, vector4double>::operator()(vector4double v){ //4 floats
-    vector4double v1,v2;
+  Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
+    vector4float v1,v2;
     
     v1 = Optimization::Permute::Permute0(v);
-    v1 = vec_add(v1, v);
+    v1 = Optimization::Sum()(v1, v);
     v2 = Optimization::Permute::Permute1(v1);
-    v1 = vec_add(v1, v2);
+    v1 = Optimization::Sum()(v1, v2);
     
-    return (float)vec_extract(v1, 0);
+    return v1.v0;
   }
   
   
@@ -283,10 +393,9 @@ namespace Optimization {
 
 ////////////////////////////////////////////////////////////////////////////////
 // Here assign types
-
-typedef vector4double SIMD_Ftype;  // Single precision type
-typedef vector4double SIMD_Dtype; // Double precision type
-typedef int SIMD_Itype; // Integer type
+typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
+typedef vector4double              SIMD_Dtype; // Double precision type
+typedef int                        SIMD_Itype; // Integer type
 
 // prefetch utilities
 inline void v_prefetch0(int size, const char *ptr){};
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 5d84b9ef..189f0559 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -157,10 +157,9 @@ void Tester(const functor &func)
   std::cout << GridLogMessage << " " << func.name() << std::endl;
 
   std::cout << GridLogDebug << v_input1 << std::endl;
+  std::cout << GridLogDebug << v_input2 << std::endl;
   std::cout << GridLogDebug << v_result << std::endl;
 
-
-
   int ok=0;
   for(int i=0;i<Nsimd;i++){
     if ( abs(reference[i]-result[i])>1.0e-7){

From 65ca174dbbb938cef617a40aea7f106f23babab4 Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 20 Sep 2016 11:25:06 +0100
Subject: [PATCH 28/29] gitignore update

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index f972b7f5..e82ecf9c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -94,6 +94,10 @@ build.sh
 ################
 lib/Eigen/*
 
+# FFTW source #
+################
+lib/fftw/*
+
 # libtool macros #
 ##################
 m4/lt*

From d2573189d8d79d20fb5d3d7ea115d996ad431bca Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Tue, 20 Sep 2016 12:30:24 +0100
Subject: [PATCH 29/29] build system: FFTW fix

---
 configure.ac | 34 ++++++++++++----------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/configure.ac b/configure.ac
index a987eabb..db379510 100644
--- a/configure.ac
+++ b/configure.ac
@@ -73,23 +73,6 @@ AC_ARG_WITH([fftw],
             [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
             [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
 
-#
-# What about the MKL library replacement for fftw3 ?  How do we know if fftw_execute
-# can be found in MKL? 
-#
-AC_CHECK_LIB([fftw3],[fftw_execute],
-	[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
-        [ac_fftw=no])
-
-case ${ac_fftw} in
-    no)
-        echo WARNING libfftw3 not found FFT routines will not work
-        ;;
-    yes)
-        AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f"
-esac
-
-
 ################ Get compiler informations
 AC_LANG([C++])
 AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
@@ -103,7 +86,6 @@ AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
 ############### Checks for library functions
 CXXFLAGS_CPY=$CXXFLAGS
 LDFLAGS_CPY=$LDFLAGS
-LIBS_CPY=$LIBS
 CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
 LDFLAGS="$AM_LDFLAGS $LDFLAGS"
 AC_CHECK_FUNCS([gettimeofday])
@@ -122,6 +104,11 @@ if test "${ac_LAPACK}x" != "nox"; then
     AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[],
                  [AC_MSG_ERROR("LAPACK enabled but library not found")])
 fi
+AC_CHECK_LIB([fftw3],[fftw_execute],
+  [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])]
+  [have_fftw=true]
+  [LIBS="$LIBS -lfftw3 -lfftw3f"],
+  [AC_MSG_WARN([**** FFTW library not found, Grid can still compile but FFT-based routines will not work ****])])
 CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY
 
@@ -337,12 +324,15 @@ Summary of configuration for $PACKAGE v$VERSION
 - RNG choice                    : ${ac_RNG} 
 - GMP                           : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 - LAPACK                        : ${ac_LAPACK}
-- FFTW                          : ${ac_fftw}
+- FFTW                          : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
 - build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 - graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
-- CXXFLAGS:  "${AM_CXXFLAGS} ${CXXFLAGS}"
-- LDFLAGS:   "${AM_LDFLAGS} ${LDFLAGS}"
-- LIBS:      "${LIBS} "
+- CXXFLAGS:
+`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
+- LDFLAGS:
+`echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
+- LIBS:
+`echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
 -------------------------------------------------------
 "