Merge branch 'develop' into feature/hadrons

2025-11-04 14:04:32 +00:00 · 2016-09-20 13:49:33 +01:00
parent 7ff7c7d90d d2573189d8
commit a034e9901b
42 changed files with 1594 additions and 558 deletions
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -0,0 +1,276 @@
+
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Cshift.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_FFT_H_
+#define _GRID_FFT_H_
+
+#ifdef HAVE_FFTW	
+#include <fftw3.h>
+#endif
+namespace Grid {
+
+  template<class scalar> struct FFTW { };
+
+#ifdef HAVE_FFTW	
+  template<> struct FFTW<ComplexD> {
+  public:
+
+    typedef fftw_complex FFTW_scalar;
+    typedef fftw_plan    FFTW_plan;
+
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+					FFTW_scalar *in, const int *inembed,		
+					int istride, int idist,		
+					FFTW_scalar *out, const int *onembed,		
+					int ostride, int odist,		
+					int sign, unsigned flags) {
+      return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+    }	  
+    
+    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+      ::fftw_flops(p,add,mul,fmas);
+    }
+
+    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+      ::fftw_execute_dft(p,in,out);
+    }
+    inline static void fftw_destroy_plan(const FFTW_plan p) {
+      ::fftw_destroy_plan(p);
+    }
+  };
+
+  template<> struct FFTW<ComplexF> {
+  public:
+
+    typedef fftwf_complex FFTW_scalar;
+    typedef fftwf_plan    FFTW_plan;
+
+    static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
+					FFTW_scalar *in, const int *inembed,		
+					int istride, int idist,		
+					FFTW_scalar *out, const int *onembed,		
+					int ostride, int odist,		
+					int sign, unsigned flags) {
+      return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
+    }	  
+    
+    static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
+      ::fftwf_flops(p,add,mul,fmas);
+    }
+
+    inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+      ::fftwf_execute_dft(p,in,out);
+    }
+    inline static void fftw_destroy_plan(const FFTW_plan p) {
+      ::fftwf_destroy_plan(p);
+    }
+  };
+
+#endif
+
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD (-1)
+#define FFTW_BACKWARD (+1)
+#endif
+
+  class FFT { 
+  private:
+
+    GridCartesian *vgrid;
+    GridCartesian *sgrid;
+
+    int Nd;
+    double flops;
+    double flops_call;
+    uint64_t usec;
+
+    std::vector<int> dimensions;
+    std::vector<int> processors;
+    std::vector<int> processor_coor;
+
+  public:
+
+    static const int forward=FFTW_FORWARD;
+    static const int backward=FFTW_BACKWARD;
+
+    double Flops(void) {return flops;}
+    double MFlops(void) {return flops/usec;}
+
+    FFT ( GridCartesian * grid ) : 
+      vgrid(grid),
+      Nd(grid->_ndimension),
+      dimensions(grid->_fdimensions),
+      processors(grid->_processors),
+      processor_coor(grid->_processor_coor)
+    {
+      flops=0;
+      usec =0;
+      std::vector<int> layout(Nd,1);
+      sgrid = new GridCartesian(dimensions,layout,processors);
+    };
+
+    ~FFT ( void)  { 
+      delete sgrid; 
+    }
+    
+    template<class vobj>
+    void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){
+
+      conformable(result._grid,vgrid);
+      conformable(source._grid,vgrid);
+
+      int L = vgrid->_ldimensions[dim];
+      int G = vgrid->_fdimensions[dim];
+
+      std::vector<int> layout(Nd,1);
+      std::vector<int> pencil_gd(vgrid->_fdimensions);
+
+      pencil_gd[dim] = G*processors[dim];    
+
+      // Pencil global vol LxLxGxLxL per node
+      GridCartesian pencil_g(pencil_gd,layout,processors);
+
+      // Construct pencils
+      typedef typename vobj::scalar_object sobj;
+      typedef typename sobj::scalar_type   scalar;
+
+      Lattice<vobj> ssource(vgrid); ssource =source;
+      Lattice<sobj> pgsource(&pencil_g);
+      Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
+
+#ifndef HAVE_FFTW	
+      assert(0);
+#else 
+      typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
+      typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;
+
+      {
+	int Ncomp = sizeof(sobj)/sizeof(scalar);
+	int Nlow  = 1;
+	for(int d=0;d<dim;d++){
+	  Nlow*=vgrid->_ldimensions[d];
+	}
+
+	int rank = 1;  /* 1d transforms */
+	int n[] = {G}; /* 1d transforms of length G */
+	int howmany = Ncomp;
+	int odist,idist,istride,ostride;
+	idist   = odist   = 1;          /* Distance between consecutive FT's */
+	istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+	int *inembed = n, *onembed = n;
+
+	
+	int sign = FFTW_FORWARD;
+	if (inverse) sign = FFTW_BACKWARD;
+
+	FFTW_plan p;
+	{
+	  FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
+	  FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
+	  p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
+					       in,inembed,
+					       istride,idist,
+					       out,onembed,
+					       ostride, odist,
+					       sign,FFTW_ESTIMATE);
+	}
+
+	double add,mul,fma;
+	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
+	flops_call = add+mul+2.0*fma;
+
+	GridStopWatch timer;
+
+	// Barrel shift and collect global pencil
+	for(int p=0;p<processors[dim];p++) { 
+
+	  for(int idx=0;idx<sgrid->lSites();idx++) { 
+
+	    std::vector<int> lcoor(Nd);
+    	    sgrid->LocalIndexToLocalCoor(idx,lcoor);
+
+	    sobj s;
+
+	    peekLocalSite(s,ssource,lcoor);
+
+	    lcoor[dim]+=p*L;
+	   
+	    pokeLocalSite(s,pgsource,lcoor);
+	  }
+
+	  ssource = Cshift(ssource,dim,L);
+	}
+	
+	// Loop over orthog coords
+	int NN=pencil_g.lSites();
+
+	GridStopWatch Timer;
+	Timer.Start();
+
+PARALLEL_FOR_LOOP
+	for(int idx=0;idx<NN;idx++) { 
+
+	  std::vector<int> lcoor(Nd);
+	  pencil_g.LocalIndexToLocalCoor(idx,lcoor);
+
+	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
+	    FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
+	    FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
+	    FFTW<scalar>::fftw_execute_dft(p,in,out);
+	  }
+	}
+
+        Timer.Stop();
+	usec += Timer.useconds();
+	flops+= flops_call*NN;
+
+        int pc = processor_coor[dim];
+        for(int idx=0;idx<sgrid->lSites();idx++) { 
+	  std::vector<int> lcoor(Nd);
+	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
+	  std::vector<int> gcoor = lcoor;
+	  // extract the result
+	  sobj s;
+	  gcoor[dim] = lcoor[dim]+L*pc;
+	  peekLocalSite(s,pgresult,gcoor);
+	  pokeLocalSite(s,result,lcoor);
+	}
+      	  
+	FFTW<scalar>::fftw_destroy_plan(p);
+      }
+#endif
+
+
+    }
+
+  };
+
+
+}
+
+#endif
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -68,6 +68,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Simd.h>
 #include <Grid/Threads.h>
 #include <Grid/Lexicographic.h>
+#include <Grid/Init.h>
 #include <Grid/Communicator.h> 
 #include <Grid/Cartesian.h>    
 #include <Grid/Tensors.h>      
@@ -78,7 +79,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/parallelIO/BinaryIO.h>
 #include <Grid/qcd/QCD.h>
 #include <Grid/parallelIO/NerscIO.h>
-#include <Grid/Init.h>
+
+#include <Grid/FFT.h>

 #include <Grid/qcd/hmc/NerscCheckpointer.h>
 #include <Grid/qcd/hmc/HmcRunner.h>
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }
+
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
    std::vector<int> cores(0);
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
    GridLogConfigure(logstreams);
  }

-
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -17,8 +17,8 @@ endif
 include Make.inc
 include Eigen.inc

-lib_LTLIBRARIES = libGrid.la
+lib_LIBRARIES = libGrid.a

-libGrid_la_SOURCES             = $(CCFILES) $(extra_sources)
-libGrid_ladir                  = $(pkgincludedir)
+libGrid_a_SOURCES              = $(CCFILES) $(extra_sources)
+libGrid_adir                   = $(pkgincludedir)
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -265,7 +265,7 @@
 	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
       }
       inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
+	 //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 	 local = _entries[ent]._is_local;
 	 perm  = _entries[ent]._permute;
 	 if (perm)  ptype = _permute_type[point]; 
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -127,21 +127,12 @@ class CartesianCommunicator {
 			int recv_from_rank,
 			int bytes);

-    void SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-			    void *xmit,
-			    int xmit_to_rank,
-			    void *recv,
-			    int recv_from_rank,
-			    int bytes);
-
    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 			 void *xmit,
 			 int xmit_to_rank,
 			 void *recv,
 			 int recv_from_rank,
 			 int bytes);
-
-    void SendToRecvFromBegin(std::vector<CommsRequest_t> &list);
    void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

    ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -144,28 +144,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
-// Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-					       void *xmit,
-					       int dest,
-					       void *recv,
-					       int from,
-					       int bytes)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-  int rank = _processor;
-  int ierr;
-  ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
-  ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,dest,_processor,communicator,&rrq);
-  assert(ierr==0);
-  list.push_back(xrq);
-  list.push_back(rrq);
-}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  MPI_Startall(list.size(),&list[0]);
-}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -173,12 +151,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 						int from,
 						int bytes)
 {
-  std::vector<CommsRequest_t> reqs(0);
-  SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes);
-  SendToRecvFromBegin(reqs);
-  for(int i=0;i<reqs.size();i++){
-    list.push_back(reqs[i]);
-  }
+  MPI_Request xrq;
+  MPI_Request rrq;
+  int rank = _processor;
+  int ierr;
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+  
+  assert(ierr==0);
+
+  list.push_back(xrq);
+  list.push_back(rrq);
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -84,19 +84,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 {
  assert(0);
 }
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0);
-}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  assert(0);
-}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  assert(0);
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -268,10 +268,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
 }

 // Basic Halo comms primitive
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list)
-{
-  assert(0); //unimplemented
-}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
@@ -284,15 +280,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
  shmem_putmem(recv,xmit,bytes,dest);
 }
-void CartesianCommunicator::SendToRecvFromInit(std::vector<CommsRequest_t> &list,
-						void *xmit,
-						int dest,
-						void *recv,
-						int from,
-						int bytes)
-{
-  assert(0); // Unimplemented
-}
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  //  shmem_quiet();      // I'm done
--- a/lib/lattice/Lattice_transfer.h
+++ b/lib/lattice/Lattice_transfer.h
@@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
  }

-PARALLEL_FOR_LOOP
+  //PARALLEL_FOR_LOOP
  for(int idx=0;idx<ig->lSites();idx++){
    std::vector<int> lcoor(ni);
    ig->LocalIndexToLocalCoor(idx,lcoor);
@@ -446,6 +446,79 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in

 }

+
+template<class vobj>
+void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) { 
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,lowDim,lcoor);
+      pokeLocalSite(s,higherDim,hcoor);
+    }
+  }
+}
+
+
+template<class vobj>
+void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj s;
+
+  GridBase *lg = lowDim._grid;
+  GridBase *hg = higherDim._grid;
+  int nl = lg->_ndimension;
+  int nh = hg->_ndimension;
+
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+
+  for(int d=0;d<nh;d++){
+    assert(lg->_processors[d]  == hg->_processors[d]);
+    assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+  }
+
+  // the above should guarantee that the operations are local
+  //PARALLEL_FOR_LOOP
+  for(int idx=0;idx<lg->lSites();idx++){
+    std::vector<int> lcoor(nl);
+    std::vector<int> hcoor(nh);
+    lg->LocalIndexToLocalCoor(idx,lcoor);
+    if( lcoor[orthog] == slice_lo ) { 
+      hcoor=lcoor;
+      hcoor[orthog] = slice_hi;
+      peekLocalSite(s,higherDim,hcoor);
+      pokeLocalSite(s,lowDim,lcoor);
+    }
+  }
+}
+
+
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -194,22 +194,22 @@ class BinaryIO {

      std::vector<int> site({x,y,z,t});

-      if ( grid->IsBoss() ) {
-	fin.read((char *)&file_object,sizeof(file_object));
-	bytes += sizeof(file_object);
-	if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    le32toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    le64toh_v((void *)&file_object,sizeof(file_object));
+      if (grid->IsBoss()) {
+        fin.read((char *)&file_object, sizeof(file_object));
+        bytes += sizeof(file_object);
+        if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
+        if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));

-	munge(file_object,munged,csum);
+        munge(file_object, munged, csum);
      }
      // The boss who read the file has their value poked
      pokeSite(munged,Umu,site);
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl;

    return csum;
  }
@@ -254,20 +254,20 @@ class BinaryIO {

      
      if ( grid->IsBoss() ) {
-	
-	if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
-	if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
-	if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
-	if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));
+  
+  if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
+  if(ieee32)    htole32_v((void *)&file_object,sizeof(file_object));
+  if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
+  if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object));

-	// NB could gather an xstrip as an optimisation.
-	fout.write((char *)&file_object,sizeof(file_object));
-	bytes+=sizeof(file_object);
+  // NB could gather an xstrip as an optimisation.
+  fout.write((char *)&file_object,sizeof(file_object));
+  bytes+=sizeof(file_object);
      }
    }}}}
    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;

    return csum;
  }
@@ -305,15 +305,15 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);

      if( rank == grid->ThisRank() ){
-	//	std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
-	parallel.GetState(saved,l_idx);
+  //  std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
+  parallel.GetState(saved,l_idx);
      }

      grid->Broadcast(rank,(void *)&saved[0],bytes);

      if ( grid->IsBoss() ) {
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
-	fout.write((char *)&saved[0],bytes);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  fout.write((char *)&saved[0],bytes);
      }

    }
@@ -355,14 +355,14 @@ class BinaryIO {
      int l_idx=parallel.generator_idx(o_idx,i_idx);

      if ( grid->IsBoss() ) {
-	fin.read((char *)&saved[0],bytes);
-	Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
+  fin.read((char *)&saved[0],bytes);
+  Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
      }

      grid->Broadcast(0,(void *)&saved[0],bytes);

      if( rank == grid->ThisRank() ){
-	parallel.SetState(saved,l_idx);
+  parallel.SetState(saved,l_idx);
      }

    }
@@ -415,15 +415,15 @@ class BinaryIO {

      if ( d == 0 ) parallel[d] = 0;
      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  range[d] = grid->_ldimensions[d];
+  start[d] = grid->_processor_coor[d]*range[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
+  range[d] = grid->_gdimensions[d];
+  start[d] = 0;
+  ioproc[d]= 0;

-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }
      slice_vol = slice_vol * range[d];
    }
@@ -434,9 +434,9 @@ class BinaryIO {
      std::cout<< std::dec ;
      std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+  std::cout<< range[d];
+  if( d< grid->_ndimension-1 ) 
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@@ -463,7 +463,7 @@ class BinaryIO {

      // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@@ -472,8 +472,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);

      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  gsite[d] = tsite[d]+start[d];               // global site
      }

      /////////////////////////
@@ -487,29 +487,29 @@ class BinaryIO {
      // iorank reads from the seek
      ////////////////////////////////
      if (myrank == iorank) {
-	
-	fin.seekg(offset+g_idx*sizeof(fileObj));
-	fin.read((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
-	
-	if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
-	
-	munge(fileObj,siteObj,csum);
+  
+  fin.seekg(offset+g_idx*sizeof(fileObj));
+  fin.read((char *)&fileObj,sizeof(fileObj));
+  bytes+=sizeof(fileObj);
+  
+  if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    le32toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    le64toh_v((void *)&fileObj,sizeof(fileObj));
+  
+  munge(fileObj,siteObj,csum);

-      }	
+      } 

      // Possibly do transport through pt2pt 
      if ( rank != iorank ) { 
-	if ( (myrank == rank) || (myrank==iorank) ) {
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
-	}
+  if ( (myrank == rank) || (myrank==iorank) ) {
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
+  }
      }
      // Poke at destination
      if ( myrank == rank ) {
-	  pokeLocalSite(siteObj,Umu,lsite);
+    pokeLocalSite(siteObj,Umu,lsite);
      }
      grid->Barrier(); // necessary?
    }
@@ -520,7 +520,7 @@ class BinaryIO {

    timer.Stop();
    std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
    
    return csum;
  }
@@ -558,15 +558,15 @@ class BinaryIO {
      if ( d!= grid->_ndimension-1 ) parallel[d] = 0;

      if (parallel[d]) {
-	range[d] = grid->_ldimensions[d];
-	start[d] = grid->_processor_coor[d]*range[d];
-	ioproc[d]= grid->_processor_coor[d];
+  range[d] = grid->_ldimensions[d];
+  start[d] = grid->_processor_coor[d]*range[d];
+  ioproc[d]= grid->_processor_coor[d];
      } else {
-	range[d] = grid->_gdimensions[d];
-	start[d] = 0;
-	ioproc[d]= 0;
+  range[d] = grid->_gdimensions[d];
+  start[d] = 0;
+  ioproc[d]= 0;

-	if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
+  if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
      }

      slice_vol = slice_vol * range[d];
@@ -577,9 +577,9 @@ class BinaryIO {
      grid->GlobalSum(tmp);
      std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
      for(int d=0;d<grid->_ndimension;d++){
-	std::cout<< range[d];
-	if( d< grid->_ndimension-1 ) 
-	  std::cout<< " x ";
+  std::cout<< range[d];
+  if( d< grid->_ndimension-1 ) 
+    std::cout<< " x ";
      }
      std::cout << std::endl;
    }
@@ -610,7 +610,7 @@ class BinaryIO {
    // should aggregate a whole chunk and then write.
    // need to implement these loops in Nd independent way with a lexico conversion
    for(int tlex=0;tlex<slice_vol;tlex++){
-	
+  
      std::vector<int> tsite(nd); // temporary mixed up site
      std::vector<int> gsite(nd);
      std::vector<int> lsite(nd);
@@ -619,8 +619,8 @@ class BinaryIO {
      Lexicographic::CoorFromIndex(tsite,tlex,range);

      for(int d=0;d<nd;d++){
-	lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
-	gsite[d] = tsite[d]+start[d];               // global site
+  lsite[d] = tsite[d]%grid->_ldimensions[d];  // local site
+  gsite[d] = tsite[d]+start[d];               // global site
      }


@@ -640,26 +640,26 @@ class BinaryIO {

      // Pair of nodes may need to do pt2pt send
      if ( rank != iorank ) { // comms is necessary
-	if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
-	  // Send to IOrank 
-	  grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
-	}
+  if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+    // Send to IOrank 
+    grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
+  }
      }

      grid->Barrier(); // necessary?

      if (myrank == iorank) {
-	
-	munge(siteObj,fileObj,csum);
+  
+  munge(siteObj,fileObj,csum);

-	if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
-	if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
-	
-	fout.seekp(offset+g_idx*sizeof(fileObj));
-	fout.write((char *)&fileObj,sizeof(fileObj));
-	bytes+=sizeof(fileObj);
+  if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee32)    htole32_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
+  if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj));
+  
+  fout.seekp(offset+g_idx*sizeof(fileObj));
+  fout.write((char *)&fileObj,sizeof(fileObj));
+  bytes+=sizeof(fileObj);
      }
    }

@@ -668,7 +668,7 @@ class BinaryIO {

    timer.Stop();
    std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
-	     << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;
+       << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl;

    return csum;
  }
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -55,11 +55,14 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // QCD iMatrix types
    // Index conventions:                            Lorentz x Spin x Colour
+    // note: static const int or constexpr will work for type deductions
+    //       with the intel compiler (up to version 17)
    //////////////////////////////////////////////////////////////////////////////
-    static const int ColourIndex = 2;
-    static const int SpinIndex   = 1;
-    static const int LorentzIndex= 0;
+    #define ColourIndex  2
+    #define SpinIndex    1
+    #define LorentzIndex 0

+  
    // Also should make these a named enum type
    static const int DaggerNo=0;
    static const int DaggerYes=1;
--- a/lib/qcd/action/Actions.h
+++ b/lib/qcd/action/Actions.h
@@ -111,12 +111,16 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #define FermOp4dVecTemplateInstantiate(A) \
  template class A<WilsonImplF>;		\
  template class A<WilsonImplD>;		\
+  template class A<ZWilsonImplF>;		\
+  template class A<ZWilsonImplD>;		\
  template class A<GparityWilsonImplF>;		\
  template class A<GparityWilsonImplD>;		

 #define FermOp5dVecTemplateInstantiate(A) \
  template class A<DomainWallVec5dImplF>;	\
-  template class A<DomainWallVec5dImplD>;	
+  template class A<DomainWallVec5dImplD>;	\
+  template class A<ZDomainWallVec5dImplF>;	\
+  template class A<ZDomainWallVec5dImplD>;	

 #define FermOpTemplateInstantiate(A) \
 FermOp4dVecTemplateInstantiate(A) \
@@ -138,6 +142,7 @@ typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeAction
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/DomainWallFermion.h>
 #include <Grid/qcd/action/fermion/MobiusFermion.h>
+#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
 #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
 #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
 #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
@@ -176,6 +181,11 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
+
+typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
+typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
+typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
+
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -54,18 +54,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag (Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0); upper[Ls-1]=mass;
-  std::vector<RealD> lower(Ls,-1.0); lower[0]   =mass;
+  std::vector<Coeff_t> diag (Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
+  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass;
  M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bs;
-  std::vector<RealD> upper= cs;
-  std::vector<RealD> lower= cs; 
+  std::vector<Coeff_t> diag = bs;
+  std::vector<Coeff_t> upper= cs;
+  std::vector<Coeff_t> lower= cs; 
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
@@ -73,9 +73,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = beo;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = beo;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-ceo[i];
    lower[i]=-ceo[i];
@@ -88,9 +88,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);
  for(int i=0;i<Ls;i++) {
    upper[i]=-cee[i];
    lower[i]=-cee[i];
@@ -104,9 +104,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag = bee;
-  std::vector<RealD> upper(Ls);
-  std::vector<RealD> lower(Ls);
+  std::vector<Coeff_t> diag = bee;
+  std::vector<Coeff_t> upper(Ls);
+  std::vector<Coeff_t> lower(Ls);

  for (int s=0;s<Ls;s++){
    // Assemble the 5d matrix
@@ -129,9 +129,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag(Ls,1.0);
-  std::vector<RealD> upper(Ls,-1.0);
-  std::vector<RealD> lower(Ls,-1.0);
+  std::vector<Coeff_t> diag(Ls,1.0);
+  std::vector<Coeff_t> upper(Ls,-1.0);
+  std::vector<Coeff_t> lower(Ls,-1.0);
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,chi,chi,lower,diag,upper);
@@ -141,9 +141,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din)
 {
  int Ls=this->Ls;
-  std::vector<RealD> diag =bs;
-  std::vector<RealD> upper=cs;
-  std::vector<RealD> lower=cs;
+  std::vector<Coeff_t> diag =bs;
+  std::vector<Coeff_t> upper=cs;
+  std::vector<Coeff_t> lower=cs;
  upper[Ls-1]=-mass*upper[Ls-1];
  lower[0]   =-mass*lower[0];
  M5Ddag(psi,psi,Din,lower,diag,upper);
@@ -273,11 +273,21 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  SetCoefficientsZolotarev(1.0,zdata,b,c);
+  std::vector<Coeff_t> gamma(this->Ls);
+  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
+  SetCoefficientsInternal(1.0,gamma,b,c);
 }
 //Zolo
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
+{
+  std::vector<Coeff_t> gamma(this->Ls);
+  for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
+  SetCoefficientsInternal(zolo_hi,gamma,b,c);
+}
+//Zolo
+template<class Impl>
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
 {
  int Ls=this->Ls;

@@ -315,7 +325,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
  double bmc = b-c;
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
-    omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code
+    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
@@ -377,7 +387,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot
  }
 	
  { 
-    double delta_d=mass*cee[Ls-1];
+    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
--- a/lib/qcd/action/fermion/CayleyFermion5D.h
+++ b/lib/qcd/action/fermion/CayleyFermion5D.h
@@ -62,16 +62,16 @@ namespace Grid {
      void M5D(const FermionField &psi,
 	       const FermionField &phi, 
 	       FermionField &chi,
-	       std::vector<RealD> &lower,
-	       std::vector<RealD> &diag,
-	       std::vector<RealD> &upper);
+	       std::vector<Coeff_t> &lower,
+	       std::vector<Coeff_t> &diag,
+	       std::vector<Coeff_t> &upper);

      void M5Ddag(const FermionField &psi,
 		  const FermionField &phi, 
 		  FermionField &chi,
-		  std::vector<RealD> &lower,
-		  std::vector<RealD> &diag,
-		  std::vector<RealD> &upper);
+		  std::vector<Coeff_t> &lower,
+		  std::vector<Coeff_t> &diag,
+		  std::vector<Coeff_t> &upper);
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);

      virtual void   Instantiatable(void)=0;
@@ -91,23 +91,23 @@ namespace Grid {
      RealD mass;

      // Cayley form Moebius (tanh and zolotarev)
-      std::vector<RealD> omega; 
-      std::vector<RealD> bs;    // S dependent coeffs
-      std::vector<RealD> cs;    
-      std::vector<RealD> as;    
+      std::vector<Coeff_t> omega; 
+      std::vector<Coeff_t> bs;    // S dependent coeffs
+      std::vector<Coeff_t> cs;    
+      std::vector<Coeff_t> as;    
      // For preconditioning Cayley form
-      std::vector<RealD> bee;    
-      std::vector<RealD> cee;    
-      std::vector<RealD> aee;    
-      std::vector<RealD> beo;    
-      std::vector<RealD> ceo;    
-      std::vector<RealD> aeo;    
+      std::vector<Coeff_t> bee;    
+      std::vector<Coeff_t> cee;    
+      std::vector<Coeff_t> aee;    
+      std::vector<Coeff_t> beo;    
+      std::vector<Coeff_t> ceo;    
+      std::vector<Coeff_t> aeo;    
      // LDU factorisation of the eeoo matrix
-      std::vector<RealD> lee;    
-      std::vector<RealD> leem;    
-      std::vector<RealD> uee;    
-      std::vector<RealD> ueem;    
-      std::vector<RealD> dee;    
+      std::vector<Coeff_t> lee;    
+      std::vector<Coeff_t> leem;    
+      std::vector<Coeff_t> uee;    
+      std::vector<Coeff_t> ueem;    
+      std::vector<Coeff_t> dee;    

      // Constructors
      CayleyFermion5D(GaugeField &_Umu,
@@ -117,20 +117,19 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

-
-
    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
+      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
    };

  }
 }
 #define INSTANTIATE_DPERP(A)\
 template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
-					   std::vector<RealD> &lower,std::vector<RealD> &diag,std::vector<RealD> &upper); \
+					   std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
 template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \
 template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);

--- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc
@@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
@@ -82,9 +82,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
@@ -204,6 +204,8 @@ PARALLEL_FOR_LOOP
  INSTANTIATE_DPERP(WilsonImplD);
  INSTANTIATE_DPERP(GparityWilsonImplF);
  INSTANTIATE_DPERP(GparityWilsonImplD);
+  INSTANTIATE_DPERP(ZWilsonImplF);
+  INSTANTIATE_DPERP(ZWilsonImplD);
 #endif

 }}
--- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc
@@ -43,9 +43,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
@@ -65,9 +65,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
--- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc
@@ -53,9 +53,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 				const FermionField &phi, 
 				FermionField &chi,
-				std::vector<RealD> &lower,
-				std::vector<RealD> &diag,
-				std::vector<RealD> &upper)
+				std::vector<Coeff_t> &lower,
+				std::vector<Coeff_t> &diag,
+				std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
@@ -121,9 +121,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 				   const FermionField &phi, 
 				   FermionField &chi,
-				   std::vector<RealD> &lower,
-				   std::vector<RealD> &diag,
-				   std::vector<RealD> &upper)
+				   std::vector<Coeff_t> &lower,
+				   std::vector<Coeff_t> &diag,
+				   std::vector<Coeff_t> &upper)
 {
  GridBase *grid=psi._grid;
  int Ls   = this->Ls;
@@ -194,8 +194,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField

  chi.checkerboard=psi.checkerboard;
  
-  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
-  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
  
  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
@@ -212,8 +212,8 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];
  
-  Eigen::MatrixXd PplusMat ;
-  Eigen::MatrixXd PminusMat;
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
  
  if ( inv ) {
    PplusMat =Pplus.inverse();
@@ -298,8 +298,12 @@ PARALLEL_FOR_LOOP

 INSTANTIATE_DPERP(DomainWallVec5dImplD);
 INSTANTIATE_DPERP(DomainWallVec5dImplF);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
+INSTANTIATE_DPERP(ZDomainWallVec5dImplF);

 template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
+template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);

 }}
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -100,7 +100,8 @@ namespace Grid {
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
    typedef typename Impl::Compressor               Compressor;		\
    typedef typename Impl::StencilImpl             StencilImpl;	  \
-    typedef typename Impl::ImplParams ImplParams;
+    typedef typename Impl::ImplParams ImplParams; \
+    typedef typename Impl::Coeff_t       Coeff_t;

 #define INHERIT_IMPL_TYPES(Base) \
    INHERIT_GIMPL_TYPES(Base)\
@@ -109,12 +110,14 @@ namespace Grid {
    ///////
    // Single flavour four spinors with colour index
    ///////
-    template<class S,int Nrepresentation=Nc>
+    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
    class WilsonImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S, Nrepresentation> > { 
    public:

+
      const bool LsVectorised=false;

+      typedef _Coeff_t Coeff_t;
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@@ -192,12 +195,13 @@ PARALLEL_FOR_LOOP
    ///////
    // Single flavour four spinors with colour index, 5d redblack
    ///////
-    template<class S,int Nrepresentation=Nc>
+    template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
    class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
    public:
    
      const bool LsVectorised=true;

+      typedef _Coeff_t Coeff_t;
      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@@ -287,12 +291,13 @@ PARALLEL_FOR_LOOP
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////

-    template<class S,int Nrepresentation>
+    template<class S,int Nrepresentation,class _Coeff_t = RealD>
    class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> >{ 
    public:

      const bool LsVectorised=false;

+      typedef _Coeff_t Coeff_t;
      typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;

      INHERIT_GIMPL_TYPES(Gimpl);
@@ -483,6 +488,18 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

+    typedef WilsonImpl<vComplex ,Nc,ComplexD> ZWilsonImplR; // Real.. whichever prec
+    typedef WilsonImpl<vComplexF,Nc,ComplexD> ZWilsonImplF; // Float
+    typedef WilsonImpl<vComplexD,Nc,ComplexD> ZWilsonImplD; // Double
+
+    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
+
+    typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
+    typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
+    typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
+
    typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
    typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
    typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -68,16 +68,21 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,
 					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 {
-  // No asm implementation yet.
-  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-  //  else
-  for(int site=0;site<Ns;site++) {
-    for(int s=0;s<Ls;s++) {
-      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-      sF++;
+#ifdef AVX512
+  if ( AsmOpt ) {
+    WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+  } else {
+#else
+  {  
+#endif
+    for(int site=0;site<Ns;site++) {
+      for(int s=0;s<Ls;s++) {
+	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+	else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+	sF++;
+      }
+      sU++;
    }
-    sU++;
  }
 }

--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -79,6 +79,10 @@ namespace Grid {
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);

+     void DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
+

     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
@@ -92,7 +96,25 @@ namespace Grid {
     WilsonKernels(const ImplParams &p= ImplParams());
     
    };
-
+    
+    ///////////////////////////////////////////////////////////
+    // Default to no assembler implementation
+    ///////////////////////////////////////////////////////////
+    template<class Impl>
+    void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    {
+      assert(0);
+    }
+    template<class Impl>
+    void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+						      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+						      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+    {
+      assert(0);
+    }
+        
  }
 }
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -26,59 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */

 #include <Grid.h>

 namespace Grid {
-namespace QCD {
-
-
-  ///////////////////////////////////////////////////////////
-  // Default to no assembler implementation
-  ///////////////////////////////////////////////////////////
-template<class Impl>
-void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-					       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
-{
-  assert(0);
-}
-
+  namespace QCD {
+    
 #if defined(AVX512) 
-
-
-  ///////////////////////////////////////////////////////////
-  // If we are AVX512 specialise the single precision routine
-  ///////////////////////////////////////////////////////////
-
+    
+    
+    ///////////////////////////////////////////////////////////
+    // If we are AVX512 specialise the single precision routine
+    ///////////////////////////////////////////////////////////
+    
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-
-static Vector<vComplexF> signs;
-
-int setupSigns(void ){
-  Vector<vComplexF> bother(2);
-  signs = bother;
-  vrsign(signs[0]);
-  visign(signs[1]);
-  return 1;
-}
-static int signInit = setupSigns();
-
+    
+    static Vector<vComplexF> signs;
+    
+    int setupSigns(void ){
+      Vector<vComplexF> bother(2);
+      signs = bother;
+      vrsign(signs[0]);
+      visign(signs[1]);
+      return 1;
+    }
+    static int signInit = setupSigns();
+  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
-
+  
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
-template<>
-void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-						     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+  
+#undef KERNEL_DAG
+    template<>
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+      
+#define KERNEL_DAG
+    template<>
+    void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
@@ -89,32 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
 #define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 #define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
-template<>
-void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+				    
+#undef KERNEL_DAG
+    template<>
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
-
+				    
+#define KERNEL_DAG
+    template<>
+    void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
+								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
 #endif
-
-template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-
-template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
-}}
+  }
+}

--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -30,7 +30,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);
+#ifdef KERNEL_DAG
+    XP_PROJMEM(base);
+#else 
    XM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@@ -41,15 +45,22 @@
    MULT_2SPIN_DIR_PFXP(Xp,basep);
  }
  LOAD64(%r10,isigns);
+#ifdef KERNEL_DAG
+  XP_RECON;
+#else
  XM_RECON;
-
+#endif
  ////////////////////////////////
  // Yp
  ////////////////////////////////
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    YP_PROJMEM(base);
+#else
    YM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@@ -60,7 +71,11 @@
    MULT_2SPIN_DIR_PFYP(Yp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  YP_RECON_ACCUM;
+#else
  YM_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Zp
@@ -68,7 +83,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    ZP_PROJMEM(base);
+#else
    ZM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@@ -79,7 +98,11 @@
    MULT_2SPIN_DIR_PFZP(Zp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  ZP_RECON_ACCUM;
+#else
  ZM_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Tp
@@ -87,7 +110,11 @@
  basep = st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    TP_PROJMEM(base);
+#else
    TM_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
@@ -98,7 +125,11 @@
    MULT_2SPIN_DIR_PFTP(Tp,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  TP_RECON_ACCUM;
+#else
  TM_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Xm
@@ -107,7 +138,11 @@
  //  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    XM_PROJMEM(base);
+#else
    XP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR3,perm);
  } else { 
    LOAD_CHI(base);
@@ -118,7 +153,11 @@
    MULT_2SPIN_DIR_PFXM(Xm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  XM_RECON_ACCUM;
+#else
  XP_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Ym
@@ -126,7 +165,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    YM_PROJMEM(base);
+#else
    YP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR2,perm);
  } else { 
    LOAD_CHI(base);
@@ -137,7 +180,11 @@
    MULT_2SPIN_DIR_PFYM(Ym,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  YM_RECON_ACCUM;
+#else
  YP_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Zm
@@ -145,7 +192,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    ZM_PROJMEM(base);
+#else
    ZP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR1,perm);
  } else { 
    LOAD_CHI(base);
@@ -156,7 +207,11 @@
    MULT_2SPIN_DIR_PFZM(Zm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  ZM_RECON_ACCUM;
+#else
  ZP_RECON_ACCUM;
+#endif

  ////////////////////////////////
  // Tm
@@ -164,7 +219,11 @@
  basep= st.GetPFInfo(nent,plocal); nent++;
  if ( local ) {
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+    TM_PROJMEM(base);
+#else
    TP_PROJMEM(base);
+#endif
    MAYBEPERM(PERMUTE_DIR0,perm);
  } else { 
    LOAD_CHI(base);
@@ -175,7 +234,11 @@
    MULT_2SPIN_DIR_PFTM(Tm,basep);
  }
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
+#ifdef KERNEL_DAG
+  TM_RECON_ACCUM;
+#else
  TP_RECON_ACCUM;
+#endif

  basep= st.GetPFInfo(nent,plocal); nent++;
  SAVE_RESULT(base,basep);
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -839,46 +839,23 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 ////////////// Wilson ; uses this implementation /////////////////////
 // Need Nc=3 though //

-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-							       int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								  int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+#define INSTANTIATE_THEM(A) \
+template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
+							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
+							       int ss,int sU,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\
+								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\
 								  int ss,int sU,const FermionField &in, FermionField &out);

-
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-
-
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-								      int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-template void WilsonKernels<DomainWallVec5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
-									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
-									 int ss,int sU,const FermionField &in, FermionField &out);
-
+INSTANTIATE_THEM(WilsonImplF);
+INSTANTIATE_THEM(WilsonImplD);
+INSTANTIATE_THEM(ZWilsonImplF);
+INSTANTIATE_THEM(ZWilsonImplD);
+INSTANTIATE_THEM(GparityWilsonImplF);
+INSTANTIATE_THEM(GparityWilsonImplD);
+INSTANTIATE_THEM(DomainWallVec5dImplF);
+INSTANTIATE_THEM(DomainWallVec5dImplD);
+INSTANTIATE_THEM(ZDomainWallVec5dImplF);
+INSTANTIATE_THEM(ZDomainWallVec5dImplD);

 }}
--- a/lib/qcd/action/fermion/ZMobiusFermion.h
+++ b/lib/qcd/action/fermion/ZMobiusFermion.h
@@ -0,0 +1,79 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/MobiusFermion.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef  GRID_QCD_ZMOBIUS_FERMION_H
+#define  GRID_QCD_ZMOBIUS_FERMION_H
+
+#include <Grid/Grid.h>
+
+namespace Grid {
+
+  namespace QCD {
+
+    template<class Impl>
+    class ZMobiusFermion : public CayleyFermion5D<Impl>
+    {
+    public:
+     INHERIT_IMPL_TYPES(Impl);
+    public:
+
+      virtual void   Instantiatable(void) {};
+      // Constructors
+      ZMobiusFermion(GaugeField &_Umu,
+		     GridCartesian         &FiveDimGrid,
+		     GridRedBlackCartesian &FiveDimRedBlackGrid,
+		     GridCartesian         &FourDimGrid,
+		     GridRedBlackCartesian &FourDimRedBlackGrid,
+		     RealD _mass,RealD _M5,
+		     std::vector<ComplexD> &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : 
+      
+      CayleyFermion5D<Impl>(_Umu,
+			    FiveDimGrid,
+			    FiveDimRedBlackGrid,
+			    FourDimGrid,
+			    FourDimRedBlackGrid,_mass,_M5,p)
+
+      {
+	RealD eps = 1.0;
+	
+	std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+	std::vector<Coeff_t> zgamma(this->Ls);
+	for(int s=0;s<this->Ls;s++){
+	  zgamma[s] = gamma[s];
+	}
+
+	// Call base setter
+	this->SetCoefficientsInternal(1.0,zgamma,b,c);
+      }
+
+    };
+
+  }
+}
+
+#endif
--- a/lib/simd/Grid_qpx.h
+++ b/lib/simd/Grid_qpx.h
@@ -1,300 +1,421 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/simd/Grid_qpx.h
-
-    Copyright (C) 2015
-
-Author: neo <cossu@post.kek.jp>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-//----------------------------------------------------------------------
-/*! @file Grid_qpx.h
-  @brief Optimization libraries for QPX instructions set for BG/Q
-
-  Using intrinsics
-*/
-// Time-stamp: <2015-05-27 11:30:21 neo>
-//----------------------------------------------------------------------
-
-// lot of undefined functions
+/*******************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/simd/Grid_qpx.h
+ 
+ Copyright (C) 2016
+ 
+ Author: Antonin Portelli <antonin.portelli@me.com>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ ******************************************************************************/

+namespace Grid {
 namespace Optimization {
+  typedef struct 
+  {
+    float v0,v1,v2,v3;
+  } vector4float;
+
+  inline std::ostream & operator<<(std::ostream& stream, const vector4double a)
+  {
+    stream << "{"<<vec_extract(a,0)<<","<<vec_extract(a,1)<<","<<vec_extract(a,2)<<","<<vec_extract(a,3)<<"}";
+    return stream;
+  };
+
+  inline std::ostream & operator<<(std::ostream& stream, const vector4float a)
+  {
+    stream << "{"<< a.v0 <<","<< a.v1 <<","<< a.v2 <<","<< a.v3 <<"}";
+    return stream;
+  };
  
  struct Vsplat{
    //Complex float
-    inline float operator()(float a, float b){
-      return {a,b,a,b};
+    inline vector4float operator()(float a, float b){
+      return (vector4float){a, b, a, b};
    }
    // Real float
-    inline float operator()(float a){
-      return {a,a,a,a};
+    inline vector4float operator()(float a){
+      return (vector4float){a, a, a, a};
    }
    //Complex double
    inline vector4double operator()(double a, double b){
-      return {a,b,a,b};
+      return (vector4double){a, b, a, b};
    }
    //Real double
    inline vector4double operator()(double a){
-      return {a,a,a,a};
+      return (vector4double){a, a, a, a};
    }
    //Integer
    inline int operator()(Integer a){
-#error
+      return a;
    }
  };
-
+  
  struct Vstore{
-    //Float 
-    inline void operator()(float a, float* F){
-      assert(0);
+    //Float
+    inline void operator()(vector4double a, float *f){
+      vec_st(a, 0, f);
    }
+
+    inline void operator()(vector4double a, vector4float &f){
+      vec_st(a, 0, (float *)(&f));
+    }
+
+    inline void operator()(vector4float a, float *f){
+      f[0] = a.v0;
+      f[1] = a.v1;
+      f[2] = a.v2;
+      f[3] = a.v3;
+    }
+
    //Double
-    inline void operator()(vector4double a, double* D){
-      assert(0);
+    inline void operator()(vector4double a, double *d){
+      vec_st(a, 0, d);
    }
    //Integer
-    inline void operator()(int a, Integer* I){
-      assert(0);
+    inline void operator()(int a, Integer *i){
+      i[0] = a;
    }
-
  };
-
-
+  
  struct Vstream{
    //Float
-    inline void operator()(float * a, float b){
-      assert(0);
-    }
-    //Double
-    inline void operator()(double * a, vector4double b){
-      assert(0);
+    inline void operator()(float *f, vector4double a){
+      vec_st(a, 0, f);
    }

+    inline void operator()(vector4float f, vector4double a){
+      vec_st(a, 0, (float *)(&f));
+    }
+
+    inline void operator()(float *f, vector4float a){
+      f[0] = a.v0;
+      f[1] = a.v1;
+      f[2] = a.v2;
+      f[3] = a.v3;
+    }
+
+    //Double
+    inline void operator()(double *d, vector4double a){
+      vec_st(a, 0, d);
+    }

  };
-
-
-
+  
  struct Vset{
-    // Complex float 
-    inline float operator()(Grid::ComplexF *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+    // Complex float
+    inline vector4float operator()(Grid::ComplexF *a){
+      return (vector4float){a[0].real(), a[0].imag(), a[1].real(), a[1].imag()};
    }
-    // Complex double 
+    // Complex double
    inline vector4double operator()(Grid::ComplexD *a){
-      return {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
+      return vec_ld(0, (double *)a);
    }
-    // Real float 
-    inline float operator()(float *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+
+    // Real float
+    inline vector4float operator()(float *a){
+      return (vector4float){a[0], a[1], a[2], a[3]};
    }
+
+    inline vector4double operator()(vector4float a){
+      return vec_ld(0, (float *)(&a));
+    }
+
    // Real double
    inline vector4double operator()(double *a){
-      return {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
+      return vec_ld(0, a);
    }
    // Integer
    inline int operator()(Integer *a){
-#error
+      return a[0];
    }
-
-
+    
+    
  };
-
+  
  template <typename Out_type, typename In_type>
-    struct Reduce{
-      //Need templated class to overload output type
-      //General form must generate error if compiled
-      inline Out_type operator()(In_type in){
-	printf("Error, using wrong Reduce function\n");
-	exit(1);
-	return 0;
-      }
-    };
-
-
- 
-
+  struct Reduce{
+    //Need templated class to overload output type
+    //General form must generate error if compiled
+    inline Out_type operator()(In_type in){
+      printf("Error, using wrong Reduce function\n");
+      exit(1);
+      return 0;
+    }
+  };
+  
  /////////////////////////////////////////////////////
  // Arithmetic operations
  /////////////////////////////////////////////////////
+  #define FLOAT_WRAP_2(fn, pref)\
+  pref vector4float fn(vector4float a, vector4float b)\
+  {\
+    vector4double ad, bd, rd;\
+    vector4float  r;\
+    \
+    ad = Vset()(a);\
+    bd = Vset()(b);\
+    rd = fn(ad, bd);\
+    Vstore()(rd, r);\
+    \
+    return r;\
+  }
+
+  #define FLOAT_WRAP_1(fn, pref)\
+  pref vector4float fn(vector4float a)\
+  {\
+    vector4double ad, rd;\
+    vector4float  r;\
+    \
+    ad = Vset()(a);\
+    rd = fn(ad);\
+    Vstore()(rd, r);\
+    \
+    return r;\
+  }
+
  struct Sum{
-    //Complex/Real float
-    inline float operator()(float a, float b){
-#error
-    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-      return vec_add(a,b);
+      return vec_add(a, b);
    }
+
+    //Complex/Real float
+    FLOAT_WRAP_2(operator(), inline)
+
    //Integer
    inline int operator()(int a, int b){
-#error
+      return a + b;
    }
  };
-
+  
  struct Sub{
-    //Complex/Real float
-    inline float operator()(float a, float b){
-#error
-    }
    //Complex/Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_sub(a, b);
    }
+
+    //Complex/Real float
+    FLOAT_WRAP_2(operator(), inline)
+
    //Integer
-    inline floati operator()(int a, int b){
-#error
+    inline int operator()(int a, int b){
+      return a - b;
    }
  };
-
-
+  
  struct MultComplex{
-    // Complex float
-    inline float operator()(float a, float b){
-#error
-    }
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_xxnpmadd(a, b, vec_xmul(b, a));
    }
-  };

+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
+  };
+  
  struct Mult{
-    // Real float
-    inline float operator()(float a, float b){
-#error
-    }
    // Real double
    inline vector4double operator()(vector4double a, vector4double b){
-#error
+      return vec_mul(a, b);
    }
+
+    // Real float
+    FLOAT_WRAP_2(operator(), inline)
+
    // Integer
    inline int operator()(int a, int b){
-#error
+      return a*b;
    }
  };
-
-
+  
  struct Conj{
-    // Complex single
-    inline float operator()(float in){
-      assert(0);
-    }
    // Complex double
-    inline vector4double operator()(vector4double in){
-      assert(0);
+    inline vector4double operator()(vector4double v){
+      return vec_mul(v, (vector4double){1., -1., 1., -1.});
    }
-    // do not define for integer input
-  };

+    // Complex float
+    FLOAT_WRAP_1(operator(), inline)
+  };
+  
  struct TimesMinusI{
-    //Complex single
-    inline float operator()(float in, float ret){
-      assert(0);
-    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
-      assert(0);
+    inline vector4double operator()(vector4double v, vector4double ret){
+      return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
+                               (vector4double){0., 0., 0., 0.});
    }

-
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
  };
-
+  
  struct TimesI{
-    //Complex single
-    inline float operator()(float in, float ret){
-  
-    }
    //Complex double
-    inline vector4double operator()(vector4double in, vector4double ret){
-  
+    inline vector4double operator()(vector4double v, vector4double ret){
+      return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
+                              (vector4double){0., 0., 0., 0.});
    }

-
+    // Complex float
+    FLOAT_WRAP_2(operator(), inline)
  };

+  struct Permute{
+    //Complex double
+    static inline vector4double Permute0(vector4double v){ //0123 -> 2301
+      return vec_perm(v, v, vec_gpci(02301));
+    };
+    static inline vector4double Permute1(vector4double v){ //0123 -> 1032
+      return vec_perm(v, v, vec_gpci(01032));
+    };
+    static inline vector4double Permute2(vector4double v){
+      return v;
+    };
+    static inline vector4double Permute3(vector4double v){
+      return v;
+    };

+    // Complex float
+    FLOAT_WRAP_1(Permute0, static inline)
+    FLOAT_WRAP_1(Permute1, static inline)
+    FLOAT_WRAP_1(Permute2, static inline)
+    FLOAT_WRAP_1(Permute3, static inline)
+  };
  
+  struct Rotate{
+    static inline vector4double rotate(vector4double v, int n){
+      switch(n){
+        case 0:
+          return v;
+          break;
+        case 1:
+          return vec_perm(v, v, vec_gpci(01230));
+          break;
+        case 2:
+          return vec_perm(v, v, vec_gpci(02301));
+          break;
+        case 3:
+          return vec_perm(v, v, vec_gpci(03012));
+          break;
+        default: assert(0);
+      }
+    }

+    static inline vector4float rotate(vector4float v, int n){
+      vector4double vd, rd;
+      vector4float  r;

-  //////////////////////////////////////////////
-  // Some Template specialization
+      vd = Vset()(v);
+      rd = rotate(vd, n);
+      Vstore()(rd, r);
+
+      return r;
+    }
+  };
  
  //Complex float Reduce
  template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
-    assert(0);
+  inline Grid::ComplexF
+  Reduce<Grid::ComplexF, vector4float>::operator()(vector4float v) { //2 complex
+    vector4float v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = Optimization::Sum()(v1, v);
+    
+    return Grid::ComplexF(v1.v0, v1.v1);
  }
  //Real float Reduce
  template<>
-    inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
-    assert(0);
+  inline Grid::RealF
+  Reduce<Grid::RealF, vector4float>::operator()(vector4float v){ //4 floats
+    vector4float v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = Optimization::Sum()(v1, v);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = Optimization::Sum()(v1, v2);
+    
+    return v1.v0;
  }
  
  
  //Complex double Reduce
  template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, vector4double>::operator()(vector4double in){
-    assert(0);
+  inline Grid::ComplexD
+  Reduce<Grid::ComplexD, vector4double>::operator()(vector4double v){ //2 complex
+    vector4double v1;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    
+    return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1));
  }
  
  //Real double Reduce
  template<>
-    inline Grid::RealD Reduce<Grid::RealD, vector4double>::operator()(vector4double in){
-    assert(0);
-  }
+  inline Grid::RealD
+  Reduce<Grid::RealD, vector4double>::operator()(vector4double v){ //4 doubles
+    vector4double v1,v2;
+    
+    v1 = Optimization::Permute::Permute0(v);
+    v1 = vec_add(v1, v);
+    v2 = Optimization::Permute::Permute1(v1);
+    v1 = vec_add(v1, v2);

+    return vec_extract(v1, 0);
+  }
+  
  //Integer Reduce
  template<>
-    inline Integer Reduce<Integer, floati>::operator()(float in){
+  inline Integer Reduce<Integer, int>::operator()(int in){
+    // FIXME unimplemented
+    printf("Reduce : Missing integer implementation -> FIX\n");
    assert(0);
  }
-  
-  
 }

-//////////////////////////////////////////////////////////////////////////////////////
-// Here assign types 
-namespace Grid {
-  typedef float SIMD_Ftype  __attribute__ ((vector_size (16)));         // Single precision type
-  typedef vector4double SIMD_Dtype; // Double precision type
-  typedef int SIMD_Itype;           // Integer type
+////////////////////////////////////////////////////////////////////////////////
+// Here assign types
+typedef Optimization::vector4float SIMD_Ftype;  // Single precision type
+typedef vector4double              SIMD_Dtype; // Double precision type
+typedef int                        SIMD_Itype; // Integer type

-  inline void v_prefetch0(int size, const char *ptr){};
-
-  // Function name aliases
-  typedef Optimization::Vsplat   VsplatSIMD;
-  typedef Optimization::Vstore   VstoreSIMD;
-  typedef Optimization::Vset     VsetSIMD;
-  typedef Optimization::Vstream  VstreamSIMD;
-  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
+// prefetch utilities
+inline void v_prefetch0(int size, const char *ptr){};
+inline void prefetch_HINT_T0(const char *ptr){};


-  // Arithmetic operations
-  typedef Optimization::Sum         SumSIMD;
-  typedef Optimization::Sub         SubSIMD;
-  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
-  typedef Optimization::Conj        ConjSIMD;
-  typedef Optimization::TimesMinusI TimesMinusISIMD;
-  typedef Optimization::TimesI      TimesISIMD;
+// Function name aliases
+typedef Optimization::Vsplat   VsplatSIMD;
+typedef Optimization::Vstore   VstoreSIMD;
+typedef Optimization::Vset     VsetSIMD;
+typedef Optimization::Vstream  VstreamSIMD;
+template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;

+// Arithmetic operations
+typedef Optimization::Sum         SumSIMD;
+typedef Optimization::Sub         SubSIMD;
+typedef Optimization::Mult        MultSIMD;
+typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::Conj        ConjSIMD;
+typedef Optimization::TimesMinusI TimesMinusISIMD;
+typedef Optimization::TimesI      TimesISIMD;
+  
 }
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -388,6 +388,12 @@ class Grid_simd {

 };  // end of Grid_simd class definition

+
+inline void permute(ComplexD &y,ComplexD b, int perm) {  y=b; }
+inline void permute(ComplexF &y,ComplexF b, int perm) {  y=b; }
+inline void permute(RealD &y,RealD b, int perm) {  y=b; }
+inline void permute(RealF &y,RealF b, int perm) {  y=b; }
+
 ////////////////////////////////////////////////////////////////////
 // General rotate
 ////////////////////////////////////////////////////////////////////
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,15 +67,13 @@ template <class scalar>
 struct AsinRealFunctor {
  scalar operator()(const scalar &a) const { return asin(real(a)); }
 };
-
 template <class scalar>
 struct LogRealFunctor {
  scalar operator()(const scalar &a) const { return log(real(a)); }
 };
-
 template <class scalar>
-struct ExpRealFunctor {
-  scalar operator()(const scalar &a) const { return exp(real(a)); }
+struct ExpFunctor {
+  scalar operator()(const scalar &a) const { return exp(a); }
 };
 template <class scalar>
 struct NotFunctor {
@@ -85,7 +83,6 @@ template <class scalar>
 struct AbsRealFunctor {
  scalar operator()(const scalar &a) const { return std::abs(real(a)); }
 };
-
 template <class scalar>
 struct PowRealFunctor {
  double y;
@@ -135,7 +132,6 @@ template <class Scalar>
 inline Scalar rsqrt(const Scalar &r) {
  return (RSqrtRealFunctor<Scalar>(), r);
 }
-
 template <class S, class V>
 inline Grid_simd<S, V> cos(const Grid_simd<S, V> &r) {
  return SimdApply(CosRealFunctor<S>(), r);
@@ -162,7 +158,7 @@ inline Grid_simd<S, V> abs(const Grid_simd<S, V> &r) {
 }
 template <class S, class V>
 inline Grid_simd<S, V> exp(const Grid_simd<S, V> &r) {
-  return SimdApply(ExpRealFunctor<S>(), r);
+  return SimdApply(ExpFunctor<S>(), r);
 }
 template <class S, class V>
 inline Grid_simd<S, V> Not(const Grid_simd<S, V> &r) {