Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/hadrons

2026-05-31 06:24:18 +01:00 · 2017-01-23 15:24:47 +00:00
parent 7dd2764bb2 b7da264b0a
commit c291ef77b5
43 changed files with 1891 additions and 431 deletions
@@ -9,6 +9,7 @@
 ################
 *~
 *#
+*.sublime-*

 # Precompiled Headers #
 #######################
@@ -113,6 +113,36 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
    std::cout<<GridLogMessage << "******************"<<std::endl;

+#define BENCH_ZDW(A,in,out)			\
+    zDw.CayleyZeroCounters();			\
+    zDw. A (in,out);				\
+    FGrid->Barrier();				\
+    t0=usecond();				\
+    for(int i=0;i<ncall;i++){			\
+      zDw. A (in,out);				\
+    }						\
+    t1=usecond();				\
+    FGrid->Barrier();				\
+    zDw.CayleyReport();							\
+    std::cout<<GridLogMessage << "Called ZDw " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
+    std::cout<<GridLogMessage << "******************"<<std::endl;
+
+#define BENCH_DW_SSC(A,in,out)			\
+    Dw.CayleyZeroCounters();			\
+    Dw. A (in,out);				\
+    FGrid->Barrier();				\
+    t0=usecond();				\
+    for(int i=0;i<ncall;i++){			\
+      __SSC_START ;				\
+      Dw. A (in,out);				\
+      __SSC_STOP ;				\
+    }						\
+    t1=usecond();				\
+    FGrid->Barrier();				\
+    Dw.CayleyReport();					\
+    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
+    std::cout<<GridLogMessage << "******************"<<std::endl;
+
 #define BENCH_DW_MEO(A,in,out)			\
    Dw.CayleyZeroCounters();			\
    Dw. A (in,out,0);				\
@@ -148,9 +178,15 @@ int main (int argc, char ** argv)
    LatticeFermion sref(sFGrid);
    LatticeFermion result(sFGrid);

+
    std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
    DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);

+    RealD b=1.5;// Scale factor b+c=2, b-c=1
+    RealD c=0.5;
+    std::vector<ComplexD> gamma(Ls,std::complex<double>(1.0,0.0));
+    ZMobiusFermionVec5dR zDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c);
+
    std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
    FGrid->Barrier();

@@ -173,10 +209,13 @@ int main (int argc, char ** argv)

    BENCH_DW_MEO(Dhop    ,src,result);
    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
-    BENCH_DW(Meooe   ,src_o,r_e);
+    BENCH_DW_SSC(Meooe   ,src_o,r_e);
    BENCH_DW(Mooee   ,src_o,r_o);
    BENCH_DW(MooeeInv,src_o,r_o);

+    BENCH_ZDW(Mooee   ,src_o,r_o);
+    BENCH_ZDW(MooeeInv,src_o,r_o);
+
  }

  Grid_finalize();
@@ -99,6 +99,13 @@ case ${ac_MKL} in
        AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);;
 esac

+############### HDF5
+AC_ARG_WITH([hdf5],
+    [AS_HELP_STRING([--with-hdf5=prefix],
+    [try this for a non-standard install prefix of the HDF5 library])],
+    [AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"]
+    [AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"])
+
 ############### first-touch
 AC_ARG_ENABLE([numa],
    [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])], 
@@ -145,6 +152,12 @@ AC_SEARCH_LIBS([fftw_execute], [fftw3],
               [AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])]
               [have_fftw=true])

+AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
+               [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
+               [have_hdf5=true]
+               [LIBS="${LIBS} -lhdf5"], [], [-lhdf5])
+AM_CONDITIONAL(BUILD_HDF5, [ test "${have_hdf5}X" = "trueX" ])
+
 CXXFLAGS=$CXXFLAGS_CPY
 LDFLAGS=$LDFLAGS_CPY

@@ -410,6 +423,7 @@ RNG choice                  : ${ac_RNG}
 GMP                         : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
 LAPACK                      : ${ac_LAPACK}
 FFTW                        : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
+HDF5                        : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
 build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
 ----- BUILD FLAGS -------------------------------------
 CXXFLAGS:
@@ -42,7 +42,6 @@ using namespace Hadrons;
 ******************************************************************************/
 // constructors ////////////////////////////////////////////////////////////////
 Application::Application(void)
-: env_(Environment::getInstance())
 {
    LOG(Message) << "Modules available:" << std::endl;
    auto list = ModuleFactory::getInstance().getBuilderList();
@@ -74,11 +73,17 @@ Application::Application(const std::string parameterFileName)
    parameterFileName_ = parameterFileName;
 }

+// environment shortcut ////////////////////////////////////////////////////////
+Environment & Application::env(void) const
+{   // Forwards to Environment::getInstance() on every call; Application keeps no reference of its own.
+    return Environment::getInstance();
+}
+
 // access //////////////////////////////////////////////////////////////////////
 void Application::setPar(const Application::GlobalPar &par)
 {
    par_ = par;
-    env_.setSeed(strToVec<int>(par_.seed));
+    env().setSeed(strToVec<int>(par_.seed));
 }

 const Application::GlobalPar & Application::getPar(void)
@@ -89,7 +94,7 @@ const Application::GlobalPar & Application::getPar(void)
 // execute /////////////////////////////////////////////////////////////////////
 void Application::run(void)
 {
-    if (!parameterFileName_.empty() and (env_.getNModule() == 0))
+    if (!parameterFileName_.empty() and (env().getNModule() == 0))
    {
        parseParameterFile(parameterFileName_);
    }
@@ -124,7 +129,7 @@ void Application::parseParameterFile(const std::string parameterFileName)
    do
    {
        read(reader, "id", id);
-        env_.createModule(id.name, id.type, reader);
+        env().createModule(id.name, id.type, reader);
    } while (reader.nextElement("module"));
    pop(reader);
    pop(reader);
@@ -134,7 +139,7 @@ void Application::saveParameterFile(const std::string parameterFileName)
 {
    XmlWriter          writer(parameterFileName);
    ObjectId           id;
-    const unsigned int nMod = env_.getNModule();
+    const unsigned int nMod = env().getNModule();
    
    LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl;
    write(writer, "parameters", getPar());
@@ -142,10 +147,10 @@ void Application::saveParameterFile(const std::string parameterFileName)
    for (unsigned int i = 0; i < nMod; ++i)
    {
        push(writer, "module");
-        id.name = env_.getModuleName(i);
-        id.type = env_.getModule(i)->getRegisteredName();
+        id.name = env().getModuleName(i);
+        id.type = env().getModule(i)->getRegisteredName();
        write(writer, "id", id);
-        env_.getModule(i)->saveParameters(writer, "options");
+        env().getModule(i)->saveParameters(writer, "options");
        pop(writer);
    }
    pop(writer);
@@ -164,10 +169,10 @@ auto memPeak = [this](const std::vector<unsigned int> &program)\
    \
    msg = HadronsLogMessage.isActive();\
    HadronsLogMessage.Active(false);\
-    env_.dryRun(true);\
-    memPeak = env_.executeProgram(program);\
-    env_.dryRun(false);\
-    env_.freeAll();\
+    env().dryRun(true);\
+    memPeak = env().executeProgram(program);\
+    env().dryRun(false);\
+    env().freeAll();\
    HadronsLogMessage.Active(true);\
    \
    return memPeak;\
@@ -179,7 +184,7 @@ void Application::schedule(void)
    
    // build module dependency graph
    LOG(Message) << "Building module graph..." << std::endl;
-    auto graph = env_.makeModuleGraph();
+    auto graph = env().makeModuleGraph();
    auto con = graph.getConnectedComponents();
    
    // constrained topological sort using a genetic algorithm
@@ -256,7 +261,7 @@ void Application::saveSchedule(const std::string filename)
                 << std::endl;
    for (auto address: program_)
    {
-        program.push_back(env_.getModuleName(address));
+        program.push_back(env().getModuleName(address));
    }
    write(writer, "schedule", program);
 }
@@ -274,7 +279,7 @@ void Application::loadSchedule(const std::string filename)
    program_.clear();
    for (auto &name: program)
    {
-        program_.push_back(env_.getModuleAddress(name));
+        program_.push_back(env().getModuleAddress(name));
    }
    scheduled_ = true;
    memPeak_   = memPeak(program_);
@@ -291,7 +296,7 @@ void Application::printSchedule(void)
    for (unsigned int i = 0; i < program_.size(); ++i)
    {
        LOG(Message) << std::setw(4) << i + 1 << ": "
-                     << env_.getModuleName(program_[i]) << std::endl;
+                     << env().getModuleName(program_[i]) << std::endl;
    }
 }

@@ -304,9 +309,9 @@ void Application::configLoop(void)
    {
        LOG(Message) << BIG_SEP << " Starting measurement for trajectory " << t
                     << " " << BIG_SEP << std::endl;
-        env_.setTrajectory(t);
-        env_.executeProgram(program_);
+        env().setTrajectory(t);
+        env().executeProgram(program_);
    }
    LOG(Message) << BIG_SEP << " End of measurement " << BIG_SEP << std::endl;
-    env_.freeAll();
+    env().freeAll();
 }
@@ -98,11 +98,13 @@ public:
    void printSchedule(void);
    // loop on configurations
    void configLoop(void);
+private:
+    // environment shortcut
+    Environment & env(void) const;
 private:
    long unsigned int         locVol_;
    std::string               parameterFileName_{""};
    GlobalPar                 par_;
-    Environment               &env_;
    std::vector<unsigned int> program_;
    Environment::Size         memPeak_;
    bool                      scheduled_{false};
@@ -115,14 +117,14 @@ private:
 template <typename M>
 void Application::createModule(const std::string name)
 {
-    env_.createModule<M>(name);
+    env().createModule<M>(name);
 }

 template <typename M>
 void Application::createModule(const std::string name,
                               const typename M::Par &par)
 {
-    env_.createModule<M>(name, par);
+    env().createModule<M>(name, par);
 }

 END_HADRONS_NAMESPACE
@@ -41,8 +41,9 @@ using namespace Hadrons;
 // constructor /////////////////////////////////////////////////////////////////
 Environment::Environment(void)
 {
+    nd_ = GridDefaultLatt().size();
    grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
+        GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
        GridDefaultMpi()));
    gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
    auto loc = getGrid()->LocalDimensions();
@@ -126,6 +127,11 @@ GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const
    }
 }

+unsigned int Environment::getNd(void) const
+{   // Returns nd_, which the constructor sets to GridDefaultLatt().size().
+    return nd_;
+}
+
 // random number generator /////////////////////////////////////////////////////
 void Environment::setSeed(const std::vector<int> &seed)
 {
@@ -106,6 +106,7 @@ public:
    void                    createGrid(const unsigned int Ls);
    GridCartesian *         getGrid(const unsigned int Ls = 1) const;
    GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
+    unsigned int            getNd(void) const;
    // random number generator
    void                    setSeed(const std::vector<int> &seed);
    GridParallelRNG *       get4dRng(void) const;
@@ -200,6 +201,7 @@ private:
    std::map<unsigned int, GridPt>         grid5d_;
    GridRbPt                               gridRb4d_;
    std::map<unsigned int, GridRbPt>       gridRb5d_;
+    unsigned int                           nd_;
    // random number generator
    RngPt                                  rng4d_;
    // module and related maps
@@ -166,7 +166,7 @@ void GeneticScheduler<T>::initPopulation(void)
    {
        auto p = graph_.topoSort(gen_);
        
-        population_.emplace(func_(p), p);
+        population_.insert(std::make_pair(func_(p), p));
    }
 }

@@ -180,8 +180,8 @@ void GeneticScheduler<T>::doCrossover(void)
    crossover(c1, c2, p1, p2);
    PARALLEL_CRITICAL
    {
-        population_.emplace(func_(c1), c1);
-        population_.emplace(func_(c2), c2);
+        population_.insert(std::make_pair(func_(c1), c1));
+        population_.insert(std::make_pair(func_(c2), c2));
    }
 }

@@ -200,7 +200,7 @@ void GeneticScheduler<T>::doMutation(void)
        mutation(m, it->second);
        PARALLEL_CRITICAL
        {
-            population_.emplace(func_(m), m);
+            population_.insert(std::make_pair(func_(m), m));
        }
    }
 }
@@ -147,7 +147,7 @@ void TSeqGamma<FImpl>::execute(void)
    g  = makeGammaProd(par().gamma);
    p  = strToVec<Real>(par().mom);
    ph = zero;
-    for(unsigned int mu = 0; mu < Nd; mu++)
+    for(unsigned int mu = 0; mu < env().getNd(); mu++)
    {
        LatticeCoordinate(coor, mu);
        ph = ph + p[mu]*coor;
@@ -0,0 +1,65 @@
+
+
+
+#include <Grid/Grid.h>
+
+namespace Grid {
+
+int PointerCache::victim;
+
+  PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+
+void *PointerCache::Insert(void *ptr,size_t bytes) {
+  // Offers ptr (of size bytes) to the cache. Returns the pointer the caller must now free, or NULL if none.
+  if (bytes < 4096 ) return ptr; // too small to cache: hand it back so the caller frees it (returning NULL here leaked it)
+
+#ifdef _OPENMP
+  assert(omp_in_parallel()==0);
+#endif 
+  void * ret = NULL;
+  int v = -1;
+  // Prefer the first slot that is not in use.
+  for(int e=0;e<Ncache;e++) {
+    if ( Entries[e].valid==0 ) {
+      v=e; 
+      break;
+    }
+  }
+  // No free slot: reuse the slot at index victim and advance victim modulo Ncache.
+  if ( v==-1 ) {
+    v=victim;
+    victim = (victim+1)%Ncache;
+  }
+  // If the chosen slot still holds a buffer, evict it; it is returned so the caller can free it.
+  if ( Entries[v].valid ) {
+    ret = Entries[v].address;
+    Entries[v].valid = 0;
+    Entries[v].address = NULL;
+    Entries[v].bytes = 0;
+  }
+  // Record the incoming buffer in the slot.
+  Entries[v].address=ptr;
+  Entries[v].bytes  =bytes;
+  Entries[v].valid  =1;
+
+  return ret;
+}
+
+void *PointerCache::Lookup(size_t bytes) {
+  // Returns a cached buffer whose recorded size equals bytes exactly and marks its slot unused; NULL if none matches.
+ if (bytes < 4096 ) return NULL;
+  // Requests below 4096 bytes have already returned NULL at this point, without any search.
+#ifdef _OPENMP
+  assert(omp_in_parallel()==0);
+#endif 
+  // Linear scan over the Ncache slots; the first valid entry with a matching byte count is handed out.
+  for(int e=0;e<Ncache;e++){
+    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
+      Entries[e].valid = 0;
+      return Entries[e].address;
+    }
+  }
+  return NULL;
+}
+
+}
@@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@@ -42,9 +42,32 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

+  class PointerCache {
+  private:
+    // Fixed-size table of buffers handed to Insert, matched by byte count in Lookup; all state is static.
+    static const int Ncache=8;
+    static int victim;
+    // One table slot: the buffer address, its size in bytes, and an in-use flag.
+    typedef struct { 
+      void *address;
+      size_t bytes;
+      int valid;
+    } PointerCacheEntry;
+    // Storage for the Ncache slots; victim is the slot index Insert replaces next once every slot is in use.
+    static PointerCacheEntry Entries[Ncache];
+
+  public:
+    // Insert: offer a buffer to the cache; a non-NULL return value is a pointer the cache does not hold.
+    // Lookup: fetch a cached buffer of exactly the requested size, or NULL when there is none.
+    static void *Insert(void *ptr,size_t bytes) ;
+    static void *Lookup(size_t bytes) ;
+
+  };
+
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////
+
 template<typename _Tp>
 class alignedAllocator {
 public: 
@@ -66,27 +89,27 @@ public:

  pointer allocate(size_type __n, const void* _p= 0)
  { 
+    size_type bytes = __n*sizeof(_Tp);
+
+    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
+    
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
 #endif

-    _Tp tmp;
-#ifdef GRID_NUMA
-#pragma omp parallel for schedule(static)
-  for(int i=0;i<__n;i++){
-    ptr[i]=tmp;
-  }
-#endif 
    return ptr;
  }

-  void deallocate(pointer __p, size_type) { 
+  void deallocate(pointer __p, size_type __n) { 
+    size_type bytes = __n * sizeof(_Tp);
+    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
+
 #ifdef HAVE_MM_MALLOC_H
-    _mm_free((void *)__p); 
+    if ( __freeme ) _mm_free((void *)__freeme); 
 #else
-    free((void *)__p);
+    if ( __freeme ) free((void *)__freeme);
 #endif
  }
  void construct(pointer __p, const _Tp& __val) { };
@@ -59,13 +59,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 ///////////////////
 // Grid headers
 ///////////////////
-#include <Grid/serialisation/Serialisation.h>
 #include "Config.h"
 #include <Grid/Timer.h>
 #include <Grid/PerfCount.h>
 #include <Grid/Log.h>
 #include <Grid/AlignedAllocator.h>
 #include <Grid/Simd.h>
+#include <Grid/serialisation/Serialisation.h>
 #include <Grid/Threads.h>
 #include <Grid/Lexicographic.h>
 #include <Grid/Init.h>
@@ -1,4 +1,5 @@
 extra_sources=
+extra_headers=
 if BUILD_COMMS_MPI
  extra_sources+=communicator/Communicator_mpi.cc
  extra_sources+=communicator/Communicator_base.cc
@@ -24,6 +25,12 @@ if BUILD_COMMS_NONE
  extra_sources+=communicator/Communicator_base.cc
 endif

+if BUILD_HDF5
+  extra_sources+=serialisation/Hdf5IO.cc 
+  extra_headers+=serialisation/Hdf5IO.h
+  extra_headers+=serialisation/Hdf5Type.h
+endif
+
 #
 # Libraries
 #
@@ -32,6 +39,9 @@ include Eigen.inc

 lib_LIBRARIES = libGrid.a

-libGrid_a_SOURCES              = $(CCFILES) $(extra_sources)
+CCFILES += $(extra_sources)
+HFILES  += $(extra_headers)
+
+libGrid_a_SOURCES              = $(CCFILES)
 libGrid_adir                   = $(pkgincludedir)
 nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
@@ -205,12 +205,13 @@ public:
  void Stop(void) {
    count=0;
    cycles=0;
+    size_t ign;
 #ifdef __linux__
    if ( fd!= -1) {
      ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
-      ::read(fd, &count, sizeof(long long));
-      ::read(cyclefd, &cycles, sizeof(long long));
+      ign=::read(fd, &count, sizeof(long long));
+      ign=::read(cyclefd, &cycles, sizeof(long long));
    }
    elapsed = cyclecount() - begin;
 #else
@@ -113,7 +113,7 @@ Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice
 {
 PARALLEL_FOR_LOOP     
     for(int i=0;i<table.size();i++){
-       buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+       vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
     }
 }

@@ -386,7 +386,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
  }

  // the above should guarantee that the operations are local
-  //PARALLEL_FOR_LOOP
+  PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -428,7 +428,7 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
    }
  }
  // the above should guarantee that the operations are local
-  //PARALLEL_FOR_LOOP
+  PARALLEL_FOR_LOOP
  for(int idx=0;idx<lg->lSites();idx++){
    std::vector<int> lcoor(nl);
    std::vector<int> hcoor(nh);
@@ -29,6 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */

+#include <Grid/Eigen/Dense>
 #include <Grid.h>


@@ -48,18 +49,18 @@ namespace QCD {
 		   FourDimGrid,
 	 	   FourDimRedBlackGrid,_M5,p),
   mass(_mass)
- { }
+ { 
+ }

 template<class Impl>  
 void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  FermionField tmp(psi._grid);

-  this->DW(psi,tmp,DaggerNo);
+  this->DW(psi,this->tmp(),DaggerNo);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }

@@ -87,8 +88,8 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;

-    // Flops = 9*12*Ls*vol/2
-    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
+    RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  }
@@ -110,12 +111,11 @@ template<class Impl>
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  FermionField tmp(psi._grid);

-  this->DW(psi,tmp,DaggerYes);
+  this->DW(psi,this->tmp(),DaggerYes);

  for(int s=0;s<Ls;s++){
-    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
+    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],this->tmp(),s,s);// chi = (1-c[s] D_W) psi
  }
 }
 template<class Impl>  
@@ -138,6 +138,7 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
  lower[0]   =-mass*lower[0];
  M5D(psi,psi,Din,lower,diag,upper);
 }
+// FIXME Redundant with the above routine; check this and eliminate
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
@@ -259,36 +260,33 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Meooe       (const FermionField &psi, FermionField &chi)
 {
  int Ls=this->Ls;
-  FermionField tmp(psi._grid);

-  Meooe5D(psi,tmp); 
+  Meooe5D(psi,this->tmp()); 

  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(tmp,chi,DaggerNo);
+    this->DhopEO(this->tmp(),chi,DaggerNo);
  } else {
-    this->DhopOE(tmp,chi,DaggerNo);
+    this->DhopOE(this->tmp(),chi,DaggerNo);
  }
 }

 template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag    (const FermionField &psi, FermionField &chi)
 {
-  FermionField tmp(psi._grid);
  // Apply 4d dslash
  if ( psi.checkerboard == Odd ) {
-    this->DhopEO(psi,tmp,DaggerYes);
+    this->DhopEO(psi,this->tmp(),DaggerYes);
  } else {
-    this->DhopOE(psi,tmp,DaggerYes);
+    this->DhopOE(psi,this->tmp(),DaggerYes);
  }
-  MeooeDag5D(tmp,chi); 
+  MeooeDag5D(this->tmp(),chi); 
 }

 template<class Impl>
 void  CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
-  FermionField tmp(psi._grid);
-  Meo5D(psi,tmp);
+  Meo5D(psi,this->tmp());
  // Apply 4d dslash fragment
-  this->DhopDir(tmp,chi,dir,disp);
+  this->DhopDir(this->tmp(),chi,dir,disp);
 }
 // force terms; five routines; default to Dhop on diagonal
 template<class Impl>
@@ -459,9 +457,91 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
    dee[Ls-1] += delta_d;
  }  
+
+  int inv=1;
+  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
+  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
+
 }


+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
+						 Vector<iSinglet<Simd> > & Matp,
+						 Vector<iSinglet<Simd> > & Matm)
+{
+  int Ls=this->Ls;
+  // Fills Matp/Matm with SIMD-packed Ls x Ls matrices built from bee, cee and mass (optionally inverted and/or adjointed).
+  GridBase *grid = this->FermionRedBlackGrid();
+  int LLs = grid->_rdimensions[0];
+  // LLs is the red-black grid's reduced extent in dimension 0; when it equals Ls, Matp/Matm are left untouched.
+  if ( LLs == Ls ) return; // Not vectorised in 5th direction
+  // Start both dense matrices from zero.
+  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+  // Diagonal: bee[s] in both matrices.
+  for(int s=0;s<Ls;s++){
+    Pplus(s,s) = bee[s];
+    Pminus(s,s)= bee[s];
+  }
+  // Pminus: -cee[s] on the first superdiagonal.
+  for(int s=0;s<Ls-1;s++){
+    Pminus(s,s+1) = -cee[s];
+  }
+  // Pplus: -cee[s+1] on the first subdiagonal; the two corner entries set after the loop carry the mass factor.
+  for(int s=0;s<Ls-1;s++){
+    Pplus(s+1,s) = -cee[s+1];
+  }
+  Pplus (0,Ls-1) = mass*cee[0];
+  Pminus(Ls-1,0) = mass*cee[Ls-1];
+  // Working copies that are actually packed below.
+  Eigen::MatrixXcd PplusMat ;
+  Eigen::MatrixXcd PminusMat;
+  // inv != 0 selects the matrix inverses, otherwise the matrices themselves.
+  if ( inv ) {
+    PplusMat =Pplus.inverse();
+    PminusMat=Pminus.inverse();
+  } else { 
+    PplusMat =Pplus;
+    PminusMat=Pminus;
+  }
+  // dag != 0 replaces each working copy by its adjoint, in place.
+  if(dag){
+    PplusMat.adjointInPlace();
+    PminusMat.adjointInPlace();
+  }
+  // Packing: lane l of Matp[LLs*s2+s1] holds PplusMat(l*LLs+s1, s2) (likewise Matm/PminusMat); for a non-complex Coeff_t the real part fills both components.
+  typedef typename SiteHalfSpinor::scalar_type scalar_type;
+  const int Nsimd=Simd::Nsimd();
+  Matp.resize(Ls*LLs);
+  Matm.resize(Ls*LLs);
+
+  for(int s2=0;s2<Ls;s2++){
+  for(int s1=0;s1<LLs;s1++){
+    int istride = LLs;
+    int ostride = 1;
+    Simd Vp;
+    Simd Vm;
+    scalar_type *sp = (scalar_type *)&Vp;
+    scalar_type *sm = (scalar_type *)&Vm;
+    for(int l=0;l<Nsimd;l++){
+      if ( switcheroo<Coeff_t>::iscomplex() ) {
+	sp[l] = PplusMat (l*istride+s1*ostride,s2);
+	sm[l] = PminusMat(l*istride+s1*ostride,s2);
+      } else { 
+      // if real
+	scalar_type tmp;
+	tmp = PplusMat (l*istride+s1*ostride,s2);
+	sp[l] = scalar_type(tmp.real(),tmp.real());
+	tmp = PminusMat(l*istride+s1*ostride,s2);
+	sm[l] = scalar_type(tmp.real(),tmp.real());
+      }
+    }
+    Matp[LLs*s2+s1] = Vp;
+    Matm[LLs*s2+s1] = Vm;
+  }}
+}
+

  FermOpTemplateInstantiate(CayleyFermion5D);
  GparityFermOpTemplateInstantiate(CayleyFermion5D);
@@ -33,6 +33,31 @@ namespace Grid {

  namespace QCD {

+     template<typename T> struct switcheroo   {  
+       static inline int iscomplex()  { return 0; } 
+       // Primary template, used for any T other than ComplexD/ComplexF: mult() forwards to real_mult.
+       template<class vec>
+       static inline vec mult(vec a, vec b) {
+	 return real_mult(a,b);
+       }
+     };
+     template<> struct switcheroo<ComplexD> {  
+       static inline int iscomplex()  { return 1; } 
+       // ComplexD specialisation: mult() is the vector type's ordinary a*b.
+       template<class vec>
+       static inline vec mult(vec a, vec b) {
+	 return a*b;
+       }
+     };
+     template<> struct switcheroo<ComplexF> {  
+       static inline int iscomplex()  { return 1; } 
+       template<class vec>
+       static inline vec mult(vec a, vec b) {
+	 return a*b;
+       }
+     };
+
+
    template<class Impl>
    class CayleyFermion5D : public WilsonFermion5D<Impl>
    {
@@ -75,7 +100,19 @@ namespace Grid {
 		  std::vector<Coeff_t> &lower,
 		  std::vector<Coeff_t> &diag,
 		  std::vector<Coeff_t> &upper);
+
      void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
+      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
+
+      void MooeeInternalAsm(const FermionField &in, FermionField &out,
+			    int LLs, int site,
+			    Vector<iSinglet<Simd> > &Matp,
+			    Vector<iSinglet<Simd> > &Matm);
+      void MooeeInternalZAsm(const FermionField &in, FermionField &out,
+			    int LLs, int site,
+			    Vector<iSinglet<Simd> > &Matp,
+			    Vector<iSinglet<Simd> > &Matm);
+

      virtual void   Instantiatable(void)=0;

@@ -112,6 +149,12 @@ namespace Grid {
      std::vector<Coeff_t> ueem;    
      std::vector<Coeff_t> dee;    

+      // Matrices of 5d ee inverse params
+      Vector<iSinglet<Simd> >  MatpInv;
+      Vector<iSinglet<Simd> >  MatmInv;
+      Vector<iSinglet<Simd> >  MatpInvDag;
+      Vector<iSinglet<Simd> >  MatmInvDag;
+
      // Constructors
      CayleyFermion5D(GaugeField &_Umu,
 		      GridCartesian         &FiveDimGrid,
@@ -29,13 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */

-#include <Grid/Eigen/Dense>
+
 #include <Grid.h>


 namespace Grid {
-namespace QCD {
-  /*
+namespace QCD {  /*
   * Dense matrix versions of routines
   */
 template<class Impl>
@@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP
      for(int v=0;v<LLs;v++){

 	vprefetch(psi[ss+v+LLs]);
-	//	vprefetch(phi[ss+v+LLs]);

 	int vp= (v==LLs-1) ? 0     : v+1;
 	int vm= (v==0    ) ? LLs-1 : v-1;
@@ -145,9 +143,6 @@ PARALLEL_FOR_LOOP
 	Simd hm_11 = psi[ss+vm]()(1)(1); 
 	Simd hm_12 = psi[ss+vm]()(1)(2); 

-	//	if ( ss==0) std::cout << " hp_00 " <<hp_00<<std::endl;
-	//	if ( ss==0) std::cout << " hm_00 " <<hm_00<<std::endl;
-
 	if ( vp<=v ) {
 	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
 	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
@@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP
 	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
 	}

-	/*
-	if ( ss==0) std::cout << " dphi_00 " <<d[v]()()() * phi[ss+v]()(0)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_10 " <<d[v]()()() * phi[ss+v]()(1)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_20 " <<d[v]()()() * phi[ss+v]()(2)(0) <<std::endl;
-	if ( ss==0) std::cout << " dphi_30 " <<d[v]()()() * phi[ss+v]()(3)(0) <<std::endl;
-	*/	
-	Simd p_00  = d[v]()()() * phi[ss+v]()(0)(0)  + l[v]()()()*hm_00; 
-	Simd p_01  = d[v]()()() * phi[ss+v]()(0)(1)  + l[v]()()()*hm_01; 
-	Simd p_02  = d[v]()()() * phi[ss+v]()(0)(2)  + l[v]()()()*hm_02; 
-	Simd p_10  = d[v]()()() * phi[ss+v]()(1)(0)  + l[v]()()()*hm_10; 
-	Simd p_11  = d[v]()()() * phi[ss+v]()(1)(1)  + l[v]()()()*hm_11; 
-	Simd p_12  = d[v]()()() * phi[ss+v]()(1)(2)  + l[v]()()()*hm_12; 
-	Simd p_20  = d[v]()()() * phi[ss+v]()(2)(0)  + u[v]()()()*hp_00; 
-	Simd p_21  = d[v]()()() * phi[ss+v]()(2)(1)  + u[v]()()()*hp_01; 
-	Simd p_22  = d[v]()()() * phi[ss+v]()(2)(2)  + u[v]()()()*hp_02;  
-	Simd p_30  = d[v]()()() * phi[ss+v]()(3)(0)  + u[v]()()()*hp_10; 
-	Simd p_31  = d[v]()()() * phi[ss+v]()(3)(1)  + u[v]()()()*hp_11; 
-	Simd p_32  = d[v]()()() * phi[ss+v]()(3)(2)  + u[v]()()()*hp_12; 
+	// Can force these to real arithmetic and save 2x.
+	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02); 
+	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);  
+	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 

-	
-	//	if ( ss==0){
-	/*
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<<p_00<<" diff "<<chi[ss+v]()(0)(0)-p_00<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(1) << " bad "<<p_01<<" diff "<<chi[ss+v]()(0)(1)-p_01<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(2) << " bad "<<p_02<<" diff "<<chi[ss+v]()(0)(2)-p_02<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(0) << " bad "<<p_10<<" diff "<<chi[ss+v]()(1)(0)-p_10<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(1) << " bad "<<p_11<<" diff "<<chi[ss+v]()(1)(1)-p_11<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(1)(2) << " bad "<<p_12<<" diff "<<chi[ss+v]()(1)(2)-p_12<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(0) << " bad "<<p_20<<" diff "<<chi[ss+v]()(2)(0)-p_20<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(1) << " bad "<<p_21<<" diff "<<chi[ss+v]()(2)(1)-p_21<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(2)(2) << " bad "<<p_22<<" diff "<<chi[ss+v]()(2)(2)-p_22<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(0) << " bad "<<p_30<<" diff "<<chi[ss+v]()(3)(0)-p_30<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(1) << " bad "<<p_31<<" diff "<<chi[ss+v]()(3)(1)-p_31<<std::endl;
-	std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(3)(2) << " bad "<<p_32<<" diff "<<chi[ss+v]()(3)(2)-p_32<<std::endl;
-	}
-	*/
 	vstream(chi[ss+v]()(0)(0),p_00);
 	vstream(chi[ss+v]()(0)(1),p_01);
 	vstream(chi[ss+v]()(0)(2),p_02);
@@ -261,7 +234,7 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
-
+#if 0
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
@@ -287,9 +260,504 @@ PARALLEL_FOR_LOOP
      chi[ss+v] = chi[ss+v]     +l[v]*fm;

    }
+#else
+      for(int v=0;v<LLs;v++){
+
+	vprefetch(psi[ss+v+LLs]);
+
+	int vp= (v==LLs-1) ? 0     : v+1;
+	int vm= (v==0    ) ? LLs-1 : v-1;
+	
+	Simd hp_00 = psi[ss+vp]()(0)(0); 
+	Simd hp_01 = psi[ss+vp]()(0)(1); 
+	Simd hp_02 = psi[ss+vp]()(0)(2); 
+	Simd hp_10 = psi[ss+vp]()(1)(0); 
+	Simd hp_11 = psi[ss+vp]()(1)(1); 
+	Simd hp_12 = psi[ss+vp]()(1)(2); 
+	
+	Simd hm_00 = psi[ss+vm]()(2)(0); 
+	Simd hm_01 = psi[ss+vm]()(2)(1); 
+	Simd hm_02 = psi[ss+vm]()(2)(2); 
+	Simd hm_10 = psi[ss+vm]()(3)(0); 
+	Simd hm_11 = psi[ss+vm]()(3)(1); 
+	Simd hm_12 = psi[ss+vm]()(3)(2); 
+
+	if ( vp<=v ) {
+	  hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
+	  hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
+	  hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
+	  hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
+	  hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
+	  hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
+	}
+	if ( vm>=v ) {
+	  hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
+	  hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
+	  hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
+	  hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
+	  hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
+	  hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
+	}
+
+	Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00); 
+	Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01); 
+	Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02); 
+	Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10); 
+	Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11); 
+	Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12); 
+
+	Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00); 
+	Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01); 
+	Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);  
+	Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10); 
+	Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11); 
+	Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12); 
+
+	vstream(chi[ss+v]()(0)(0),p_00);
+	vstream(chi[ss+v]()(0)(1),p_01);
+	vstream(chi[ss+v]()(0)(2),p_02);
+	vstream(chi[ss+v]()(1)(0),p_10);
+	vstream(chi[ss+v]()(1)(1),p_11);
+	vstream(chi[ss+v]()(1)(2),p_12);
+	vstream(chi[ss+v]()(2)(0),p_20);
+	vstream(chi[ss+v]()(2)(1),p_21);
+	vstream(chi[ss+v]()(2)(2),p_22);
+	vstream(chi[ss+v]()(3)(0),p_30);
+	vstream(chi[ss+v]()(3)(1),p_31);
+	vstream(chi[ss+v]()(3)(2),p_32);
+      }
+#endif
  }
  M5Dtime+=usecond();
 }
+
+
+#ifdef AVX512 
+#include <simd/Intel512common.h>
+#include <simd/Intel512avx.h>
+#include <simd/Intel512single.h>
+#endif 
+
+// Applies the coefficient tables Matp / Matm to the LLs entries of psi that
+// belong to `site` and streams the LLs results into chi.
+//   psi, chi : input / output fields; the entries touched are s + LLs*site
+//   LLs      : number of entries per site block (row and column count of the
+//              table, in units of Simd words)
+//   site     : index of the block being processed
+//   Matp     : coefficients applied to spin components 0 and 1
+//   Matm     : coefficients applied to spin components 2 and 3
+// For SIMD lane l the coefficient coupling input s2 to output s1 is stored at
+// index LLs*(s2 + l*LLs) + s1.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
+					     int LLs, int site,
+					     Vector<iSinglet<Simd> > &Matp,
+					     Vector<iSinglet<Simd> > &Matm)
+{
+#ifndef AVX512
+  // Generic path: plain C++ on the Simd types.
+  {
+  SiteHalfSpinor BcastP;
+  SiteHalfSpinor BcastM;
+  SiteHalfSpinor SiteChiP;
+  SiteHalfSpinor SiteChiM;
+
+  // Ls*Ls * 2 * 12 * vol flops
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+        int s=s2+l*LLs;
+	int lex=s2+LLs*site;
+	
+	// reset the accumulators at the first term of each output row s1
+	if ( s2==0 && l==0) {
+	  SiteChiP=zero;
+	  SiteChiM=zero;
+	}
+	
+	// broadcast lane l of spin components 0,1 of psi[lex]
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	}}
+	// broadcast lane l of spin components 2,3 of psi[lex]
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	}}
+
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
+	  SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
+	}}
+
+    }}
+    {
+      // write row s1: SiteChiP fills spin 0,1 and SiteChiM fills spin 2,3
+      int lex = s1+LLs*site;
+      for(int sp=0;sp<2;sp++){
+      for(int co=0;co<Nc;co++){
+	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+      }}
+    }
+  }
+
+  }
+#else
+  // AVX512 path: same loop nest as the generic path, hand-scheduled with the
+  // instruction macros from the Intel512 headers.
+  // NOTE(review): the accumulators stay in zmm1-zmm12 across separate asm
+  // statements and none of the statements declares register clobbers; this
+  // relies on the compiler not using those registers in between - confirm.
+  {
+  // pointers
+    //  MASK_REGS;
+  // zmm1-zmm12: the twelve (spin,colour) accumulators of the current row s1
+#define Chi_00 %%zmm1
+#define Chi_01 %%zmm2
+#define Chi_02 %%zmm3
+#define Chi_10 %%zmm4
+#define Chi_11 %%zmm5
+#define Chi_12 %%zmm6
+#define Chi_20 %%zmm7
+#define Chi_21 %%zmm8
+#define Chi_22 %%zmm9
+#define Chi_30 %%zmm10
+#define Chi_31 %%zmm11
+#define Chi_32 %%zmm12
+
+  // zmm13-zmm24: destination registers of the VBCASTCDUP macro
+#define BCAST0   %%zmm13
+#define BCAST1   %%zmm14
+#define BCAST2   %%zmm15
+#define BCAST3   %%zmm16
+#define BCAST4   %%zmm17
+#define BCAST5   %%zmm18
+#define BCAST6   %%zmm19
+#define BCAST7   %%zmm20
+#define BCAST8   %%zmm21
+#define BCAST9   %%zmm22
+#define BCAST10  %%zmm23
+#define BCAST11  %%zmm24
+
+  // byte distance between the table entries of consecutive SIMD lanes
+  // (LLs*LLs elements, matching the LLs*(s2+l*LLs)+s1 indexing of the generic path)
+  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      int lex=s2+LLs*site;
+      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+      uint64_t a2 = (uint64_t)&psi[lex];
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	// first term of row s1 (s2==0 and l==0): uses VMULMEM, presumably a plain
+	// multiply that initialises the accumulators - TODO confirm in Intel512common.h
+	if ( (s2+l)==0 ) {
+	  asm (
+  	           VPREFETCH1(0,%2)  	     VPREFETCH1(0,%1)
+  	           VPREFETCH1(12,%2)  	     VPREFETCH1(13,%2)
+  	           VPREFETCH1(14,%2)  	     VPREFETCH1(15,%2)         
+		   VBCASTCDUP(0,%2,BCAST0)   
+		   VBCASTCDUP(1,%2,BCAST1)   
+		   VBCASTCDUP(2,%2,BCAST2)   
+		   VBCASTCDUP(3,%2,BCAST3)   
+		   VBCASTCDUP(4,%2,BCAST4)     VMULMEM (0,%0,BCAST0,Chi_00)
+		   VBCASTCDUP(5,%2,BCAST5)     VMULMEM (0,%0,BCAST1,Chi_01)
+		   VBCASTCDUP(6,%2,BCAST6)     VMULMEM (0,%0,BCAST2,Chi_02)
+		   VBCASTCDUP(7,%2,BCAST7)     VMULMEM (0,%0,BCAST3,Chi_10)
+		   VBCASTCDUP(8,%2,BCAST8)     VMULMEM (0,%0,BCAST4,Chi_11)
+		   VBCASTCDUP(9,%2,BCAST9)     VMULMEM (0,%0,BCAST5,Chi_12)
+		   VBCASTCDUP(10,%2,BCAST10)   VMULMEM (0,%1,BCAST6,Chi_20)
+		   VBCASTCDUP(11,%2,BCAST11)   VMULMEM (0,%1,BCAST7,Chi_21)
+		   VMULMEM (0,%1,BCAST8,Chi_22)         
+		   VMULMEM (0,%1,BCAST9,Chi_30)
+		   VMULMEM (0,%1,BCAST10,Chi_31)       
+		   VMULMEM (0,%1,BCAST11,Chi_32)
+		   : : "r" (a0), "r" (a1), "r" (a2)  );
+	} else { 
+	  // remaining terms: VMADDMEM, presumably multiply-accumulate into the
+	  // same registers - TODO confirm in Intel512common.h
+	  asm (
+		   VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
+		   VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
+		   VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
+		   VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
+		   VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
+		   VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
+		   VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
+		   VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
+		   VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
+		   VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
+		   VBCASTCDUP(10,%2,BCAST10)  VMADDMEM (0,%1,BCAST10,Chi_31)
+		   VBCASTCDUP(11,%2,BCAST11)  VMADDMEM (0,%1,BCAST11,Chi_32) 
+		   : : "r" (a0), "r" (a1), "r" (a2)  );
+	}
+	// advance to the next lane: next table entry, next scalar inside psi[lex]
+	a0 = a0+incr;
+	a1 = a1+incr;
+	a2 = a2+sizeof(Simd::scalar_type);
+      }}
+    {
+      // store the twelve accumulators as row s1 of the output block
+      int lexa = s1+LLs*site;
+      asm (
+	       VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)		
+	       VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)		
+	       VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)		
+	       VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)		
+	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+    }
+  }
+  }
+  // drop the register aliases so they do not leak into the code that follows
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef BCAST0
+#undef BCAST1
+#undef BCAST2
+#undef BCAST3
+#undef BCAST4
+#undef BCAST5
+#undef BCAST6
+#undef BCAST7
+#undef BCAST8
+#undef BCAST9
+#undef BCAST10
+#undef BCAST11
+#endif
+};
+
+  // Z-mobius version
+// Complex-coefficient (Z-mobius) counterpart of MooeeInternalAsm: applies the
+// coefficient tables Matp / Matm to the LLs entries of psi that belong to
+// `site` and writes the LLs results into chi.
+//   psi, chi : input / output fields; the entries touched are s + LLs*site
+//   LLs      : number of entries per site block
+//   site     : index of the block being processed
+//   Matp     : coefficients applied to spin components 0 and 1
+//   Matm     : coefficients applied to spin components 2 and 3
+// For SIMD lane l the coefficient coupling input s2 to output s1 is stored at
+// index LLs*(s2 + l*LLs) + s1.
+template<class Impl>
+void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
+					     int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
+{
+#ifndef AVX512
+  // Generic path: plain C++ on the Simd types, using the full product of the
+  // coefficient with the broadcast value (no real_madd shortcut).
+  {
+  SiteHalfSpinor BcastP;
+  SiteHalfSpinor BcastM;
+  SiteHalfSpinor SiteChiP;
+  SiteHalfSpinor SiteChiM;
+
+  // Ls*Ls * 2 * 12 * vol flops
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+
+        int s=s2+l*LLs;
+	int lex=s2+LLs*site;
+	
+	// reset the accumulators at the first term of each output row s1
+	if ( s2==0 && l==0) {
+	  SiteChiP=zero;
+	  SiteChiM=zero;
+	}
+	
+	// broadcast lane l of spin components 0,1 of psi[lex]
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastP()(sp  )(co),psi[lex]()(sp)(co),l);
+	}}
+	// broadcast lane l of spin components 2,3 of psi[lex]
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  vbroadcast(BcastM()(sp  )(co),psi[lex]()(sp+2)(co),l);
+	}}
+
+	for(int sp=0;sp<2;sp++){
+        for(int co=0;co<Nc;co++){
+	  SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co); 
+	  SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co); 
+	}}
+
+
+    }}
+    {
+      // write row s1: SiteChiP fills spin 0,1 and SiteChiM fills spin 2,3
+      int lex = s1+LLs*site;
+      for(int sp=0;sp<2;sp++){
+      for(int co=0;co<Nc;co++){
+	vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
+	vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
+      }}
+    }
+  }
+
+  }
+#else
+  // AVX512 path: same loop nest, hand-scheduled with the instruction macros
+  // from the Intel512 headers.
+  // NOTE(review): the accumulators stay in zmm0-zmm11 across separate asm
+  // statements, and r8-r10 are loaded by LOAD64 outside the asm statement that
+  // uses them; no clobbers are declared, so this relies on the compiler not
+  // touching those registers in between - confirm.
+  {
+  // pointers
+  //  MASK_REGS;
+  // Chi_xx  : accumulator names for the operand-free asm statements (single %)
+  // pChi_xx : the same registers for asm statements with operands (%% escape)
+#define Chi_00 %zmm0
+#define Chi_01 %zmm1
+#define Chi_02 %zmm2
+#define Chi_10 %zmm3
+#define Chi_11 %zmm4
+#define Chi_12 %zmm5
+#define Chi_20 %zmm6
+#define Chi_21 %zmm7
+#define Chi_22 %zmm8
+#define Chi_30 %zmm9
+#define Chi_31 %zmm10
+#define Chi_32 %zmm11
+#define pChi_00 %%zmm0
+#define pChi_01 %%zmm1
+#define pChi_02 %%zmm2
+#define pChi_10 %%zmm3
+#define pChi_11 %%zmm4
+#define pChi_12 %%zmm5
+#define pChi_20 %%zmm6
+#define pChi_21 %%zmm7
+#define pChi_22 %%zmm8
+#define pChi_30 %%zmm9
+#define pChi_31 %%zmm10
+#define pChi_32 %%zmm11
+
+#define BCAST_00   %zmm12
+#define  SHUF_00   %zmm13
+#define BCAST_01   %zmm14
+#define  SHUF_01   %zmm15
+#define BCAST_02   %zmm16
+#define  SHUF_02   %zmm17
+#define BCAST_10   %zmm18
+#define  SHUF_10   %zmm19
+#define BCAST_11   %zmm20
+#define  SHUF_11   %zmm21
+#define BCAST_12   %zmm22
+#define  SHUF_12   %zmm23
+
+  // Mp / Mm hold the loaded coefficients, Mps / Mms their VSHUF'ed copies
+#define Mp  %zmm24
+#define Mps %zmm25
+#define Mm  %zmm26
+#define Mms %zmm27
+  // N scales the first argument of the *DUP macros (0*N .. 11*N); its unit is
+  // fixed by those macros in the Intel512 headers
+#define N 8
+  // byte distance between the table entries of consecutive SIMD lanes
+  int incr=LLs*LLs*sizeof(iSinglet<Simd>);
+  for(int s1=0;s1<LLs;s1++){ 
+    for(int s2=0;s2<LLs;s2++){ 
+      int lex=s2+LLs*site;
+      uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
+      uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
+      uint64_t a2 = (uint64_t)&psi[lex];
+      for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
+	// first term of row s1 (s2==0 and l==0) initialises the accumulators
+	if ( (s2+l)==0 ) {
+	  LOAD64(%r8,a0);
+	  LOAD64(%r9,a1);
+	  LOAD64(%r10,a2);
+	  asm (
+	       VLOAD(0,%r8,Mp)// i r
+	       VLOAD(0,%r9,Mm)
+	       VSHUF(Mp,Mps)  // r i 
+	       VSHUF(Mm,Mms)
+	       VPREFETCH1(12,%r10)  	     VPREFETCH1(13,%r10)
+	       VPREFETCH1(14,%r10)  	     VPREFETCH1(15,%r10)         
+
+	       VMULIDUP(0*N,%r10,Mps,Chi_00)
+	       VMULIDUP(1*N,%r10,Mps,Chi_01)
+	       VMULIDUP(2*N,%r10,Mps,Chi_02)
+	       VMULIDUP(3*N,%r10,Mps,Chi_10)
+	       VMULIDUP(4*N,%r10,Mps,Chi_11)
+	       VMULIDUP(5*N,%r10,Mps,Chi_12)
+
+	       VMULIDUP(6*N ,%r10,Mms,Chi_20)
+	       VMULIDUP(7*N ,%r10,Mms,Chi_21)
+	       VMULIDUP(8*N ,%r10,Mms,Chi_22)
+	       VMULIDUP(9*N ,%r10,Mms,Chi_30)
+	       VMULIDUP(10*N,%r10,Mms,Chi_31)
+	       VMULIDUP(11*N,%r10,Mms,Chi_32)
+
+	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
+	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
+	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+	       VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
+	       VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
+	       VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
+	       VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
+	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+	       );
+	} else { 
+	  // remaining terms accumulate into the same registers
+	  LOAD64(%r8,a0);
+	  LOAD64(%r9,a1);
+	  LOAD64(%r10,a2);
+	  asm (
+	       VLOAD(0,%r8,Mp)
+	       VSHUF(Mp,Mps)
+
+	       VLOAD(0,%r9,Mm)
+	       VSHUF(Mm,Mms)
+
+	       VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) //  Mri * Pii +- Cir
+	       VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
+	       VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
+	       VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
+	       VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
+	       VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
+
+	       VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
+	       VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
+	       VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
+	       VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
+	       VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
+	       VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
+
+	       VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) //  Cir = Mir * Prr +- ( Mri * Pii +- Cir) 
+	       VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) //  Ci = MiPr + Ci + MrPi ;    Cr = MrPr - ( MiPi - Cr)
+	       VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
+	       VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
+	       VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
+	       VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
+
+	       VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
+	       VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
+	       VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
+	       VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
+	       VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
+	       VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
+	       );
+	}
+	// advance to the next lane: next table entry, next scalar inside psi[lex]
+	a0 = a0+incr;
+	a1 = a1+incr;
+	a2 = a2+sizeof(Simd::scalar_type);
+      }}
+    {
+      int lexa = s1+LLs*site;
+      /*
+      SiteSpinor tmp;
+      asm (
+	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	       : : "r" ((uint64_t)&tmp) : "memory" );
+      */
+
+      // store the twelve accumulators as row s1 of the output block
+      asm (
+	       VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01)  VSTORE(2 ,%0,pChi_02)		
+	       VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11)  VSTORE(5 ,%0,pChi_12)		
+	       VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21)  VSTORE(8 ,%0,pChi_22)		
+	       VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31)  VSTORE(11,%0,pChi_32)		
+	       : : "r" ((uint64_t)&chi[lexa]) : "memory" );
+
+      //      if ( 1 || (site==0) ) { 
+      //	std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
+      //      }
+    }
+  }
+  }
+  // Undefine every macro introduced above so that none of them (in particular
+  // the one-letter N and the short Mp/Mm names) leaks into later code.
+#undef Chi_00
+#undef Chi_01
+#undef Chi_02
+#undef Chi_10
+#undef Chi_11
+#undef Chi_12
+#undef Chi_20
+#undef Chi_21
+#undef Chi_22
+#undef Chi_30
+#undef Chi_31
+#undef Chi_32
+
+#undef pChi_00
+#undef pChi_01
+#undef pChi_02
+#undef pChi_10
+#undef pChi_11
+#undef pChi_12
+#undef pChi_20
+#undef pChi_21
+#undef pChi_22
+#undef pChi_30
+#undef pChi_31
+#undef pChi_32
+
+#undef BCAST_00
+#undef SHUF_00
+#undef BCAST_01
+#undef SHUF_01
+#undef BCAST_02
+#undef SHUF_02
+#undef BCAST_10
+#undef SHUF_10
+#undef BCAST_11
+#undef SHUF_11
+#undef BCAST_12
+#undef SHUF_12
+
+#undef Mp
+#undef Mps
+#undef Mm
+#undef Mms
+#undef N
+
+#endif
+}
+
+
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
 {
@@ -299,108 +767,41 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField

  chi.checkerboard=psi.checkerboard;
  
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
+  Vector<iSinglet<Simd> >  Matp;
+  Vector<iSinglet<Simd> >  Matm;
+  Vector<iSinglet<Simd> >  *_Matp;
+  Vector<iSinglet<Simd> >  *_Matm;
  
-  for(int s=0;s<Ls;s++){
-    Pplus(s,s) = bee[s];
-    Pminus(s,s)= bee[s];
+  //  MooeeInternalCompute(dag,inv,Matp,Matm);
+  if ( inv && dag ) { 
+    _Matp = &MatpInvDag;
+    _Matm = &MatmInvDag;
  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pminus(s,s+1) = -cee[s];
+  if ( inv && (!dag) ) { 
+    _Matp = &MatpInv;
+    _Matm = &MatmInv;
+  } 
+  if ( !inv ) {
+    MooeeInternalCompute(dag,inv,Matp,Matm);
+    _Matp = &Matp;
+    _Matm = &Matm;
  }
-  
-  for(int s=0;s<Ls-1;s++){
-    Pplus(s+1,s) = -cee[s+1];
-  }
-  Pplus (0,Ls-1) = mass*cee[0];
-  Pminus(Ls-1,0) = mass*cee[Ls-1];
-  
-  Eigen::MatrixXcd PplusMat ;
-  Eigen::MatrixXcd PminusMat;
-  
-  if ( inv ) {
-    PplusMat =Pplus.inverse();
-    PminusMat=Pminus.inverse();
-  } else { 
-    PplusMat =Pplus;
-    PminusMat=Pminus;
-  }
-  
-  if(dag){
-    PplusMat.adjointInPlace();
-    PminusMat.adjointInPlace();
-  }
-  
-  typedef typename SiteHalfSpinor::scalar_type scalar_type;
-  const int Nsimd=Simd::Nsimd();
-  Vector<iSinglet<Simd> > Matp(Ls*LLs);
-  Vector<iSinglet<Simd> > Matm(Ls*LLs);
+  assert(_Matp->size()==Ls*LLs);

-  for(int s2=0;s2<Ls;s2++){
-  for(int s1=0;s1<LLs;s1++){
-    int istride = LLs;
-    int ostride = 1;
-      Simd Vp;
-      Simd Vm;
-      scalar_type *sp = (scalar_type *)&Vp;
-      scalar_type *sm = (scalar_type *)&Vm;
-      for(int l=0;l<Nsimd;l++){
-	sp[l] = PplusMat (l*istride+s1*ostride ,s2);
-	sm[l] = PminusMat(l*istride+s1*ostride,s2);
-      }
-      Matp[LLs*s2+s1] = Vp;
-      Matm[LLs*s2+s1] = Vm;
-    }
-  }
-  
  MooeeInvCalls++;
  MooeeInvTime-=usecond();
-  // Dynamic allocate on stack to get per thread without serialised heap acces
-#pragma omp parallel  
-  {

-    Vector<SiteHalfSpinor> SitePplus(LLs);
-    Vector<SiteHalfSpinor> SitePminus(LLs);
-    Vector<SiteHalfSpinor> SiteChiP(LLs);
-    Vector<SiteHalfSpinor> SiteChiM(LLs);
-    Vector<SiteSpinor>     SiteChi(LLs);
-
-    SiteHalfSpinor BcastP;
-    SiteHalfSpinor BcastM;
-
-#pragma omp for 
-  for(auto site=0;site<vol;site++){
-
-    for(int s=0;s<LLs;s++){
-      int lex = s+LLs*site;
-      spProj5p(SitePplus[s] ,psi[lex]);
-      spProj5m(SitePminus[s],psi[lex]);
-      SiteChiP[s]=zero;
-      SiteChiM[s]=zero;
+  if ( switcheroo<Coeff_t>::iscomplex() ) {
+  PARALLEL_FOR_LOOP
+    for(auto site=0;site<vol;site++){
+      MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    }
-      
-    int s=0;
-    for(int  l=0; l<Simd::Nsimd();l++){ // simd lane
-      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
-	vbroadcast(BcastP,SitePplus [s2],l);
-	vbroadcast(BcastM,SitePminus[s2],l);
-	for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
-	  SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*s+s1]*BcastP;
-	  SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*s+s1]*BcastM;
-	}
-      s++;
-    }}
-
-    for(int s=0;s<LLs;s++){
-      int lex = s+LLs*site;
-      spRecon5p(SiteChi[s],SiteChiP[s]);
-      accumRecon5m(SiteChi[s],SiteChiM[s]);
-      chi[lex] = SiteChi[s]*0.5;
+  } else { 
+  PARALLEL_FOR_LOOP
+    for(auto site=0;site<vol;site++){
+      MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
    }
  }
-  }
  MooeeInvTime+=usecond();
 }

@@ -414,4 +815,5 @@ template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const Fermion
 template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
 template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);

+
 }}
@@ -48,6 +48,8 @@ namespace Grid {

      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};

+      virtual FermionField &tmp(void) = 0;
+
      GridBase * Grid(void)   { return FermionGrid(); };   // this is all the linalg routines need to know
      GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

@@ -61,7 +61,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
      LebesgueEvenOdd(_cbgrid),
      Umu(&Fgrid),
      UmuEven(&Hgrid),
-      UmuOdd(&Hgrid) {
+      UmuOdd(&Hgrid),
+      _tmp(&Hgrid)
+{
  // Allocate the required comms buffer
  ImportGauge(_Umu);
 }
@@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  GridBase *FermionGrid(void) { return _grid; }
  GridBase *FermionRedBlackGrid(void) { return _cbgrid; }

+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }
+
  //////////////////////////////////////////////////////////////////
  // override multiply; cut number routines if pass dagger argument
  // and also make interface more uniformly consistent
@@ -60,7 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  UmuEven(_FourDimRedBlackGrid),
  UmuOdd (_FourDimRedBlackGrid),
  Lebesgue(_FourDimGrid),
-  LebesgueEvenOdd(_FourDimRedBlackGrid)
+  LebesgueEvenOdd(_FourDimRedBlackGrid),
+  _tmp(&FiveDimRedBlackGrid)
 {
  if (Impl::LsVectorised) { 

@@ -74,6 +74,9 @@ namespace QCD {
     typedef WilsonKernels<Impl> Kernels;
     PmuStat stat;

+     FermionField _tmp;
+     FermionField &tmp(void) { return _tmp; }
+
     void Report(void);
     void ZeroCounters(void);
     double DhopCalls;
@@ -32,6 +32,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <type_traits>

 namespace Grid {
+  // Vector IO utilities ///////////////////////////////////////////////////////
  // helper function to read space-separated values
  template <typename T>
  std::vector<T> strToVec(const std::string s)
@@ -67,6 +68,77 @@ namespace Grid {
    return os;
  }
  
+  // Vector element trait //////////////////////////////////////////////////////  
+  // Primary template: a type that is not a std::vector is its own element
+  // type and is not flagged as a number.
+  template <typename T>
+  struct element
+  {
+    using type = T;
+    static constexpr bool is_number = false;
+  };
+  
+  // std::vector specialisation: strips one vector layer per recursion step.
+  // `type` ends up as the innermost non-vector type; `is_number` is true when
+  // T is arithmetic or complex, or when T is itself a vector for which
+  // is_number holds.
+  template <typename T>
+  struct element<std::vector<T>>
+  {
+    using type = typename element<T>::type;
+    static constexpr bool is_number = std::is_arithmetic<T>::value
+                                      || is_complex<T>::value
+                                      || element<T>::is_number;
+  };
+  
+  // Vector flattening utility class ///////////////////////////////////////////
+  // Class to flatten a multidimensional std::vector
+  // Flattens a nested std::vector V into a single std::vector of its
+  // innermost element type, and records the size of each nesting level.
+  template <typename V>
+  class Flatten
+  {
+  public:
+    // innermost (non-vector) element type of V
+    typedef typename element<V>::type Element;
+  public:
+    // flattens `vector` immediately; only a reference to it is kept
+    explicit                     Flatten(const V &vector);
+    // the original nested vector passed to the constructor
+    const V &                    getVector(void);
+    // all elements of the nested vector, in iteration order
+    const std::vector<Element> & getFlatVector(void);
+    // size of each nesting level, outermost first
+    const std::vector<size_t>  & getDim(void);
+  private:
+    // overload pair: the Element version appends one value, the template
+    // version recurses over a container
+    void accumulate(const Element &e);
+    template <typename W>
+    void accumulate(const W &v);
+    // overload pair: the template version records v.size() and descends into
+    // v[0], the Element version stops the recursion
+    void accumulateDim(const Element &e);
+    template <typename W>
+    void accumulateDim(const W &v);
+  private:
+    // reference to the caller's vector: it must outlive this object
+    const V              &vector_;
+    std::vector<Element> flatVector_;
+    std::vector<size_t>  dim_;
+  };
+  
+  
+  // Class to reconstruct a multidimensional std::vector
+  // Rebuilds a nested std::vector V from a flat vector of elements and the
+  // list of sizes of each nesting level.
+  template <typename V>
+  class Reconstruct
+  {
+  public:
+    // innermost (non-vector) element type of V
+    typedef typename element<V>::type Element;
+  public:
+    // rebuilds the nested vector immediately; `dim` is copied, `flatVector`
+    // is only referenced
+    Reconstruct(const std::vector<Element> &flatVector,
+                const std::vector<size_t> &dim);
+    // the rebuilt nested vector
+    const V &                    getVector(void);
+    // the flat input passed to the constructor
+    const std::vector<Element> & getFlatVector(void);
+    // the level sizes passed to the constructor
+    const std::vector<size_t>  & getDim(void);
+  private:
+    // overload pair: the std::vector<Element> version copies values from the
+    // flat input, the template version recurses over a container
+    void fill(std::vector<Element> &v);
+    template <typename W>
+    void fill(W &v);
+    // overload pair: sizes level `dim` to dim_[dim]; the template version
+    // then recurses into each element with dim + 1
+    void resize(std::vector<Element> &v, const unsigned int dim);
+    template <typename W>
+    void resize(W &v, const unsigned int dim);
+  private:
+    V                          vector_;
+    // reference to the caller's flat vector: it must outlive this object
+    const std::vector<Element> &flatVector_;
+    std::vector<size_t>        dim_;
+    // read position in flatVector_, advanced by fill()
+    size_t                     ind_{0};
+    // NOTE(review): not used by any member function visible in this file - confirm
+    unsigned int               dimInd_{0};
+  };
+  
+  // Abstract writer/reader classes ////////////////////////////////////////////
  // static polymorphism implemented using CRTP idiom
  class Serializable;
  
@@ -83,12 +155,7 @@ namespace Grid {
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    write(const std::string& s, const U &output);
    template <typename U>
-    typename std::enable_if<std::is_enum<U>::value, void>::type
-    write(const std::string& s, const U &output);
-    template <typename U>
-    typename std::enable_if<
-      !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
-      void>::type
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
    write(const std::string& s, const U &output);
  private:
    T *upcast;
@@ -107,12 +174,7 @@ namespace Grid {
    typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
    template <typename U>
-    typename std::enable_if<std::is_enum<U>::value, void>::type
-    read(const std::string& s, U &output);
-    template <typename U>
-    typename std::enable_if<
-      !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
-      void>::type
+    typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
    read(const std::string& s, U &output);
  protected:
    template <typename U>
@@ -142,7 +204,128 @@ namespace Grid {
    }
  };
  
-  // Generic writer interface
+  // Flatten class template implementation /////////////////////////////////////
+  // Recursion terminator: appends one innermost element to the flat vector.
+  template <typename V>
+  void Flatten<V>::accumulate(const Element &e)
+  {
+    flatVector_.push_back(e);
+  }
+  
+  // Container case: visits the entries of v in order and hands each one back
+  // to accumulate(), which either recurses or appends it to the flat vector.
+  template <typename V>
+  template <typename W>
+  void Flatten<V>::accumulate(const W &v)
+  {
+    for (auto it = v.begin(); it != v.end(); ++it)
+    {
+      accumulate(*it);
+    }
+  }
+  
+  // Recursion terminator: an innermost element contributes no dimension.
+  // The parameter is left unnamed because it is intentionally unused.
+  template <typename V>
+  void Flatten<V>::accumulateDim(const Element &) {}
+  
+  // Container case: records the size of this nesting level, then descends
+  // into the first entry to record the deeper levels. Only v[0] is inspected,
+  // so every entry of a level is taken to have the same size.
+  template <typename V>
+  template <typename W>
+  void Flatten<V>::accumulateDim(const W &v)
+  {
+    dim_.push_back(v.size());
+    // an empty level has no first entry: stop here instead of reading v[0]
+    if (v.size() != 0)
+    {
+      accumulateDim(v[0]);
+    }
+  }
+  
+  // Binds a reference to the input and eagerly builds both the flat element
+  // list and the list of level sizes.
+  template <typename V>
+  Flatten<V>::Flatten(const V &vector)
+  : vector_(vector)
+  {
+    accumulate(vector_);
+    accumulateDim(vector_);
+  }
+  
+  // Returns the nested vector this object was constructed from.
+  template <typename V>
+  const V & Flatten<V>::getVector(void)
+  {
+    return vector_;
+  }
+  
+  // Returns the flattened elements built by the constructor.
+  template <typename V>
+  const std::vector<typename Flatten<V>::Element> &
+  Flatten<V>::getFlatVector(void)
+  {
+    return flatVector_;
+  }
+  
+  // Returns the recorded size of each nesting level, outermost first.
+  template <typename V>
+  const std::vector<size_t> & Flatten<V>::getDim(void)
+  {
+    return dim_;
+  }
+  
+  // Reconstruct class template implementation /////////////////////////////////
+  // Innermost level: copies the next v.size() values of the flat input into
+  // v, advancing the shared read position ind_ by one per value.
+  // No bounds check is made against flatVector_.size().
+  template <typename V>
+  void Reconstruct<V>::fill(std::vector<Element> &v)
+  {
+    for (auto it = v.begin(); it != v.end(); ++it)
+    {
+      *it = flatVector_[ind_];
+      ++ind_;
+    }
+  }
+  
+  // Outer levels: fills each entry of v in order, recursing until the
+  // innermost std::vector<Element> overload is reached.
+  template <typename V>
+  template <typename W>
+  void Reconstruct<V>::fill(W &v)
+  {
+    for (auto it = v.begin(); it != v.end(); ++it)
+    {
+      fill(*it);
+    }
+  }
+  
+  // Innermost level: sizes v to the extent stored for nesting level `dim`.
+  template <typename V>
+  void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
+  {
+    v.resize(dim_[dim]);
+  }
+  
+  // Outer levels: sizes v to the extent stored for nesting level `dim`, then
+  // sizes every entry using the extent of the next level.
+  template <typename V>
+  template <typename W>
+  void Reconstruct<V>::resize(W &v, const unsigned int dim)
+  {
+    const unsigned int next = dim + 1;
+    
+    v.resize(dim_[dim]);
+    for (auto it = v.begin(); it != v.end(); ++it)
+    {
+      resize(*it, next);
+    }
+  }
+  
+  // Binds a reference to the flat input, copies the level sizes, then shapes
+  // the nested vector and fills it from the flat input.
+  template <typename V>
+  Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
+                              const std::vector<size_t> &dim)
+  : flatVector_(flatVector)
+  , dim_(dim)
+  {
+    // shape first, so that fill() only has to assign values
+    resize(vector_, 0);
+    fill(vector_);
+  }
+  
+  // Returns the nested vector rebuilt by the constructor.
+  template <typename V>
+  const V & Reconstruct<V>::getVector(void)
+  {
+    return vector_;
+  }
+  
+  // Returns the flat input this object was constructed from.
+  template <typename V>
+  const std::vector<typename Reconstruct<V>::Element> &
+  Reconstruct<V>::getFlatVector(void)
+  {
+    return flatVector_;
+  }
+  
+  // Returns the level sizes this object was constructed with.
+  template <typename V>
+  const std::vector<size_t> & Reconstruct<V>::getDim(void)
+  {
+    return dim_;
+  }
+  
+  // Generic writer interface //////////////////////////////////////////////////
  template <typename T>
  inline void push(Writer<T> &w, const std::string &s)
  {
@@ -221,23 +404,13 @@ namespace Grid {
  
  template <typename T>
  template <typename U>
-  typename std::enable_if<std::is_enum<U>::value, void>::type
-  Writer<T>::write(const std::string &s, const U &output)
-  {
-    EnumIO<U>::write(*this, s, output);
-  }
-  
-  template <typename T>
-  template <typename U>
-  typename std::enable_if<
-    !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
-    void>::type
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
  Writer<T>::write(const std::string &s, const U &output)
  {
    upcast->writeDefault(s, output);
  }
  
-  // Reader template implementation ////////////////////////////////////////////
+  // Reader template implementation
  template <typename T>
  Reader<T>::Reader(void)
  {
@@ -266,17 +439,7 @@ namespace Grid {
  
  template <typename T>
  template <typename U>
-  typename std::enable_if<std::is_enum<U>::value, void>::type
-  Reader<T>::read(const std::string &s, U &output)
-  {
-    EnumIO<U>::read(*this, s, output);
-  }
-  
-  template <typename T>
-  template <typename U>
-  typename std::enable_if<
-    !(std::is_base_of<Serializable, U>::value or std::is_enum<U>::value),
-    void>::type
+  typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
  Reader<T>::read(const std::string &s, U &output)
  {
    upcast->readDefault(s, output);
@@ -300,7 +463,6 @@ namespace Grid {
      abort();
    }
  }
-
 }

 #endif
@@ -0,0 +1,103 @@
+#include <Grid.h>
+
+using namespace Grid;
+#ifndef H5_NO_NAMESPACE
+using namespace H5NS;
+#endif
+
+// Writer implementation ///////////////////////////////////////////////////////
+// Opens fileName with the H5F_ACC_TRUNC access flag, makes the root group
+// current, and records the dataset threshold in the file as the root
+// attribute HDF5_GRID_GUARD "dataset_threshold".
+Hdf5Writer::Hdf5Writer(const std::string &fileName)
+: fileName_(fileName)
+, file_(fileName.c_str(), H5F_ACC_TRUNC)
+{
+  group_ = file_.openGroup("/");
+  writeSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
+                       Hdf5Type<unsigned int>::type());
+}
+
+// Creates the sub-group s inside the current group, makes it current and
+// appends s to the path used by pop() to find the way back.
+void Hdf5Writer::push(const std::string &s)
+{
+  group_ = group_.createGroup(s);
+  path_.push_back(s);
+}
+
+// Leaves the current group: drops the last path component and makes the
+// parent group current again. Must be paired with an earlier push(); calling
+// it with an empty path is not guarded against.
+void Hdf5Writer::pop(void)
+{
+  path_.pop_back();
+  if (!path_.empty())
+  {
+    // rebuild the absolute name "/a/b/..." from the remaining components
+    std::string parentPath;
+    
+    for (const auto &name: path_)
+    {
+      parentPath = parentPath + "/" + name;
+    }
+    group_ = group_.openGroup(parentPath);
+  }
+  else
+  {
+    // back at the top level
+    group_ = file_.openGroup("/");
+  }
+}
+
+// std::string specialisation: writes x as an attribute named s whose HDF5
+// string type has a length of exactly x.size() characters.
+// NOTE(review): for an empty x this requests a zero-length string type and
+// dereferences x.data() - confirm that HDF5 accepts this case.
+template <>
+void Hdf5Writer::writeDefault(const std::string &s, const std::string &x)
+{
+  StrType     strType(PredType::C_S1, x.size());
+  
+  // pass the first character by reference; the length comes from strType
+  writeSingleAttribute(*(x.data()), s, strType);
+}
+
+// C-string overload: converts x to a std::string and forwards to the
+// std::string version, so both kinds of string are stored the same way.
+void Hdf5Writer::writeDefault(const std::string &s, const char *x)
+{
+  writeDefault(s, std::string(x));
+}
+
+// Reader implementation ///////////////////////////////////////////////////////
+// Opens fileName with the H5F_ACC_RDONLY access flag, makes the root group
+// current, and loads the dataset threshold from the root attribute
+// HDF5_GRID_GUARD "dataset_threshold".
+Hdf5Reader::Hdf5Reader(const std::string &fileName)
+: fileName_(fileName)
+, file_(fileName.c_str(), H5F_ACC_RDONLY)
+{
+  group_ = file_.openGroup("/");
+  readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold",
+                      Hdf5Type<unsigned int>::type());
+}
+
+// Opens the existing sub-group s of the current group, makes it current and
+// appends s to the path used by pop() to find the way back.
+void Hdf5Reader::push(const std::string &s)
+{
+  group_ = group_.openGroup(s);
+  path_.push_back(s);
+}
+
+// Leaves the current group: drops the last path component and makes the
+// parent group current again. Must be paired with an earlier push(); calling
+// it with an empty path is not guarded against.
+void Hdf5Reader::pop(void)
+{
+  path_.pop_back();
+  if (!path_.empty())
+  {
+    // rebuild the absolute name "/a/b/..." from the remaining components
+    std::string parentPath;
+    
+    for (const auto &name: path_)
+    {
+      parentPath = parentPath + "/" + name;
+    }
+    group_ = group_.openGroup(parentPath);
+  }
+  else
+  {
+    // back at the top level
+    group_ = file_.openGroup("/");
+  }
+}
+
+// std::string specialisation: reads the attribute named s of the current
+// group into x, sizing x from the attribute's own string type.
+template <>
+void Hdf5Reader::readDefault(const std::string &s, std::string &x)
+{
+  Attribute attribute;
+  
+  attribute       = group_.openAttribute(s);
+  StrType strType = attribute.getStrType();
+  
+  // make room for the stored characters, then read straight into the buffer
+  x.resize(strType.getSize());
+  attribute.read(strType, &(x[0]));
+}
@@ -0,0 +1,242 @@
+#ifndef GRID_SERIALISATION_HDF5_H
+#define GRID_SERIALISATION_HDF5_H
+
+#include <stack>
+#include <string>
+#include <vector>
+#include <H5Cpp.h>
+#include "Hdf5Type.h"
+
+#ifndef H5_NO_NAMESPACE
+#define H5NS H5
+#endif
+
+// default threshold above which datasets are used instead of attributes
+#ifndef HDF5_DEF_DATASET_THRES
+#define HDF5_DEF_DATASET_THRES 6u
+#endif
+
+// name guard for Grid metadata
+#define HDF5_GRID_GUARD "_Grid_"
+
+namespace Grid
+{
+  // Serialisation writer storing values in an HDF5 file. Nested objects are
+  // mapped to HDF5 groups through push()/pop(); values are stored as
+  // attributes, or as datasets for numeric vectors larger than dataSetThres_.
+  class Hdf5Writer: public Writer<Hdf5Writer>
+  {
+  public:
+    Hdf5Writer(const std::string &fileName);
+    virtual ~Hdf5Writer(void) = default;
+    // enter / leave the group named s
+    void push(const std::string &s);
+    void pop(void);
+    // C-string overload
+    void writeDefault(const std::string &s, const char *x);
+    // single value, stored as one attribute
+    template <typename U>
+    void writeDefault(const std::string &s, const U &x);
+    // vector whose innermost element is a number: flattened and stored as one
+    // attribute or dataset
+    template <typename U>
+    typename std::enable_if<element<std::vector<U>>::is_number, void>::type
+    writeDefault(const std::string &s, const std::vector<U> &x);
+    // any other vector: stored element by element inside a group named s
+    template <typename U>
+    typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
+    writeDefault(const std::string &s, const std::vector<U> &x);
+  private:
+    // write x as a one-element attribute called name on the current group
+    template <typename U>
+    void writeSingleAttribute(const U &x, const std::string &name,
+                              const H5NS::DataType &type);
+  private:
+    std::string              fileName_;
+    std::vector<std::string> path_;   // names of the groups entered by push()
+    H5NS::H5File             file_;
+    H5NS::Group              group_;  // current group
+    unsigned int             dataSetThres_{HDF5_DEF_DATASET_THRES};
+  };
+  
+  // Serialisation reader for HDF5 files. Nested objects are navigated as HDF5
+  // groups through push()/pop(); values are read from attributes or datasets.
+  class Hdf5Reader: public Reader<Hdf5Reader>
+  {
+  public:
+    Hdf5Reader(const std::string &fileName);
+    virtual ~Hdf5Reader(void) = default;
+    // enter / leave the group named s
+    void push(const std::string &s);
+    void pop(void);
+    // single value, read from one attribute
+    template <typename U>
+    void readDefault(const std::string &s, U &output);
+    // vector whose innermost element is a number: read from one attribute or
+    // dataset and reshaped
+    template <typename U>
+    typename std::enable_if<element<std::vector<U>>::is_number, void>::type
+    readDefault(const std::string &s, std::vector<U> &x);
+    // any other vector: read element by element from the group named s
+    template <typename U>
+    typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
+    readDefault(const std::string &s, std::vector<U> &x);
+  private:
+    // read the attribute called name of the current group into x
+    template <typename U>
+    void readSingleAttribute(U &x, const std::string &name,
+                             const H5NS::DataType &type);
+  private:
+    std::string              fileName_;
+    std::vector<std::string> path_;   // names of the groups entered by push()
+    H5NS::H5File             file_;
+    H5NS::Group              group_;  // current group
+    unsigned int             dataSetThres_;  // set by the constructor from the file
+  };
+  
+  // Writer template implementation ////////////////////////////////////////////
+  // Create an attribute called name on the current group, with a dataspace of
+  // a single element, and write x to it using the given HDF5 type.
+  template <typename U>
+  void Hdf5Writer::writeSingleAttribute(const U &x, const std::string &name,
+                                        const H5NS::DataType &type)
+  {
+    hsize_t         nElem = 1;
+    H5NS::DataSpace space(1, &nElem);
+    H5NS::Attribute attr = group_.createAttribute(name, type, space);
+    
+    attr.write(type, &x);
+  }
+  
+  // Generic single value: stored as one attribute named s, using the HDF5 type
+  // that Hdf5Type<U> associates with U.
+  template <typename U>
+  void Hdf5Writer::writeDefault(const std::string &s, const U &x)
+  {
+    writeSingleAttribute(x, s, Hdf5Type<U>::type());
+  }
+  
+  template <>
+  void Hdf5Writer::writeDefault(const std::string &s, const std::string &x);
+  
+  // Vector with numeric innermost elements: the (possibly nested) vector is
+  // flattened and written in one go, as a dataset when it holds more than
+  // dataSetThres_ elements and as an attribute otherwise.
+  template <typename U>
+  typename std::enable_if<element<std::vector<U>>::is_number, void>::type
+  Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
+  {
+    typedef typename element<std::vector<U>>::type Element;
+    
+    // flat copy of the data and its dimensions, converted to HDF5 sizes
+    Flatten<std::vector<U>> flat(x);
+    const auto              &flatx   = flat.getFlatVector();
+    const auto              &flatDim = flat.getDim();
+    std::vector<hsize_t>    dim(flatDim.begin(), flatDim.end());
+    
+    // HDF5 description of the data
+    H5NS::DataSpace      dataSpace(dim.size(), dim.data());
+    const H5NS::DataType &type = Hdf5Type<Element>::type();
+    
+    if (flatx.size() > dataSetThres_)
+    {
+      H5NS::DataSet dataSet = group_.createDataSet(s, type, dataSpace);
+      
+      dataSet.write(flatx.data(), type);
+    }
+    else
+    {
+      H5NS::Attribute attribute = group_.createAttribute(s, type, dataSpace);
+      
+      attribute.write(type, flatx.data());
+    }
+  }
+  
+  // Vector with non-numeric elements: a group named s is created holding the
+  // element count (attribute HDF5_GRID_GUARD "vector_size") and one entry
+  // "<s>_<i>" per element, each written through the generic write().
+  template <typename U>
+  typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
+  Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
+  {
+    // The attribute is declared as uint64_t, so HDF5 reads sizeof(uint64_t)
+    // bytes from the object it is given: hand it a real uint64_t rather than
+    // the size_t returned by size(), which may be narrower.
+    const uint64_t size = x.size();
+    
+    push(s);
+    writeSingleAttribute(size, HDF5_GRID_GUARD "vector_size",
+                         Hdf5Type<uint64_t>::type());
+    for (hsize_t i = 0; i < size; ++i)
+    {
+      write(s + "_" + std::to_string(i), x[i]);
+    }
+    pop();
+  }
+  
+  // Reader template implementation ////////////////////////////////////////////
+  // Open the attribute called name on the current group and read it into x
+  // using the given HDF5 type.
+  template <typename U>
+  void Hdf5Reader::readSingleAttribute(U &x, const std::string &name,
+                                       const H5NS::DataType &type)
+  {
+    H5NS::Attribute attr = group_.openAttribute(name);
+    
+    attr.read(type, &x);
+  }
+  
+  // Generic single value: read from the attribute named s, using the HDF5 type
+  // that Hdf5Type<U> associates with U.
+  template <typename U>
+  void Hdf5Reader::readDefault(const std::string &s, U &output)
+  {
+    readSingleAttribute(output, s, Hdf5Type<U>::type());
+  }
+  
+  template <>
+  void Hdf5Reader::readDefault(const std::string &s, std::string &x);
+  
+  // Vector with numeric innermost elements: read the flat data stored under
+  // the name s (attribute or dataset) and rebuild the nested vector from the
+  // dimensions of its dataspace.
+  template <typename U>
+  typename std::enable_if<element<std::vector<U>>::is_number, void>::type
+  Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
+  {
+    // alias to element type
+    typedef typename element<std::vector<U>>::type Element;
+    
+    // Decide once whether s is an attribute or a dataset, from what is
+    // actually present in the file, and use that answer both for the
+    // dimensions and for the data. Choosing the second step from the size
+    // threshold instead could open the wrong kind of object.
+    const bool            isAttribute = group_.attrExists(s);
+    
+    // read the dimensions
+    H5NS::DataSpace       dataSpace;
+    std::vector<hsize_t>  hdim;
+    std::vector<size_t>   dim;
+    hsize_t               size = 1;
+    
+    if (isAttribute)
+    {
+      dataSpace = group_.openAttribute(s).getSpace();
+    }
+    else
+    {
+      dataSpace = group_.openDataSet(s).getSpace();
+    }
+    hdim.resize(dataSpace.getSimpleExtentNdims());
+    dataSpace.getSimpleExtentDims(hdim.data());
+    for (auto &d: hdim)
+    {
+      dim.push_back(d);
+      size *= d;
+    }
+    
+    // read the flat vector
+    std::vector<Element> buf(size);
+
+    if (isAttribute)
+    {
+      H5NS::Attribute attribute;
+      
+      attribute = group_.openAttribute(s);
+      attribute.read(Hdf5Type<Element>::type(), buf.data());
+    }
+    else
+    {
+      H5NS::DataSet dataSet;
+      
+      dataSet = group_.openDataSet(s);
+      dataSet.read(buf.data(), Hdf5Type<Element>::type());
+    }
+    
+    // reconstruct the multidimensional vector
+    Reconstruct<std::vector<U>> r(buf, dim);
+    
+    x = r.getVector();
+  }
+  
+  // Vector with non-numeric elements: enter the group named s, size x from the
+  // stored element count (attribute HDF5_GRID_GUARD "vector_size") and read
+  // each entry "<s>_<i>" through the generic read().
+  template <typename U>
+  typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
+  Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
+  {
+    uint64_t nElem;
+    
+    push(s);
+    readSingleAttribute(nElem, HDF5_GRID_GUARD "vector_size",
+                        Hdf5Type<uint64_t>::type());
+    x.resize(nElem);
+    for (hsize_t i = 0; i < x.size(); ++i)
+    {
+      const std::string elemName = s + "_" + std::to_string(i);
+      
+      read(elemName, x[i]);
+    }
+    pop();
+  }
+}
+
+#endif
@@ -0,0 +1,77 @@
+#ifndef GRID_SERIALISATION_HDF5_TYPE_H
+#define GRID_SERIALISATION_HDF5_TYPE_H
+
+#include <H5Cpp.h>
+#include <complex>
+#include <memory>
+
+#ifndef H5_NO_NAMESPACE
+#define H5NS H5
+#endif
+
+// Generates the Hdf5Type specialisation for the C++ type cType: type() returns
+// the predefined HDF5 type H5NS::PredType::predType and isNative is true.
+#define HDF5_NATIVE_TYPE(predType, cType)\
+template <>\
+class Hdf5Type<cType>\
+{\
+public:\
+  static inline const H5NS::DataType & type(void)\
+  {\
+    return H5NS::PredType::predType;\
+  }\
+  static constexpr bool isNative = true;\
+};
+
+// Expands HDF5_NATIVE_TYPE once for each built-in arithmetic type listed
+// below, pairing it with the corresponding predefined HDF5 native type.
+#define DEFINE_HDF5_NATIVE_TYPES \
+HDF5_NATIVE_TYPE(NATIVE_B8,      bool);\
+HDF5_NATIVE_TYPE(NATIVE_CHAR,    char);\
+HDF5_NATIVE_TYPE(NATIVE_SCHAR,   signed char);\
+HDF5_NATIVE_TYPE(NATIVE_UCHAR,   unsigned char);\
+HDF5_NATIVE_TYPE(NATIVE_SHORT,   short);\
+HDF5_NATIVE_TYPE(NATIVE_USHORT,  unsigned short);\
+HDF5_NATIVE_TYPE(NATIVE_INT,     int);\
+HDF5_NATIVE_TYPE(NATIVE_UINT,    unsigned int);\
+HDF5_NATIVE_TYPE(NATIVE_LONG,    long);\
+HDF5_NATIVE_TYPE(NATIVE_ULONG,   unsigned long);\
+HDF5_NATIVE_TYPE(NATIVE_LLONG,   long long);\
+HDF5_NATIVE_TYPE(NATIVE_ULLONG,  unsigned long long);\
+HDF5_NATIVE_TYPE(NATIVE_FLOAT,   float);\
+HDF5_NATIVE_TYPE(NATIVE_DOUBLE,  double);\
+HDF5_NATIVE_TYPE(NATIVE_LDOUBLE, long double);
+
+namespace Grid
+{
+  // Maps a C++ type T to the HDF5 datatype used to store it. The primary
+  // template provides no type() function: only the specialisations below can
+  // be stored directly.
+  template <typename T> class Hdf5Type
+  {
+  public:
+    static constexpr bool isNative = false;
+  };
+  
+  // specialisations for the built-in arithmetic types
+  DEFINE_HDF5_NATIVE_TYPES;
+  
+  // std::complex<R> is stored as a compound type with two members of type R,
+  // "re" at offset 0 and "im" at offset sizeof(R).
+  template <typename R>
+  class Hdf5Type<std::complex<R>>
+  {
+  public:
+    static inline const H5NS::DataType & type(void)
+    {
+      // the compound type is built on first use and then reused
+      if (typePtr_ == nullptr)
+      {
+        typePtr_.reset(new H5NS::CompType(sizeof(std::complex<R>)));
+        typePtr_->insertMember("re", 0,         Hdf5Type<R>::type());
+        typePtr_->insertMember("im", sizeof(R), Hdf5Type<R>::type());
+      }
+
+      return *typePtr_;
+    }
+    static constexpr bool isNative = false;
+  private:
+    static std::unique_ptr<H5NS::CompType> typePtr_;
+  };
+  
+  // storage for the lazily built compound type, one per R
+  template <typename R>
+  std::unique_ptr<H5NS::CompType> Hdf5Type<std::complex<R>>::typePtr_ = nullptr;
+}
+
+#undef HDF5_NATIVE_TYPE
+
+#endif /* GRID_SERIALISATION_HDF5_TYPE_H */
@@ -109,40 +109,36 @@ THE SOFTWARE.
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 #define GRID_MACRO_MEMBER(A,B)        A B;
+#define GRID_MACRO_COMP_MEMBER(A,B) result = (result and (lhs. B == rhs. B));
 #define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
 #define GRID_MACRO_READ_MEMBER(A,B) Grid::read(RD,#B,obj. B);
 #define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);

-#define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)		\
-  \
-  \
-  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
-  \
-  \
-  template <typename T>\
-  static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
-    push(WR,s);\
-    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
-    pop(WR);\
-  } \
-  \
-  \
-  template <typename T>\
-  static inline void read(Reader<T> &RD,const std::string &s, cname &obj){	\
-    push(RD,s);\
-    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
-    pop(RD);\
-  } \
-  \
-  \
-  friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
-    os<<"class "<<#cname<<" {"<<std::endl;\
-    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
-      os<<"}";								\
-    return os;\
-  };  
-
-
+// To be used inside the body of class cname with a list of (type, name) pairs.
+// Declares each pair as a data member and generates: static write()/read()
+// that serialise every member between push(s)/pop(), a stream operator<< that
+// prints every member, and an operator== that compares the members one by one.
+#define GRID_SERIALIZABLE_CLASS_MEMBERS(cname,...)\
+GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
+template <typename T>\
+static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
+  push(WR,s);\
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
+  pop(WR);\
+}\
+template <typename T>\
+static inline void read(Reader<T> &RD,const std::string &s, cname &obj){	\
+  push(RD,s);\
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
+  pop(RD);\
+}\
+friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
+  os<<"class "<<#cname<<" {"<<std::endl;\
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__))	\
+    os<<"}";								\
+  return os;\
+}\
+friend inline bool operator==(const cname &lhs, const cname &rhs) {\
+  bool result = true;\
+  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_COMP_MEMBER,__VA_ARGS__))\
+  return result;\
+}

 #define GRID_ENUM_TYPE(obj) std::remove_reference<decltype(obj)>::type
 #define GRID_MACRO_ENUMVAL(A,B) A = B,
@@ -150,44 +146,52 @@ THE SOFTWARE.
 #define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;}
 #define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break;

-namespace Grid {
-  template <typename U>
-  class EnumIO {};
-}
-
 #define GRID_SERIALIZABLE_ENUM(name,undefname,...)\
-  enum class name {\
-      GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\
-      undefname = -1\
+class name: public Grid::Serializable\
+{\
+public:\
+  enum EnumType\
+  {\
+    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\
+    undefname = -1\
  };\
+public:\
+  name(void): value_(undefname) {};\
+  name(EnumType value): value_(value) {};\
+  template <typename T>\
+  static inline void write(Grid::Writer<T> &WR,const std::string &s, const name &obj)\
+  {\
+    switch (obj.value_)\
+    {\
+      GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
+      default: Grid::write(WR,s,#undefname); break;\
+    }\
+  }\
  \
-  template<>\
-  class EnumIO<name> {\
-    public:\
-      template <typename T>\
-      static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \
-        switch (obj) {\
-          GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
-          default: Grid::write(WR,s,#undefname); break;\
-        }\
-      }\
-      \
-      template <typename T>\
-      static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \
-        std::string buf;\
-        Grid::read(RD, s, buf);\
-        if (buf == #undefname) {obj = name::undefname;}\
-        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
-        else {obj = name::undefname;}\
-      }\
-  };\
-  \
-  inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
+  template <typename T>\
+  static inline void read(Grid::Reader<T> &RD,const std::string &s, name &obj)\
+  {\
+    std::string buf;\
+    Grid::read(RD, s, buf);\
+    if (buf == #undefname) {obj = name::undefname;}\
+    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\
+    else {obj = name::undefname;}\
+  }\
+  inline operator EnumType(void) const\
+  {\
+    return value_;\
+  }\
+  inline friend std::ostream & operator<<(std::ostream &os, const name &obj)\
+  {\
    switch (obj) {\
-        GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
-        default: os << #undefname; break;\
+      GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
+      default: os << #undefname; break;\
    }\
    return os;\
-  };
+  }\
+private:\
+  EnumType value_;\
+};
+

 #endif
@@ -36,6 +36,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include "BinaryIO.h"
 #include "TextIO.h"
 #include "XmlIO.h"
+#ifdef HAVE_HDF5
+#include "Hdf5IO.h"
+#endif
 //////////////////////////////////////////
 // Todo:
 //////////////////////////////////////////
@@ -213,6 +213,29 @@ namespace Optimization {
    }
  };

+  // AVX: multiply b by the real parts of a. As the inline notes say, the real
+  // element of each complex pair of a is duplicated ("ar ar") and the result
+  // is multiplied element-wise with b, giving (ar*br, ar*bi) per pair.
+  struct MultRealPart{
+    // Complex float
+    inline __m256 operator()(__m256 a, __m256 b){
+      __m256 ymm0;
+      ymm0  = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
+      return  _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
+    }
+    // Complex double
+    inline __m256d operator()(__m256d a, __m256d b){
+      __m256d ymm0;
+      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
+      return _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
+    }
+  };
+  // AVX: same product as MultRealPart (real parts of a duplicated, then
+  // multiplied element-wise with b) with c added to the result.
+  struct MaddRealPart{
+    // Complex float
+    inline __m256 operator()(__m256 a, __m256 b, __m256 c){
+      __m256 ymm0 =  _mm256_moveldup_ps(a); // ymm0 <- ar ar,
+      return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c);                         
+    }
+    // Complex double
+    inline __m256d operator()(__m256d a, __m256d b, __m256d c){
+      __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 );
+      return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c);                         
+    }
+  };
+
  struct MultComplex{
    // Complex float
    inline __m256 operator()(__m256 a, __m256 b){
@@ -627,7 +650,9 @@ namespace Optimization {
  typedef Optimization::Sub         SubSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
-  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultComplex  MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
@@ -189,6 +189,29 @@ namespace Optimization {
  //  2mul,4 mac +add+sub = 8 flop type insns
  //  3shuf + 2 (+shuf)   = 5/6 simd perm and 1/2 the load.

+  // AVX-512: multiply b by the real parts of a. As the inline notes say, the
+  // real element of each complex pair of a is duplicated ("ar ar") and the
+  // result is multiplied element-wise with b, giving (ar*br, ar*bi) per pair.
+  struct MultRealPart{
+    // Complex float
+    inline __m512 operator()(__m512 a, __m512 b){
+      __m512 ymm0;
+      ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar,
+      return _mm512_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
+    }
+    // Complex double
+    inline __m512d operator()(__m512d a, __m512d b){
+      __m512d ymm0;
+      ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00
+      return _mm512_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
+    }
+  };
+  // AVX-512: same product as MultRealPart (real parts of a duplicated, then
+  // multiplied element-wise with b) with c added, using a fused multiply-add.
+  struct MaddRealPart{
+    // Complex float
+    inline __m512 operator()(__m512 a, __m512 b, __m512 c){
+      __m512 ymm0 =  _mm512_moveldup_ps(a); // ymm0 <- ar ar,
+      return _mm512_fmadd_ps( ymm0, b, c);                         
+    }
+    // Complex double
+    inline __m512d operator()(__m512d a, __m512d b, __m512d c){
+      __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 );
+      return _mm512_fmadd_pd( ymm0, b, c);                         
+    }
+  };
+
  struct MultComplex{
    // Complex float
    inline __m512 operator()(__m512 a, __m512 b){
@@ -501,6 +524,8 @@ namespace Optimization {
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
@@ -224,6 +224,21 @@ namespace Optimization {
  #define cmul(a, b, c, i)\
  c[i]   = a[i]*b[i]   - a[i+1]*b[i+1];\
  c[i+1] = a[i]*b[i+1] + a[i+1]*b[i];
+
+  // Generic (non-intrinsic) version: multiply every complex element of b by
+  // the real part of the matching complex element of a. Elements are stored
+  // interleaved, real part at index 2*i and imaginary part at 2*i+1.
+  struct MultRealPart{
+    template <typename T>
+    inline vec<T> operator()(vec<T> a, vec<T> b){
+      vec<T> out;
+      
+      VECTOR_FOR(i, W<T>::c, 1)
+      {
+         // the data lives in the member array v: index that, not the vec<T>
+         // object itself
+         out.v[2*i]   = a.v[2*i]*b.v[2*i];
+         out.v[2*i+1] = a.v[2*i]*b.v[2*i+1];
+      }      
+      return out;
+    }
+  };
+
  
  struct MultComplex{
    // Complex
@@ -456,6 +471,7 @@ namespace Optimization {
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
@@ -220,6 +220,14 @@ namespace Optimization {
    }
  };
  
+  // QPX: MultRealPart operation, implemented with a single vec_xmul(a, b).
+  // The float overload is generated from the double one by FLOAT_WRAP_2.
+  // NOTE(review): the argument order was apparently switched (see the
+  // commented-out call) -- confirm against the vec_xmul specification.
+  struct MultRealPart{
+    // Complex double
+    inline vector4double operator()(vector4double a, vector4double b){
+  //      return vec_xmul(b, a);
+        return vec_xmul(a, b);
+    }
+    FLOAT_WRAP_2(operator(), inline)
+  };
  struct MultComplex{
    // Complex double
    inline vector4double operator()(vector4double a, vector4double b){
@@ -430,6 +438,7 @@ typedef Optimization::Sub         SubSIMD;
 typedef Optimization::Mult        MultSIMD;
 typedef Optimization::Div         DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
+typedef Optimization::MultRealPart MultRealPartSIMD;
 typedef Optimization::Conj        ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
 typedef Optimization::TimesI      TimesISIMD;
@@ -177,6 +177,29 @@ namespace Optimization {
    }
  };

+  // SSE: multiply b by the real parts of a. As the inline notes say, the real
+  // element of each complex pair of a is duplicated ("ar ar") and the result
+  // is multiplied element-wise with b, giving (ar*br, ar*bi) per pair.
+  struct MultRealPart{
+    // Complex float
+    inline __m128 operator()(__m128 a, __m128 b){
+      __m128 ymm0;
+      ymm0  = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
+      return  _mm_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
+    }
+    // Complex double
+    inline __m128d operator()(__m128d a, __m128d b){
+      __m128d ymm0;
+      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
+      return _mm_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
+    }
+  };
+  // SSE: same product as MultRealPart (real parts of a duplicated, then
+  // multiplied element-wise with b) with c added to the result.
+  struct MaddRealPart{
+    // Complex float
+    inline __m128 operator()(__m128 a, __m128 b, __m128 c){
+      __m128 ymm0 =  _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
+      return _mm_add_ps(_mm_mul_ps( ymm0, b),c);                         
+    }
+    // Complex double
+    inline __m128d operator()(__m128d a, __m128d b, __m128d c){
+      __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 );
+      return _mm_add_pd(_mm_mul_pd( ymm0, b),c);                         
+    }
+  };
+
  struct MultComplex{
    // Complex float
    inline __m128 operator()(__m128 a, __m128 b){
@@ -325,9 +348,11 @@ namespace Optimization {
      }
    }
  
+#ifndef _mm_alignr_epi64
 #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
 #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
-    
+#endif 
+
    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };

@@ -415,6 +440,8 @@ namespace Optimization {
  typedef Optimization::Div         DivSIMD;
  typedef Optimization::Mult        MultSIMD;
  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::MultRealPart MultRealPartSIMD;
+  typedef Optimization::MaddRealPart MaddRealPartSIMD;
  typedef Optimization::Conj        ConjSIMD;
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;
@@ -101,6 +101,11 @@ template <typename T> using IfNotInteger = Invoke<std::enable_if<!std::is_integr
 // general forms to allow for vsplat syntax
 // need explicit declaration of types when used since
 // clang cannot automatically determine the output type sometimes
+// Three-operand counterpart of binary(): applies op to the three sources and
+// returns the result converted to Out, which the caller names explicitly.
+template <class Out, class Input1, class Input2, class Input3, class Operation>
+Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) {
+  return op(src_1, src_2, src_3);
+}
+
 template <class Out, class Input1, class Input2, class Operation>
 Out binary(Input1 src_1, Input2 src_2, Operation op) {
  return op(src_1, src_2);
@@ -178,6 +183,7 @@ class Grid_simd {
                          const Grid_simd *__restrict__ r) {
    *y = (*l) * (*r);
  }
+
  friend inline void sub(Grid_simd *__restrict__ y,
                         const Grid_simd *__restrict__ l,
                         const Grid_simd *__restrict__ r) {
@@ -188,7 +194,6 @@ class Grid_simd {
                         const Grid_simd *__restrict__ r) {
    *y = (*l) + (*r);
  }
-
  friend inline void mac(Grid_simd *__restrict__ y,
                         const Scalar_type *__restrict__ a,
                         const Grid_simd *__restrict__ x) {
@@ -260,7 +265,7 @@ class Grid_simd {
  }

  ////////////////////////////
-  // opreator scalar * simd
+  // operator scalar * simd
  ////////////////////////////
  friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) {
    Grid_simd va;
@@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
  S* typepun =(S*) &src;
  vsplat(ret,typepun[lane]);
 }    
+// Complex types only: views src as an array of scalars S, takes the real part
+// of the scalar at index lane and fills ret with it through VsplatSIMD.
+template <class S, class V, IfComplex<S> =0> 
+inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
+  S* typepun =(S*) &src;
+  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
+}    

 ///////////////////////
 // Splat
@@ -449,6 +459,10 @@ template <class S, class V>
 inline void vsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
  vsplat(ret, real(c), imag(c));
 }
+// Complex types only: splat using the real part of c for both arguments of
+// the two-value vsplat (the imaginary part of c is not used).
+template <class S, class V>
+inline void rsplat(Grid_simd<S, V> &ret, EnableIf<is_complex<S>, S> c) {
+  vsplat(ret, real(c), real(c));
+}

 // if real fill with a, if complex fill with a in the real part (first function
 // above)
@@ -550,6 +564,21 @@ inline Grid_simd<S, V> operator-(Grid_simd<S, V> a, Grid_simd<S, V> b) {
  return ret;
 };

+// Distinguish between complex types and others
+// Complex types only: applies the back-end MultRealPartSIMD operation to the
+// raw vectors of a and b and returns the result.
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_mult(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, MultRealPartSIMD());
+  return ret;
+};
+// Complex types only: applies the back-end MaddRealPartSIMD operation to the
+// raw vectors of a, b and c and returns the result.
+template <class S, class V, IfComplex<S> = 0>
+inline Grid_simd<S, V> real_madd(Grid_simd<S, V> a, Grid_simd<S, V> b, Grid_simd<S,V> c) {
+  Grid_simd<S, V> ret;
+  ret.v = trinary<V>(a.v, b.v, c.v, MaddRealPartSIMD());
+  return ret;
+};
+
+
 // Distinguish between complex types and others
 template <class S, class V, IfComplex<S> = 0>
 inline Grid_simd<S, V> operator*(Grid_simd<S, V> a, Grid_simd<S, V> b) {
@@ -95,10 +95,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VIDUPd(SRC,DEST)       "vpshufd  $0xee," #SRC"," #DEST  ";\n" // 32 bit level: 3,2,3,2
 #define VIDUPf(SRC,DEST)         "vmovshdup " #SRC ", " #DEST  ";\n"

-#define VBCASTRDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST  ";\n" 
-#define VBCASTIDUPd(OFF,A,DEST)           "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST  ";\n" 
-#define VBCASTRDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
-#define VBCASTIDUPf(OFF,PTR,DEST)         "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"
+#define VBCASTRDUPd(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*16+0)(" #A ")," #DEST  ";\n" 
+#define VBCASTIDUPd(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*16+8)(" #A ")," #DEST  ";\n" 
+#define VBCASTRDUPf(OFF,PTR,DEST)         "vbroadcastss    (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
+#define VBCASTIDUPf(OFF,PTR,DEST)         "vbroadcastss    (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"
+#define VBCASTCDUPf(OFF,A,DEST)           "vbroadcastsd    (" #OFF "*64  )(" #A ")," #DEST  ";\n" 
+#define VBCASTZDUPf(OFF,A,DEST)           "vbroadcastf32x4 (" #OFF "*64  )(" #A ")," #DEST  ";\n" 
+#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST) 
+#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST) 

 #define VMADDSUBf(A,B,accum) "vfmaddsub231ps   " #A "," #B "," #accum  ";\n"
 #define VMADDSUBd(A,B,accum) "vfmaddsub231pd   " #A "," #B "," #accum  ";\n"
@@ -106,11 +110,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd   " #O"*64("#P "),"#B "," #accum  ";\n"


+#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
+#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMULRDUPf(O,P,B,accum) "vmulps   (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
 #define VMULIDUPf(O,P,B,accum) "vmulps   (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"

+#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
+#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd   (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
 #define VMULRDUPd(O,P,B,accum) "vmulpd   (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
@@ -87,7 +87,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  VACCTIMESMINUSI1d(A,ACC,tmp)				\
  VACCTIMESMINUSI2d(A,ACC,tmp)			

-#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A :  : "r"(ptr)  : #A  );
+#define LOAD64a(A,ptr)  "movq %0, %" #A :  : "r"(ptr)  : #A  
+#define LOAD64i(A,ptr)  __asm__ ( LOAD64a(A,ptr));
 #define LOAD64(A,ptr)  LOAD64i(A,ptr)

 #define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
@@ -108,8 +109,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 //  "clevict0 "#O"*64("#A");\n" 

-#define VLOADf(OFF,PTR,DEST)   "vmovaps  " #OFF "*64(" #PTR "), " #DEST  ";\n"
-#define VLOADd(OFF,PTR,DEST)   "vmovapd  " #OFF "*64(" #PTR "), " #DEST  ";\n"
+#define VLOADf(OFF,PTR,DEST)   "vmovups  " #OFF "*64(" #PTR "), " #DEST  ";\n"
+#define VLOADd(OFF,PTR,DEST)   "vmovupd  " #OFF "*64(" #PTR "), " #DEST  ";\n"

 #define VADDf(A,B,DEST)        "vaddps   " #A "," #B "," #DEST  ";\n"
 #define VADDd(A,B,DEST)        "vaddpd   " #A "," #B "," #DEST  ";\n"
@@ -143,8 +144,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VSTOREf(OFF,PTR,SRC)   "vmovntps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #define VSTOREd(OFF,PTR,SRC)   "vmovntpd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #else
-#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
-#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#define VSTOREf(OFF,PTR,SRC)   "vmovups " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
+#define VSTOREd(OFF,PTR,SRC)   "vmovupd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 #endif

 // Swaps Re/Im ; could unify this with IMCI
@@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
 #define VMADDMEM(O,P,B,accum)    VMADDMEMd(O,P,B,accum)
 #define VMULMEM(O,P,B,accum)     VMULMEMd(O,P,B,accum)
+#undef VMADDRDUP   
 #undef VMADDSUBRDUP   
 #undef VMADDSUBIDUP   
 #undef VMULRDUP   
 #undef VMULIDUP   
+#define VMADDRDUP(O,P,B,accum)    VMADDRDUPd(O,P,B,accum) 
 #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) 
 #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) 
 #define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)      
@@ -144,10 +144,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
 #define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)

+#undef VMADDRDUP   
 #undef VMADDSUBRDUP   
 #undef VMADDSUBIDUP   
 #undef VMULRDUP   
 #undef VMULIDUP   
+#define VMADDRDUP(O,P,B,accum)    VMADDRDUPf(O,P,B,accum) 
 #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) 
 #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) 
 #define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)      
@@ -4,9 +4,8 @@ home=`pwd`

 # library Make.inc
 cd $home/lib
-HFILES=`find . -type f -name '*.h' -not -path '*/Old/*' -not -path '*/Eigen/*'`
-HFILES="$HFILES"
-CCFILES=`find . -type f -name '*.cc' -not  -name '*ommunicator*.cc'`
+HFILES=`find . -type f -name '*.h' -not -name '*Hdf5*' -not -path '*/Old/*' -not -path '*/Eigen/*'`
+CCFILES=`find . -type f -name '*.cc' -not -name '*Communicator*.cc' -not -name '*Hdf5*'`
 echo HFILES=$HFILES > Make.inc
 echo >> Make.inc
 echo CCFILES=$CCFILES >> Make.inc
@@ -28,130 +28,152 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #include <Grid/Grid.h>

-namespace Grid {
-  
-  GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3);
-    
-  class myclass: Serializable {
-  public:
-    
-    GRID_SERIALIZABLE_CLASS_MEMBERS(myclass,
-                            myenum, e,
-                            std::vector<myenum>, ve,
-                            std::string, name,
-                            int, x,
-                            double, y,
-                            bool , b,
-                            std::vector<double>, array,
-                            std::vector<std::vector<double>>, twodimarray,
-                            );
-    
-    myclass() {}
-    myclass(int i)
-    : array(4,5.1), twodimarray(3,std::vector<double>(2,1.23456)), ve(2, myenum::blue)
-    {
-      e=myenum::red;
-      x=i;
-      y=2*i;
-      b=true;
-      name="bother said pooh";
-    }
-  };
-  
-}
-
 using namespace Grid;

-int16_t i16 = 1;
+GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3);
+  
+// Serialisable test fixture holding one member of each kind exercised by the
+// IO tests below: an enum, a vector of enums, a string, scalar values and
+// nested vectors up to three levels deep.
+class myclass: Serializable {
+public:
+  GRID_SERIALIZABLE_CLASS_MEMBERS(myclass,
+                          myenum, e,
+                          std::vector<myenum>, ve,
+                          std::string, name,
+                          int, x,
+                          double, y,
+                          bool , b,
+                          std::vector<double>, array,
+                          std::vector<std::vector<double>>, twodimarray,
+                          std::vector<std::vector<std::vector<Complex>>>, cmplx3darray
+                          );
+  // Default constructor: no member is explicitly initialised here.
+  myclass() {}
+  // Fill every member with a fixed value; x is set to i and y to 2*i.
+  // The initialiser list follows the order of the member list above, so the
+  // initialisers run in the order they are written (assumes the macro
+  // declares the members in the listed order -- TODO confirm) and the
+  // compiler has no reason to emit a -Wreorder warning.
+  myclass(int i)
+  : ve(2, myenum::blue)
+  , array(4,5.1)
+  , twodimarray(3,std::vector<double>(5, 1.23456))
+  , cmplx3darray(3,std::vector<std::vector<Complex>>(5, std::vector<Complex>(7, Complex(1.2, 3.4))))
+  {
+    e=myenum::red;
+    x=i;
+    y=2*i;
+    b=true;
+    name="bother said pooh";
+  }
+};
+
+int16_t  i16 = 1;
 uint16_t u16 = 2;
-int32_t i32 = 3;
+int32_t  i32 = 3;
 uint32_t u32 = 4;
-int64_t i64 = 5;
+int64_t  i64 = 5;
 uint64_t u64 = 6;
-float    f = M_PI;
-double   d = 2*M_PI;
-bool     b = false;
+float    f   = M_PI;
+double   d   = 2*M_PI;
+bool     b   = false;
+
+// Round-trip check for one writer/reader pair.
+//   W, R     : writer and reader types, each constructed from the file name
+//   O        : object type; must be default-constructible and support ==
+//   filename : file written through W, then read back through R
+//   object   : value stored under the key "testobject"
+//   name     : label printed in front of the result
+// Prints "<name> IO test: success" or "<name> IO test: failure" and ends the
+// program with EXIT_FAILURE when the value read back differs from object.
+template <typename W, typename R, typename O>
+void ioTest(const std::string &filename, const O &object, const std::string &name)
+{
+  // the writer lives in its own scope: it has to be destroyed so that the
+  // data is physically written before the reader opens the file
+  {
+    W out(filename);
+
+    write(out, "testobject", object);
+  }
+
+  R in(filename);
+  O readBack;
+
+  read(in, "testobject", readBack);
+
+  const bool matches = (object == readBack);
+
+  std::cout << name << " IO test: ";
+  if (matches)
+  {
+    std::cout << "success";
+  }
+  else
+  {
+    std::cout << "failure";
+  }
+  std::cout << std::endl;
+  if (matches)
+  {
+    return;
+  }
+  exit(EXIT_FAILURE);
+}

 int main(int argc,char **argv)
 {
-  {
-    XmlWriter WR("bother.xml");
-    
-    // test basic type writing
-    push(WR,"BasicTypes");
-    write(WR,std::string("i16"),i16);
-    write(WR,"u16",u16);
-    write(WR,"i32",i32);
-    write(WR,"u32",u32);
-    write(WR,"i64",i64);
-    write(WR,"u64",u64);
-    write(WR,"f",f);
-    write(WR,"d",d);
-    write(WR,"b",b);
-    pop(WR);
-    
-    // test serializable class writing
-    myclass obj(1234); // non-trivial constructor
-    write(WR,"obj",obj);
-    WR.write("obj2", obj);
-    std::cout << obj << std::endl;
-    
-    std::vector<myclass> vec;
-    vec.push_back(myclass(1234));
-    vec.push_back(myclass(5678));
-    vec.push_back(myclass(3838));
-    write(WR, "objvec", vec);
-  };
+  std::cout << "==== basic IO" << std::endl;
+  XmlWriter WR("bother.xml");
+  
+  // test basic type writing
+  std::cout << "-- basic writing to 'bother.xml'..." << std::endl;
+  push(WR,"BasicTypes");
+  write(WR,std::string("i16"),i16);
+  write(WR,"u16",u16);
+  write(WR,"i32",i32);
+  write(WR,"u32",u32);
+  write(WR,"i64",i64);
+  write(WR,"u64",u64);
+  write(WR,"f",f);
+  write(WR,"d",d);
+  write(WR,"b",b);
+  pop(WR);
+  
+  // test serializable class writing
+  myclass              obj(1234); // non-trivial constructor
+  std::vector<myclass> vec;
+  
+  std::cout << "-- serialisable class writing to 'bother.xml'..." << std::endl;
+  write(WR,"obj",obj);
+  WR.write("obj2", obj);
+  vec.push_back(myclass(1234));
+  vec.push_back(myclass(5678));
+  vec.push_back(myclass(3838));
+  write(WR, "objvec", vec);
+  std::cout << "-- serialisable class writing to std::cout:" << std::endl;
+  std::cout << obj << std::endl;
+  std::cout << "-- serialisable class comparison:" << std::endl;
+  std::cout << "vec[0] == obj: " << ((vec[0] == obj) ? "true" : "false") << std::endl;
+  std::cout << "vec[1] == obj: " << ((vec[1] == obj) ? "true" : "false") << std::endl;
  
  // read tests
-  myclass copy1, copy2, copy3;
-  std::vector<myclass> veccopy1, veccopy2, veccopy3;
+  std::cout << "\n==== IO self-consistency tests" << std::endl;
  //// XML
-  {
-    XmlReader RD("bother.xml");
-    read(RD,"obj",copy1);
-    read(RD,"objvec", veccopy1);
-    std::cout << "Loaded (XML) -----------------" << std::endl;
-    std::cout << copy1 << std::endl << veccopy1 << std::endl;
-  }
+  ioTest<XmlWriter, XmlReader>("iotest.xml", obj, "XML    (object)           ");
+  ioTest<XmlWriter, XmlReader>("iotest.xml", vec, "XML    (vector of objects)");
  //// binary
-  {
-    BinaryWriter BWR("bother.bin");
-    write(BWR,"discard",copy1 );
-    write(BWR,"discard",veccopy1 );
-  }
-  {
-    BinaryReader BRD("bother.bin");
-    read (BRD,"discard",copy2 );
-    read (BRD,"discard",veccopy2 );
-    std::cout << "Loaded (bin) -----------------" << std::endl;
-    std::cout << copy2 << std::endl << veccopy2 << std::endl;
-  }
+  ioTest<BinaryWriter, BinaryReader>("iotest.bin", obj, "binary (object)           ");
+  ioTest<BinaryWriter, BinaryReader>("iotest.bin", vec, "binary (vector of objects)");
  //// text
-  {
-    TextWriter TWR("bother.txt");
-    write(TWR,"discard",copy1 );
-    write(TWR,"discard",veccopy1 );
-  }
-  {
-    TextReader TRD("bother.txt");
-    read (TRD,"discard",copy3 );
-    read (TRD,"discard",veccopy3 );
-    std::cout << "Loaded (txt) -----------------" << std::endl;
-    std::cout << copy3 << std::endl << veccopy3 << std::endl;
-  }
+  ioTest<TextWriter, TextReader>("iotest.dat", obj, "text   (object)           ");
+  ioTest<TextWriter, TextReader>("iotest.dat", vec, "text   (vector of objects)");
+  //// HDF5
+#ifdef HAVE_HDF5
+  ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", obj, "HDF5   (object)           ");
+  ioTest<Hdf5Writer, Hdf5Reader>("iotest.h5", vec, "HDF5   (vector of objects)");
+#endif
  
-  std::vector<int> iv = strToVec<int>("1 2 2 4");
-  std::vector<std::string> sv = strToVec<std::string>("bli bla blu");
+  std::cout << "\n==== vector flattening/reconstruction" << std::endl;
+  typedef std::vector<std::vector<std::vector<double>>> vec3d;
  
-  for (auto &e: iv)
+  vec3d dv, buf;
+  double d = 0.;
+  
+  dv.resize(4);
+  for (auto &v1: dv)
  {
-    std::cout << e << " ";
+    v1.resize(3);
+    for (auto &v2: v1)
+    {
+      v2.resize(5);
+      for (auto &x: v2)
+      {
+        x = d++;
+      }
+    }
  }
-  std::cout << std::endl;
-  for (auto &e: sv)
-  {
-    std::cout << e << " ";
-  }
-  std::cout << std::endl;
+  std::cout << "original 3D vector:" << std::endl;
+  std::cout << dv << std::endl;
+  
+  Flatten<vec3d> flatdv(dv);
+  
+  std::cout << "\ndimensions:" << std::endl;
+  std::cout << flatdv.getDim() << std::endl;
+  std::cout << "\nflattened vector:" << std::endl;
+  std::cout << flatdv.getFlatVector() << std::endl;
+  
+  Reconstruct<vec3d> rec(flatdv.getFlatVector(), flatdv.getDim());
+  std::cout << "\nreconstructed vector:" << std::endl;
+  std::cout << flatdv.getVector() << std::endl;
 }