Grid (mirror of https://github.com/paboyle/Grid.git)
Commit 9cb90f714e: Merge remote-tracking branch 'origin/develop' into temporary-smearing
.gitignore (vendored): 2 changed lines

@@ -94,7 +94,7 @@ Thumbs.db
 
 # build directory #
 ###################
-build/*
+build*/*
 
 # IDE related files #
 #####################
.travis.yml (new file): 90 lines

@@ -0,0 +1,90 @@
+language: cpp
+
+cache:
+  directories:
+    - clang
+
+matrix:
+  include:
+    - os: osx
+      osx_image: xcode7.2
+      compiler: clang
+    - os: osx
+      osx_image: xcode7.2
+      compiler: gcc
+      env: VERSION=-5
+    - compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: VERSION=-4.9
+    - compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-5
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: VERSION=-5
+    - compiler: clang
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.8
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
+    - compiler: clang
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.8
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
+
+before_install:
+    - export GRIDDIR=`pwd`
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
+
+install:
+    - export CC=$CC$VERSION
+    - export CXX=$CXX$VERSION
+    - echo $PATH
+    - which $CC
+    - $CC --version
+    - which $CXX
+    - $CXX --version
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
+
+script:
+    - ./scripts/reconfigure_script
+    - mkdir build
+    - cd build
+    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
+    - make -j4
+    - ./benchmarks/Benchmark_dwf --threads 1
README.md

@@ -1,4 +1,4 @@
-# Grid
+# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid)
 Data parallel C++ mathematical object library
 
 Last update 2015/7/30
benchmarks/Benchmark_dwf.cc

@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */
 #include <Grid.h>
+#include <PerfCount.h>
 
 using namespace std;
 using namespace Grid;
@@ -45,6 +46,10 @@ struct scal {
 };
 
 bool overlapComms = false;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+
 
 int main (int argc, char ** argv)
 {
@@ -64,6 +69,12 @@ int main (int argc, char ** argv)
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid  = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridCartesian         * sFGrid  = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
 
@@ -78,7 +89,9 @@ int main (int argc, char ** argv)
 
   ColourMatrix cm = Complex(1.0,0.0);
 
-  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid);
+  random(RNG4,Umu);
+
   LatticeGaugeField Umu5d(FGrid);
 
   // replicate across fifth dimension
@@ -119,14 +132,21 @@ int main (int argc, char ** argv)
 
   RealD NP = UGrid->_Nprocessors;
 
+  for(int doasm=1;doasm<2;doasm++){
+
+  QCD::WilsonKernelsStatic::AsmOpt=doasm;
+
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 
   std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=1000;
-  {
+  int ncall =10;
+  if (1) {
 
     double t0=usecond();
     for(int i=0;i<ncall;i++){
+      __SSC_START;
       Dw.Dhop(src,result,0);
+      __SSC_STOP;
     }
     double t1=usecond();
 
@@ -140,9 +160,121 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result;
     std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-    Dw.Report();
+    // Dw.Report();
   }
 
+  if (1)
+  {
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+    LatticeFermionF ssrc(sFGrid);
+    LatticeFermionF sref(sFGrid);
+    LatticeFermionF sresult(sFGrid);
+    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVectorF tmp;
+      peekSite(tmp,src,site);
+      pokeSite(tmp,ssrc,site);
+    }}}}}
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      sDw.Dhop(ssrc,sresult,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
+    // sDw.Report();
+
+    if(0){
+      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+        sDw.Dhop(ssrc,sresult,0);
+        PerformanceCounter Counter(i);
+        Counter.Start();
+        sDw.Dhop(ssrc,sresult,0);
+        Counter.Stop();
+        Counter.Report();
+      }
+    }
+
+
+
+    RealF sum=0;
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVectorF normal, simd;
+      peekSite(normal,result,site);
+      peekSite(simd,sresult,site);
+      sum=sum+norm2(normal-simd);
+      // std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+      // std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
+      // std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
+    }}}}}
+    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
+
+
+    if (1) {
+
+      LatticeFermionF sr_eo(sFGrid);
+      LatticeFermionF serr(sFGrid);
+
+      LatticeFermion ssrc_e (sFrbGrid);
+      LatticeFermion ssrc_o (sFrbGrid);
+      LatticeFermion sr_e   (sFrbGrid);
+      LatticeFermion sr_o   (sFrbGrid);
+
+      pickCheckerboard(Even,ssrc_e,ssrc);
+      pickCheckerboard(Odd,ssrc_o,ssrc);
+
+      setCheckerboard(sr_eo,ssrc_o);
+      setCheckerboard(sr_eo,ssrc_e);
+      serr = sr_eo-ssrc;
+      std::cout<<GridLogMessage << "EO src norm diff "<< norm2(serr)<<std::endl;
+
+      sr_e = zero;
+      sr_o = zero;
+
+      double t0=usecond();
+      for(int i=0;i<ncall;i++){
+        sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      }
+      double t1=usecond();
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2;
+
+      std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
+
+      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
+      sDw.Dhop  (ssrc  ,sresult,DaggerNo);
+
+      pickCheckerboard(Even,ssrc_e,sresult);
+      pickCheckerboard(Odd ,ssrc_o,sresult);
+      ssrc_e = ssrc_e - sr_e;
+      std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<<std::endl;
+      ssrc_o = ssrc_o - sr_o;
+      std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<<std::endl;
+    }
+
+
+  }
 
 if (1)
 { // Naive wilson dag implementation
@@ -197,7 +329,6 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
   std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
   }
-
   Dw.DhopEO(src_o,r_e,DaggerNo);
   Dw.DhopOE(src_e,r_o,DaggerNo);
   Dw.Dhop  (src ,result,DaggerNo);
@@ -217,5 +348,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
 
+
+  }
+
   Grid_finalize();
 }
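The benchmark's performance figures follow from a fixed cost of 1344 floating-point operations per lattice site per Dhop call: flops = 1344 x (Ls x 4d volume) x ncall, and dividing by the elapsed time in microseconds gives Mflop/s directly. Below is a minimal standalone sketch of that arithmetic; the lattice extents, Ls, and timings are illustrative values, not taken from this commit.

```cpp
// Standalone sketch (not part of the Grid sources) of the mflop/s arithmetic
// used in the benchmark above. All numbers here are hypothetical.
#include <cstdio>

int main () {
  const double flops_per_site = 1344.0;   // Dhop flop count assumed by the benchmark
  int latt[4] = {16,16,16,16};            // hypothetical local 4d lattice
  int Ls      = 16;                       // fifth-dimension extent
  int ncall   = 10;                       // number of timed Dhop calls
  double t0_us = 0.0, t1_us = 250000.0;   // pretend usecond() readings

  double volume = Ls;
  for(int mu=0; mu<4; mu++) volume *= latt[mu];

  double flops = flops_per_site * volume * ncall;
  // flops divided by microseconds is Mflop/s directly (the factor 1e6 cancels)
  std::printf("mflop/s = %f\n", flops/(t1_us-t0_us));
  return 0;
}
```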
benchmarks/Benchmark_dwf_ntpf.cc (new file): 154 lines

@@ -0,0 +1,154 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./benchmarks/Benchmark_dwf.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+bool overlapComms = false;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
+    overlapComms = true;
+  }
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> latt4 = GridDefaultLatt();
+  const int Ls=16;
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+
+  LatticeFermion src   (FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid);    ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+
+  ColourMatrix cm = Complex(1.0,0.0);
+
+  LatticeGaugeField Umu(UGrid);
+  random(RNG4,Umu);
+
+  LatticeGaugeField Umu5d(FGrid);
+
+  // replicate across fifth dimension
+  for(int ss=0;ss<Umu._grid->oSites();ss++){
+    for(int s=0;s<Ls;s++){
+      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
+    }
+  }
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<LatticeColourMatrix> U(4,FGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+  }
+
+  if (1)
+  {
+    ref = zero;
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  typename DomainWallFermionR::ImplParams params;
+  params.overlapCommsCompute = overlapComms;
+
+  RealD NP = UGrid->_Nprocessors;
+
+
+  QCD::WilsonKernelsStatic::AsmOpt=1;
+
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
+
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall =50;
+  if (1) {
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    double t1=usecond();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    err = ref-result;
+    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    // Dw.Report();
+  }
+  Grid_finalize();
+}
benchmarks/Benchmark_zmm.cc (new file): 172 lines

@@ -0,0 +1,172 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/Test_zmm.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
+
+int main(int argc,char **argv)
+{
+  Grid_init(&argc,&argv);
+  std::ofstream os("zmm.dat");
+
+  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
+  for(int L=4;L<=32;L+=4){
+    for(int m=1;m<=2;m++){
+      for(int Ls=8;Ls<=16;Ls+=8){
+        std::vector<int> grid({L,L,m*L,m*L});
+        for(int i=0;i<4;i++) {
+          std::cout << grid[i]<<"x";
+        }
+        std::cout << Ls<<std::endl;
+        bench(os,grid,Ls);
+      }
+    }
+  }
+}
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
+{
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  int threads = GridThread::GetThreads();
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
+
+  LatticeFermion src (FGrid);
+  LatticeFermion tmp (FGrid);
+  LatticeFermion srce(FrbGrid);
+
+  LatticeFermion resulto(FrbGrid); resulto=zero;
+  LatticeFermion resulta(FrbGrid); resulta=zero;
+  LatticeFermion junk(FrbGrid); junk=zero;
+  LatticeFermion diff(FrbGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  double mfc, mfa, mfo, mfl1;
+
+  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  random(RNG5,src);
+#if 1
+  random(RNG4,Umu);
+#else
+  int mmu=2;
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    if ( mu!=mmu ) U[mu] = zero;
+    if ( mu==mmu ) U[mu] = 1.0;
+    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+  }
+#endif
+  pickCheckerboard(Even,srce,src);
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall=50;
+  double t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulto,0);
+  }
+  double t1=usecond();
+
+  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+  double flops=1344*volume/2;
+
+  mfc = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+
+  QCD::WilsonKernelsStatic::AsmOpt=1;
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulta,0);
+  }
+  t1=usecond();
+  mfa = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+  /*
+  int dag=DaggerNo;
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
+  }
+  t1=usecond();
+  mfo = flops*100/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
+
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
+  }
+  t1=usecond();
+  mfl1= flops*100/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
+  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
+     << mfc<<" "
+     << mfa<<" "
+     << mfo<<" "
+     << mfl1<<std::endl;
+  */
+
+#if 0
+  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+    Dw.DhopOE(srce,resulta,0);
+    PerformanceCounter Counter(i);
+    Counter.Start();
+    Dw.DhopOE(srce,resulta,0);
+    Counter.Stop();
+    Counter.Report();
+  }
+#endif
+  //resulta = (-0.5) * resulta;
+
+  diff = resulto-resulta;
+  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
+  std::cout<<std::endl;
+  return 0;
+}
benchmarks/Makefile.am

@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -10,6 +10,10 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid
 
 
+Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
+Benchmark_dwf_ntpf_LDADD=-lGrid
+
+
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 
@@ -25,3 +29,7 @@ Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid
 
+
+Benchmark_zmm_SOURCES=Benchmark_zmm.cc
+Benchmark_zmm_LDADD=-lGrid
+
configure (vendored): 109 changed lines

@@ -626,12 +626,18 @@ ac_subst_vars='am__EXEEXT_FALSE
 am__EXEEXT_TRUE
 LTLIBOBJS
 LIBOBJS
+USE_LAPACK_LIB_FALSE
+USE_LAPACK_LIB_TRUE
+USE_LAPACK_FALSE
+USE_LAPACK_TRUE
 BUILD_CHROMA_REGRESSION_FALSE
 BUILD_CHROMA_REGRESSION_TRUE
 BUILD_COMMS_NONE_FALSE
 BUILD_COMMS_NONE_TRUE
 BUILD_COMMS_MPI_FALSE
 BUILD_COMMS_MPI_TRUE
+BUILD_COMMS_SHMEM_FALSE
+BUILD_COMMS_SHMEM_TRUE
 BUILD_ZMM_FALSE
 BUILD_ZMM_TRUE
 EGREP
@@ -751,7 +757,9 @@ enable_simd
 enable_precision
 enable_comms
 enable_rng
+enable_timers
 enable_chroma
+enable_lapack
 '
 ac_precious_vars='build_alias
 host_alias
@@ -1410,7 +1418,9 @@ Optional Features:
   --enable-comms=none|mpi Select communications
   --enable-rng=ranlux48|mt19937
                           Select Random Number Generator to be used
+  --enable-timers=yes|no  Enable system dependent high res timers
   --enable-chroma         Expect chroma compiled under c++11
+  --enable-lapack         Enable lapack yes/no
 
 Some influential environment variables:
   CXX         C++ compiler command
@@ -6410,7 +6420,7 @@ if test "${enable_simd+set}" = set; then :
   enableval=$enable_simd; \
      ac_SIMD=${enable_simd}
 else
-  ac_SIMD=AVX2
+  ac_SIMD=DEBUG
 fi
 
 
@@ -6477,7 +6487,7 @@ $as_echo "#define AVX512 1" >>confdefs.h
 $as_echo "#define IMCI 1" >>confdefs.h
 
      supported="cross compilation"
-     ac_ZMM=yes;
+     ac_ZMM=no;
      ;;
   NEONv8)
      echo Configuring for experimental ARMv8a support
@@ -6561,12 +6571,26 @@ $as_echo "#define GRID_COMMS_NONE 1" >>confdefs.h
 
 $as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
 
+     ;;
+  shmem)
+     echo Configuring for SHMEM communications
+
+$as_echo "#define GRID_COMMS_SHMEM 1" >>confdefs.h
+
      ;;
   *)
      as_fn_error $? "${ac_COMMS} unsupported --enable-comms option" "$LINENO" 5;
      ;;
 esac
 
+ if test "X${ac_COMMS}X" == "XshmemX" ; then
+  BUILD_COMMS_SHMEM_TRUE=
+  BUILD_COMMS_SHMEM_FALSE='#'
+else
+  BUILD_COMMS_SHMEM_TRUE='#'
+  BUILD_COMMS_SHMEM_FALSE=
+fi
+
  if test "X${ac_COMMS}X" == "XmpiX" ; then
   BUILD_COMMS_MPI_TRUE=
   BUILD_COMMS_MPI_FALSE='#'
@@ -6610,6 +6634,34 @@ $as_echo "#define RNG_MT19937 1" >>confdefs.h
      as_fn_error $? "${ac_RNG} unsupported --enable-rng option" "$LINENO" 5;
      ;;
 esac
+
+#
+# SDE timing mode
+#
+# Check whether --enable-timers was given.
+if test "${enable_timers+set}" = set; then :
+  enableval=$enable_timers; \
+  ac_TIMERS=${enable_timers}
+else
+  ac_TIMERS=yes
+fi
+
+case ${ac_TIMERS} in
+  yes)
+
+$as_echo "#define TIMERS_ON 1" >>confdefs.h
+
+  ;;
+  no)
+
+$as_echo "#define TIMERS_OFF 1" >>confdefs.h
+
+  ;;
+  *)
+     as_fn_error $? "${ac_TIMERS} unsupported --enable-timers option" "$LINENO" 5;
+  ;;
+esac
+
 #
 # Chroma regression tests
 #
@@ -6642,6 +6694,46 @@ else
 fi
 
 
+#
+# Lapack
+#
+# Check whether --enable-lapack was given.
+if test "${enable_lapack+set}" = set; then :
+  enableval=$enable_lapack; ac_LAPACK=${enable_lapack}
+else
+  ac_LAPACK=no
+fi
+
+
+case ${ac_LAPACK} in
+  yes)
+    echo Enabling lapack
+  ;;
+  no)
+    echo Disabling lapack
+  ;;
+  *)
+    echo Enabling lapack at ${ac_LAPACK}
+  ;;
+esac
+
+ if test "X${ac_LAPACK}X" != "XnoX" ; then
+  USE_LAPACK_TRUE=
+  USE_LAPACK_FALSE='#'
+else
+  USE_LAPACK_TRUE='#'
+  USE_LAPACK_FALSE=
+fi
+
+ if test "X${ac_LAPACK}X" != "XyesX" ; then
+  USE_LAPACK_LIB_TRUE=
+  USE_LAPACK_LIB_FALSE='#'
+else
+  USE_LAPACK_LIB_TRUE='#'
+  USE_LAPACK_LIB_FALSE=
+fi
+
+
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
@@ -6809,6 +6901,10 @@ if test -z "${BUILD_ZMM_TRUE}" && test -z "${BUILD_ZMM_FALSE}"; then
   as_fn_error $? "conditional \"BUILD_ZMM\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${BUILD_COMMS_SHMEM_TRUE}" && test -z "${BUILD_COMMS_SHMEM_FALSE}"; then
+  as_fn_error $? "conditional \"BUILD_COMMS_SHMEM\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
   as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -6821,6 +6917,14 @@ if test -z "${BUILD_CHROMA_REGRESSION_TRUE}" && test -z "${BUILD_CHROMA_REGRESSI
   as_fn_error $? "conditional \"BUILD_CHROMA_REGRESSION\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${USE_LAPACK_TRUE}" && test -z "${USE_LAPACK_FALSE}"; then
+  as_fn_error $? "conditional \"USE_LAPACK\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${USE_LAPACK_LIB_TRUE}" && test -z "${USE_LAPACK_LIB_FALSE}"; then
+  as_fn_error $? "conditional \"USE_LAPACK_LIB\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
@@ -8167,6 +8271,7 @@ The following features are enabled:
 - communications type            : ${ac_COMMS}
 - default precision              : ${ac_PRECISION}
 - RNG choice                     : ${ac_RNG}
+- LAPACK                         : ${ac_LAPACK}
 
 
 "
configure.ac: 49 changed lines

@@ -71,7 +71,7 @@ AC_CHECK_FUNCS([gettimeofday])
 
 AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
             [Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
-            [ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
+            [ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
 
 supported=no
 
@@ -124,7 +124,7 @@ case ${ac_SIMD} in
      echo Configuring for IMCI
      AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
      supported="cross compilation"
-     ac_ZMM=yes;
+     ac_ZMM=no;
      ;;
   NEONv8)
      echo Configuring for experimental ARMv8a support
@@ -178,11 +178,16 @@ case ${ac_COMMS} in
      echo Configuring for MPI communications
      AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
      ;;
+  shmem)
+     echo Configuring for SHMEM communications
+     AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
+     ;;
   *)
      AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
      ;;
 esac
 
+AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 
@@ -203,6 +208,25 @@ case ${ac_RNG} in
      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
      ;;
 esac
+
+#
+# SDE timing mode
+#
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
+            [Enable system dependent high res timers])],\
+            [ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
+case ${ac_TIMERS} in
+  yes)
+     AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+  ;;
+  no)
+     AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+  ;;
+  *)
+     AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
+  ;;
+esac
+
 #
 # Chroma regression tests
 #
@@ -222,6 +246,26 @@ esac
 
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
 
+#
+# Lapack
+#
+AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+
+case ${ac_LAPACK} in
+  yes)
+    echo Enabling lapack
+  ;;
+  no)
+    echo Disabling lapack
+  ;;
+  *)
+    echo Enabling lapack at ${ac_LAPACK}
+  ;;
+esac
+
+AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
+AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
+
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
@@ -265,6 +309,7 @@ The following features are enabled:
 - communications type            : ${ac_COMMS}
 - default precision              : ${ac_PRECISION}
 - RNG choice                     : ${ac_RNG}
+- LAPACK                         : ${ac_LAPACK}
 
 
 "
lib/AlignedAllocator.h

@@ -36,11 +36,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <malloc.h>
 #endif
 
-#include <immintrin.h>
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
 #endif
 
+#ifdef GRID_COMMS_SHMEM
+extern "C" {
+#include <mpp/shmem.h>
+extern void * shmem_align(size_t, size_t);
+extern void shmem_free(void *);
+}
+#endif
+
 namespace Grid {
 
 ////////////////////////////////////////////////////////////////////
@@ -72,21 +79,59 @@ public:
 
   size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 
-  pointer allocate(size_type __n, const void* = 0)
+  pointer allocate(size_type __n, const void* _p= 0)
   {
+#ifdef GRID_COMMS_SHMEM
+
+    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+
+
+#define PARANOID_SYMMETRIC_HEAP
+#ifdef PARANOID_SYMMETRIC_HEAP
+    static void * bcast;
+    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+    bcast = (void *) ptr;
+    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
+
+    if ( bcast != ptr ) {
+      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
+      BACKTRACEFILE();
+      exit(0);
+    }
+
+    assert( bcast == (void *) ptr);
+
+#endif
+#else
+
 #ifdef HAVE_MM_MALLOC_H
     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
 
+#endif
+    _Tp tmp;
+#undef FIRST_TOUCH_OPTIMISE
+#ifdef FIRST_TOUCH_OPTIMISE
+#pragma omp parallel for
+    for(int i=0;i<__n;i++){
+      ptr[i]=tmp;
+    }
+#endif
     return ptr;
   }
 
   void deallocate(pointer __p, size_type) {
+#ifdef GRID_COMMS_SHMEM
+    shmem_free((void *)__p);
+#else
 #ifdef HAVE_MM_MALLOC_H
     _mm_free((void *)__p);
 #else
     free((void *)__p);
+#endif
 #endif
   }
   void construct(pointer __p, const _Tp& __val) { };
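These hunks keep the same STL-allocator shape while switching the underlying allocation between shmem_align on the symmetric heap (under GRID_COMMS_SHMEM) and _mm_malloc/memalign. The sketch below, which is not Grid's code, shows how an allocator of that shape plugs into a standard container; plain POSIX posix_memalign stands in here for the back ends above.

```cpp
// Minimal aligned-allocator sketch (assumption: POSIX posix_memalign is available).
#include <cstdint>
#include <cstdlib>
#include <new>
#include <vector>

template<typename T>
struct Aligned128Allocator {
  typedef T value_type;
  Aligned128Allocator() = default;
  template<typename U> Aligned128Allocator(const Aligned128Allocator<U>&) {}

  T* allocate(std::size_t n) {
    void *ptr = nullptr;
    // 128-byte alignment, matching the alignment requested in the hunk above
    if (posix_memalign(&ptr, 128, n*sizeof(T)) != 0) throw std::bad_alloc();
    return static_cast<T*>(ptr);
  }
  void deallocate(T* p, std::size_t) { std::free(p); }
};

template<typename T, typename U>
bool operator==(const Aligned128Allocator<T>&, const Aligned128Allocator<U>&) { return true; }
template<typename T, typename U>
bool operator!=(const Aligned128Allocator<T>&, const Aligned128Allocator<U>&) { return false; }

int main() {
  // A container built on the allocator gets 128-byte aligned storage,
  // which wide SIMD loads/stores in lattice code rely on.
  std::vector<double, Aligned128Allocator<double>> field(1024, 0.0);
  return (reinterpret_cast<std::uintptr_t>(field.data()) % 128) == 0 ? 0 : 1;
}
```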
lib/Config.h.in (deleted): 180 lines removed

@@ -1,180 +0,0 @@
-/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* AVX Intrinsics */
-#undef AVX1
-
-/* AVX2 Intrinsics */
-#undef AVX2
-
-/* AVX512 Intrinsics for Knights Landing */
-#undef AVX512
-
-/* AVX Intrinsics with FMA4 */
-#undef AVXFMA4
-
-/* EMPTY_SIMD only for DEBUGGING */
-#undef EMPTY_SIMD
-
-/* GRID_COMMS_MPI */
-#undef GRID_COMMS_MPI
-
-/* GRID_COMMS_NONE */
-#undef GRID_COMMS_NONE
-
-/* GRID_DEFAULT_PRECISION is DOUBLE */
-#undef GRID_DEFAULT_PRECISION_DOUBLE
-
-/* GRID_DEFAULT_PRECISION is SINGLE */
-#undef GRID_DEFAULT_PRECISION_SINGLE
-
-/* Support Altivec instructions */
-#undef HAVE_ALTIVEC
-
-/* Support AVX (Advanced Vector Extensions) instructions */
-#undef HAVE_AVX
-
-/* Support AVX2 (Advanced Vector Extensions 2) instructions */
-#undef HAVE_AVX2
-
-/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
-   don't. */
-#undef HAVE_DECL_BE64TOH
-
-/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
-   */
-#undef HAVE_DECL_NTOHLL
-
-/* Define to 1 if you have the <endian.h> header file. */
-#undef HAVE_ENDIAN_H
-
-/* Define to 1 if you have the <execinfo.h> header file. */
-#undef HAVE_EXECINFO_H
-
-/* Support FMA3 (Fused Multiply-Add) instructions */
-#undef HAVE_FMA
-
-/* Define to 1 if you have the `gettimeofday' function. */
-#undef HAVE_GETTIMEOFDAY
-
-/* Define to 1 if you have the <gmp.h> header file. */
-#undef HAVE_GMP_H
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#undef HAVE_INTTYPES_H
-
-/* Define to 1 if you have the <malloc.h> header file. */
-#undef HAVE_MALLOC_H
-
-/* Define to 1 if you have the <malloc/malloc.h> header file. */
-#undef HAVE_MALLOC_MALLOC_H
-
-/* Define to 1 if you have the <memory.h> header file. */
-#undef HAVE_MEMORY_H
-
-/* Support mmx instructions */
-#undef HAVE_MMX
-
-/* Define to 1 if you have the <mm_malloc.h> header file. */
-#undef HAVE_MM_MALLOC_H
-
-/* Support SSE (Streaming SIMD Extensions) instructions */
-#undef HAVE_SSE
-
-/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
-#undef HAVE_SSE2
-
-/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
-#undef HAVE_SSE3
-
-/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
-#undef HAVE_SSE4_1
-
-/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
-#undef HAVE_SSE4_2
-
-/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
-#undef HAVE_SSSE3
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#undef HAVE_STDINT_H
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#undef HAVE_STDLIB_H
-
-/* Define to 1 if you have the <strings.h> header file. */
-#undef HAVE_STRINGS_H
-
-/* Define to 1 if you have the <string.h> header file. */
-#undef HAVE_STRING_H
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#undef HAVE_SYS_STAT_H
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#undef HAVE_SYS_TYPES_H
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#undef HAVE_UNISTD_H
-
-/* IMCI Intrinsics for Knights Corner */
-#undef IMCI
-
-/* NEON ARMv8 Experimental support */
-#undef NEONv8
-
-/* Name of package */
-#undef PACKAGE
-
-/* Define to the address where bug reports for this package should be sent. */
-#undef PACKAGE_BUGREPORT
-
-/* Define to the full name of this package. */
-#undef PACKAGE_NAME
-
-/* Define to the full name and version of this package. */
-#undef PACKAGE_STRING
-
-/* Define to the one symbol short name of this package. */
-#undef PACKAGE_TARNAME
-
-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
-/* Define to the version of this package. */
-#undef PACKAGE_VERSION
-
-/* RNG_MT19937 */
-#undef RNG_MT19937
-
-/* RNG_RANLUX */
-#undef RNG_RANLUX
-
-/* SSE4 Intrinsics */
-#undef SSE4
-
-/* Define to 1 if you have the ANSI C header files. */
-#undef STDC_HEADERS
-
-/* Version number of package */
-#undef VERSION
-
-/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-#undef _UINT32_T
-
-/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-#undef _UINT64_T
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-#undef size_t
-
-/* Define to the type of an unsigned integer type of width exactly 32 bits if
-   such a type exists and the standard includes do not define it. */
-#undef uint32_t
-
-/* Define to the type of an unsigned integer type of width exactly 64 bits if
-   such a type exists and the standard includes do not define it. */
-#undef uint64_t
lib/Cshift.h

@@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI
 #include <cshift/Cshift_mpi.h>
 #endif
+
+#ifdef GRID_COMMS_SHMEM
+#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#endif
 #endif
lib/Grid.h

@@ -62,10 +62,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <serialisation/Serialisation.h>
 #include <Config.h>
 #include <Timer.h>
+#include <PerfCount.h>
 #include <Log.h>
 #include <AlignedAllocator.h>
 #include <Simd.h>
 #include <Threads.h>
+#include <Lexicographic.h>
 #include <Communicator.h>
 #include <Cartesian.h>
 #include <Tensors.h>
47
lib/Init.cc
47
lib/Init.cc
@ -45,12 +45,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
|
||||||
#define __X86_64
|
|
||||||
|
|
||||||
#ifdef HAVE_EXECINFO_H
|
|
||||||
#include <execinfo.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -150,6 +144,10 @@ void GridParseLayout(char **argv,int argc,
|
|||||||
}
|
}
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
|
if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
|
||||||
std::vector<int> ompthreads(0);
|
std::vector<int> ompthreads(0);
|
||||||
|
#ifndef GRID_OMP
|
||||||
|
std::cout << GridLogWarning << "'--threads' option used but Grid was"
|
||||||
|
<< " not compiled with thread support" << std::endl;
|
||||||
|
#endif
|
||||||
arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
|
arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
|
||||||
GridCmdOptionIntVector(arg,ompthreads);
|
GridCmdOptionIntVector(arg,ompthreads);
|
||||||
assert(ompthreads.size()==1);
|
assert(ompthreads.size()==1);
|
||||||
@ -174,9 +172,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
|
|||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
void Grid_init(int *argc,char ***argv)
|
void Grid_init(int *argc,char ***argv)
|
||||||
{
|
{
|
-#ifdef GRID_COMMS_MPI
-  MPI_Init(argc,argv);
-#endif
+  CartesianCommunicator::Init(argc,argv);
   // Parse command line args.

   GridLogger::StopWatch.Start();
@@ -194,9 +191,10 @@ void Grid_init(int *argc,char ***argv)
   std::cout<<GridLogMessage<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
   std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
   std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
-  std::cout<<GridLogMessage<<"--omp n : default number of OMP threads"<<std::endl;
+  std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
   std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
-  std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
+  std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
+  exit(EXIT_SUCCESS);
   }

   if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
@@ -213,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
   Grid_quiesce_nodes();
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-  QCD::WilsonFermionStatic::HandOptDslash=1;
-  QCD::WilsonFermion5DStatic::HandOptDslash=1;
+  QCD::WilsonKernelsStatic::HandOpt=1;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
   LebesgueOrder::UseLebesgueOrder=1;
@@ -287,13 +284,7 @@ void Grid_finalize(void)
   Grid_unquiesce_nodes();
 #endif
 }
-double usecond(void) {
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
-}

-#define _NBACKTRACE (256)
 void * Grid_backtrace_buffer[_NBACKTRACE];

 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
@@ -305,11 +296,11 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
   // Linux/Posix
 #ifdef __linux__
   // And x86 64bit
+#ifdef __x86_64__
   ucontext_t * uc= (ucontext_t *)ptr;
   struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
   printf(" instruction %llx\n",(unsigned long long)sc->rip);
 #define REG(A) printf(" %s %lx\n",#A,sc-> A);

   REG(rdi);
   REG(rsi);
   REG(rbp);
@@ -330,17 +321,15 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
   REG(r14);
   REG(r15);
 #endif
-#ifdef HAVE_EXECINFO_H
-  int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
-  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
-  for (int i = 0; i < symbols; i++){
-    printf ("%s\n", strings[i]);
-  }
 #endif
+  BACKTRACE();
   exit(0);
   return;
 };
+#ifdef GRID_FPE
+#define _GNU_SOURCE
+#include <fenv.h>
+#endif
 void Grid_debug_handler_init(void)
 {
   struct sigaction sa,osa;
@@ -349,5 +338,9 @@ void Grid_debug_handler_init(void)
   sa.sa_flags = SA_SIGINFO;
   sigaction(SIGSEGV,&sa,NULL);
   sigaction(SIGTRAP,&sa,NULL);
+#ifdef GRID_FPE
+  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  sigaction(SIGFPE,&sa,NULL);
+#endif
 }
 }
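The new GRID_FPE block arms hardware floating-point exceptions and routes the resulting SIGFPE through the same siginfo-style handler as SIGSEGV/SIGTRAP. A minimal standalone sketch of that pattern, assuming glibc (feenableexcept is a GNU extension); the handler name is illustrative, not Grid's:

#define _GNU_SOURCE
#include <fenv.h>      // feenableexcept (glibc extension)
#include <signal.h>
#include <cstdio>
#include <cstdlib>

static void fpe_handler(int sig, siginfo_t *si, void *ctx) {
  std::printf("Caught SIGFPE (si_code=%d) at %p\n", si->si_code, si->si_addr);
  std::exit(EXIT_FAILURE);
}

int main(void) {
  struct sigaction sa = {};
  sa.sa_sigaction = fpe_handler;      // three-argument handler
  sa.sa_flags     = SA_SIGINFO;
  sigaction(SIGFPE, &sa, NULL);

  feenableexcept(FE_INVALID | FE_OVERFLOW | FE_DIVBYZERO);

  volatile double zero = 0.0;
  double x = 1.0 / zero;              // raises FE_DIVBYZERO -> SIGFPE
  std::printf("%f\n", x);             // not reached
  return 0;
}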
32  lib/Lexicographic.h  (new file)
@@ -0,0 +1,32 @@
+#ifndef GRID_LEXICOGRAPHIC_H
+#define GRID_LEXICOGRAPHIC_H
+
+
+namespace Grid{
+
+  class Lexicographic {
+  public:
+
+    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
+      int nd= dims.size();
+      coor.resize(nd);
+      for(int d=0;d<nd;d++){
+        coor[d] = index % dims[d];
+        index   = index / dims[d];
+      }
+    }
+
+    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
+      int nd=dims.size();
+      int stride=1;
+      index=0;
+      for(int d=0;d<nd;d++){
+        index = index+stride*coor[d];
+        stride=stride*dims[d];
+      }
+    }
+
+  };
+
+}
+#endif
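The new helper converts between a flat lexicographic site index and a per-dimension coordinate, with the first dimension running fastest. A small round-trip sketch (standalone, hypothetical dims; not part of the commit):

#include <cassert>
#include <vector>
// assumes lib/Lexicographic.h above is on the include path

int main(void) {
  std::vector<int> dims = {4, 4, 4, 8};     // hypothetical local lattice
  int index = 135;

  std::vector<int> coor;
  Grid::Lexicographic::CoorFromIndex(coor, index, dims);  // coor[0] varies fastest

  int back = 0;
  Grid::Lexicographic::IndexFromCoor(coor, back, dims);
  assert(back == index);                    // the two maps are inverses
  return 0;
}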
@@ -73,13 +73,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void)
 {
+  int me=0;
 #ifdef GRID_COMMS_MPI
-  int me;
   MPI_Comm_rank(MPI_COMM_WORLD,&me);
+#endif
+#ifdef GRID_COMMS_SHMEM
+  me = shmem_my_pe();
+#endif
   if ( me ) {
     std::cout.setstate(std::ios::badbit);
   }
-#endif
 }

 void Grid_unquiesce_nodes(void)
39  lib/Log.h
@@ -32,9 +32,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #ifndef GRID_LOG_H
 #define GRID_LOG_H

+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif

 namespace Grid {

 // Dress the output; use std::chrono for time stamping via the StopWatch class
+int Rank(void); // used for early stage debug before library init
+
 class Colours{
@@ -48,7 +54,6 @@ namespace Grid {
   Active(activate);
 };

-
 void Active(bool activate){
   is_active=activate;

@@ -140,5 +145,37 @@ void GridLogConfigure(std::vector<std::string> &logstreams);
 extern Colours GridLogColours;

+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+
+#define BACKTRACEFILE() {\
+  char string[20]; \
+  std::sprintf(string,"backtrace.%d",Rank()); \
+  std::FILE * fp = std::fopen(string,"w"); \
+  BACKTRACEFP(fp)\
+  std::fclose(fp); \
+}
+
+#ifdef HAVE_EXECINFO_H
+#define BACKTRACEFP(fp) { \
+  int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\
+  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+  for (int i = 0; i < symbols; i++){\
+    std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+  }\
+}
+#else
+#define BACKTRACEFP(fp) { \
+  std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
+  std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
+  std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
+  std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
+}
+#endif
+
+#define BACKTRACE() BACKTRACEFP(stdout)
+
 }
 #endif
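BACKTRACEFP wraps the glibc execinfo pattern: capture raw return addresses with backtrace(), then turn them into printable strings with backtrace_symbols(). A standalone sketch of that underlying API (not Grid code; linking with -rdynamic is usually needed for useful symbol names):

#include <execinfo.h>  // backtrace, backtrace_symbols
#include <cstdio>
#include <cstdlib>

void print_backtrace(void) {
  void  *buffer[256];
  int    n       = backtrace(buffer, 256);           // capture the call stack
  char **strings = backtrace_symbols(buffer, n);     // printable frame descriptions
  for (int i = 0; i < n; i++) std::printf("%d %s\n", i, strings[i]);
  std::free(strings);                                // caller owns the array
}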
File diff suppressed because one or more lines are too long
@@ -6,6 +6,10 @@ if BUILD_COMMS_MPI
 extra_sources+=communicator/Communicator_mpi.cc
 endif

+if BUILD_COMMS_SHMEM
+extra_sources+=communicator/Communicator_shmem.cc
+endif
+
 if BUILD_COMMS_NONE
 extra_sources+=communicator/Communicator_none.cc
 endif

BIN  lib/Old/Endeavour.tgz  (new file; binary file not shown)
@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {

 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
+#define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 #ifdef __linux__
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS.."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS......."},
-  // { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS...."}
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." , INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
+  // 4
+#ifdef AVX512
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
+  { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  // 11
+#else
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
+  // 11
+#endif
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
+  //15
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
+  //19
   // { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
+#endif
 };
 }
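Each counter row now carries a trailing normalisation counter, so a report can print a rate (for example misses per access) rather than a bare count. A minimal standalone sketch of the underlying Linux perf_event_open / ioctl / read pattern this table feeds, assuming Linux and omitting error handling; it is an illustration, not Grid's PerformanceCounter class:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cstring>
#include <cstdio>

static long perf_event_open(perf_event_attr *attr, pid_t pid, int cpu, int group, unsigned long flags) {
  return syscall(__NR_perf_event_open, attr, pid, cpu, group, flags);
}

int main(void) {
  perf_event_attr pe; std::memset(&pe, 0, sizeof(pe));
  pe.size = sizeof(pe); pe.type = PERF_TYPE_HARDWARE; pe.disabled = 1;

  pe.config = PERF_COUNT_HW_CACHE_MISSES;
  int fd     = perf_event_open(&pe, 0, -1, -1, 0);   // this process, any cpu
  pe.config = PERF_COUNT_HW_CACHE_REFERENCES;        // the "normalisation" counter
  int normfd = perf_event_open(&pe, 0, -1, -1, 0);

  ioctl(fd, PERF_EVENT_IOC_RESET, 0);  ioctl(normfd, PERF_EVENT_IOC_RESET, 0);
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); ioctl(normfd, PERF_EVENT_IOC_ENABLE, 0);

  volatile double s = 0; for (int i = 0; i < 1000000; i++) s += i * 1e-6;  // timed work

  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ioctl(normfd, PERF_EVENT_IOC_DISABLE, 0);
  long long misses = 0, refs = 0;
  read(fd, &misses, sizeof(misses));    read(normfd, &refs, sizeof(refs));
  std::printf("misses %lld refs %lld rate %f\n", misses, refs, refs ? (double)misses/refs : 0.0);
  close(fd); close(normfd);
  return 0;
}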
112
lib/PerfCount.h
112
lib/PerfCount.h
@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
#include <sys/ioctl.h>
|
#include <sys/ioctl.h>
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
@ -43,8 +43,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#else
|
#else
|
||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
#endif
|
#endif
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
||||||
@ -58,6 +58,49 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMERS_OFF
|
||||||
|
|
||||||
|
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
|
||||||
|
#define __SSC_STOP __SSC_MARK(0x110)
|
||||||
|
#define __SSC_START __SSC_MARK(0x111)
|
||||||
|
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define __SSC_MARK(mark)
|
||||||
|
#define __SSC_STOP
|
||||||
|
#define __SSC_START
|
||||||
|
|
||||||
|
/*
|
||||||
|
* cycle counters arch dependent
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef __bgq__
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
uint64_t tmp;
|
||||||
|
asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) );
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
#elif defined __x86_64__
|
||||||
|
#include <x86intrin.h>
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
return __rdtsc();
|
||||||
|
// unsigned int dummy;
|
||||||
|
// return __rdtscp(&dummy);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
class PerformanceCounter {
|
class PerformanceCounter {
|
||||||
private:
|
private:
|
||||||
@ -67,6 +110,7 @@ private:
|
|||||||
uint32_t type;
|
uint32_t type;
|
||||||
uint64_t config;
|
uint64_t config;
|
||||||
const char *name;
|
const char *name;
|
||||||
|
int normalisation;
|
||||||
} PerformanceCounterConfig;
|
} PerformanceCounterConfig;
|
||||||
|
|
||||||
static const PerformanceCounterConfig PerformanceCounterConfigs [];
|
static const PerformanceCounterConfig PerformanceCounterConfigs [];
|
||||||
@ -74,26 +118,12 @@ private:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
enum PerformanceCounterType {
|
enum PerformanceCounterType {
|
||||||
CPUCYCLES=0,
|
CACHE_REFERENCES=0,
|
||||||
INSTRUCTIONS,
|
CACHE_MISSES=1,
|
||||||
// STALL_CYCLES,
|
CPUCYCLES=2,
|
||||||
CACHE_REFERENCES,
|
INSTRUCTIONS=3,
|
||||||
CACHE_MISSES,
|
L1D_READ_ACCESS=4,
|
||||||
L1D_READ_MISS,
|
PERFORMANCE_COUNTER_NUM_TYPES=19
|
||||||
L1D_READ_ACCESS,
|
|
||||||
L1D_WRITE_MISS,
|
|
||||||
L1D_WRITE_ACCESS,
|
|
||||||
L1D_PREFETCH_MISS,
|
|
||||||
L1D_PREFETCH_ACCESS,
|
|
||||||
LL_READ_MISS,
|
|
||||||
// LL_READ_ACCESS,
|
|
||||||
LL_WRITE_MISS,
|
|
||||||
LL_WRITE_ACCESS,
|
|
||||||
LL_PREFETCH_MISS,
|
|
||||||
LL_PREFETCH_ACCESS,
|
|
||||||
L1I_READ_MISS,
|
|
||||||
L1I_READ_ACCESS,
|
|
||||||
PERFORMANCE_COUNTER_NUM_TYPES
|
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -101,7 +131,9 @@ public:
|
|||||||
int PCT;
|
int PCT;
|
||||||
|
|
||||||
long long count;
|
long long count;
|
||||||
|
long long cycles;
|
||||||
int fd;
|
int fd;
|
||||||
|
int cyclefd;
|
||||||
unsigned long long elapsed;
|
unsigned long long elapsed;
|
||||||
uint64_t begin;
|
uint64_t begin;
|
||||||
|
|
||||||
@ -114,7 +146,9 @@ public:
|
|||||||
assert(_pct>=0);
|
assert(_pct>=0);
|
||||||
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
|
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
|
||||||
fd=-1;
|
fd=-1;
|
||||||
|
cyclefd=-1;
|
||||||
count=0;
|
count=0;
|
||||||
|
cycles=0;
|
||||||
PCT =_pct;
|
PCT =_pct;
|
||||||
Open();
|
Open();
|
||||||
#endif
|
#endif
|
||||||
@ -139,6 +173,15 @@ public:
|
|||||||
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
|
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
|
||||||
perror("Error is");
|
perror("Error is");
|
||||||
}
|
}
|
||||||
|
int norm = PerformanceCounterConfigs[PCT].normalisation;
|
||||||
|
pe.type = PerformanceCounterConfigs[norm].type;
|
||||||
|
pe.config= PerformanceCounterConfigs[norm].config;
|
||||||
|
name = PerformanceCounterConfigs[norm].name;
|
||||||
|
cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
|
||||||
|
if (cyclefd == -1) {
|
||||||
|
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
|
||||||
|
perror("Error is");
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -146,10 +189,12 @@ public:
|
|||||||
{
|
{
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
if ( fd!= -1) {
|
if ( fd!= -1) {
|
||||||
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
||||||
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
||||||
|
::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
|
||||||
|
::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
|
||||||
}
|
}
|
||||||
begin =__rdtsc();
|
begin =cyclecount();
|
||||||
#else
|
#else
|
||||||
begin = 0;
|
begin = 0;
|
||||||
#endif
|
#endif
|
||||||
@ -157,12 +202,15 @@ public:
|
|||||||
|
|
||||||
void Stop(void) {
|
void Stop(void) {
|
||||||
count=0;
|
count=0;
|
||||||
|
cycles=0;
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
if ( fd!= -1) {
|
if ( fd!= -1) {
|
||||||
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
||||||
|
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
|
||||||
::read(fd, &count, sizeof(long long));
|
::read(fd, &count, sizeof(long long));
|
||||||
|
::read(cyclefd, &cycles, sizeof(long long));
|
||||||
}
|
}
|
||||||
elapsed = __rdtsc() - begin;
|
elapsed = cyclecount() - begin;
|
||||||
#else
|
#else
|
||||||
elapsed = 0;
|
elapsed = 0;
|
||||||
#endif
|
#endif
|
||||||
@ -170,16 +218,20 @@ public:
|
|||||||
}
|
}
|
||||||
void Report(void) {
|
void Report(void) {
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
|
int N = PerformanceCounterConfigs[PCT].normalisation;
|
||||||
|
const char * sn = PerformanceCounterConfigs[N].name ;
|
||||||
|
const char * sc = PerformanceCounterConfigs[PCT].name;
|
||||||
|
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
|
||||||
|
sc, count, sc,sn, (double)count/(double)cycles);
|
||||||
#else
|
#else
|
||||||
printf("%llu cycles \n", elapsed );
|
std::printf("%llu cycles \n", elapsed );
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
~PerformanceCounter()
|
~PerformanceCounter()
|
||||||
{
|
{
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
close(fd);
|
::close(fd); ::close(cyclefd);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -42,10 +42,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
+#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
 #define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
 #define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 #define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)

+#define RotateBit (0x100)
+
 namespace Grid {

 typedef uint32_t Integer;
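_MM_SELECT_FOUR_FOUR packs four 2-bit lane selectors into a single immediate byte, the same encoding the Intel intrinsics headers use for shuffle controls. A compile-time check of the arithmetic, assuming nothing beyond the macro definition shown above:

#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))

// Lane order 3,2,1,0 is the identity permutation and encodes as 0xE4,
// which matches _MM_SHUFFLE(3,2,1,0); 0,1,2,3 is the full reversal.
static_assert(_MM_SELECT_FOUR_FOUR(3,2,1,0) == 0xE4, "identity shuffle immediate");
static_assert(_MM_SELECT_FOUR_FOUR(0,1,2,3) == 0x1B, "reversal shuffle immediate");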
218  lib/Stencil.h
@@ -71,13 +71,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {

 struct StencilEntry {
-  int _offset;
-  int _is_local;
-  int _permute;
-  int _around_the_world;
+  uint32_t _offset;
+  uint32_t _byte_offset;
+  uint16_t _is_local;
+  uint16_t _permute;
+  uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
 };

-template<class vobj,class cobj, class compressor>
+template<class vobj,class cobj>
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 public:

@@ -101,7 +102,16 @@ namespace Grid {

 std::vector<Packet> Packets;

+#define SEND_IMMEDIATE
+#define SERIAL_SENDS
+
 void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
+  comms_bytes+=2.0*bytes;
+#ifdef SEND_IMMEDIATE
+  commtime-=usecond();
+  _grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
+  commtime+=usecond();
+#endif
   Packet p;
   p.send_buf = xmit;
   p.recv_buf = rcv;
@@ -111,20 +121,63 @@ namespace Grid {
   p.done = 0;
   comms_bytes+=2.0*bytes;
   Packets.push_back(p);
+
 }

+#ifdef SERIAL_SENDS
 void Communicate(void ) {
   commtime-=usecond();
   for(int i=0;i<Packets.size();i++){
-    _grid->SendToRecvFrom(Packets[i].send_buf,
+#ifndef SEND_IMMEDIATE
+    _grid->SendToRecvFrom(
+                          Packets[i].send_buf,
                           Packets[i].to_rank,
                           Packets[i].recv_buf,
                           Packets[i].from_rank,
                           Packets[i].bytes);
+#endif
     Packets[i].done = 1;
   }
   commtime+=usecond();
 }
+#else
+void Communicate(void ) {
+  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
+  std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
+  commtime-=usecond();
+  const int concurrency=2;
+  for(int i=0;i<Packets.size();i+=concurrency){
+    for(int ii=0;ii<concurrency;ii++){
+      int j = i+ii;
+      if ( j<Packets.size() ) {
+#ifndef SEND_IMMEDIATE
+        _grid->SendToRecvFromBegin(reqs[j],
+                                   Packets[j].send_buf,
+                                   Packets[j].to_rank,
+                                   Packets[j].recv_buf,
+                                   Packets[j].from_rank,
+                                   Packets[j].bytes);
+#endif
+      }
+    }
+    for(int ii=0;ii<concurrency;ii++){
+      int j = i+ii;
+      if ( j<Packets.size() ) {
+#ifndef SEND_IMMEDIATE
+        _grid->SendToRecvFromComplete(reqs[i]);
+#endif
+      }
+    }
+    for(int ii=0;ii<concurrency;ii++){
+      int j = i+ii;
+      if ( j<Packets.size() ) {
+        Packets[j].done = 1;
+      }
+    }
+  }
+  commtime+=usecond();
+}
+#endif

 ///////////////////////////////////////////
 // Simd merge queue for asynch comms
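With SEND_IMMEDIATE defined, each packet is exchanged synchronously inside AddPacket and Communicate() only marks packets done; without it, SERIAL_SENDS selects blocking SendToRecvFrom per packet, while the #else branch issues SendToRecvFromBegin/Complete in windows of two (concurrency=2) so a pair of transfers is in flight at once. A compressed sketch of that begin/complete windowing with a single-rank loopback transport standing in for the Grid communicator (the Packet/Request types and functions here are hypothetical, not Grid's API):

#include <cstring>
#include <vector>

struct Packet  { void *send; void *recv; size_t bytes; bool done = false; };
struct Request { Packet *p; };

Request begin_exchange(Packet &p) { return Request{ &p }; }   // would post non-blocking send+recv
void    complete_exchange(Request &r) {                        // would wait on the requests
  std::memcpy(r.p->recv, r.p->send, r.p->bytes);               // loopback so the sketch runs
}

void communicate(std::vector<Packet> &packets, size_t concurrency = 2) {
  for (size_t i = 0; i < packets.size(); i += concurrency) {
    std::vector<Request> reqs;
    for (size_t j = i; j < i + concurrency && j < packets.size(); j++)
      reqs.push_back(begin_exchange(packets[j]));   // start a small window of transfers
    for (size_t k = 0; k < reqs.size(); k++) {
      complete_exchange(reqs[k]);                   // drain the window before the next one
      reqs[k].p->done = true;
    }
  }
}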
@ -144,25 +197,36 @@ namespace Grid {
|
|||||||
m.rpointers= rpointers;
|
m.rpointers= rpointers;
|
||||||
m.buffer_size = buffer_size;
|
m.buffer_size = buffer_size;
|
||||||
m.packet_id = packet_id;
|
m.packet_id = packet_id;
|
||||||
|
#ifdef SEND_IMMEDIATE
|
||||||
|
mergetime-=usecond();
|
||||||
|
PARALLEL_FOR_LOOP
|
||||||
|
for(int o=0;o<m.buffer_size;o++){
|
||||||
|
merge1(m.mpointer[o],m.rpointers,o);
|
||||||
|
}
|
||||||
|
mergetime+=usecond();
|
||||||
|
#else
|
||||||
Mergers.push_back(m);
|
Mergers.push_back(m);
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommsMerge(void ) {
|
void CommsMerge(void ) {
|
||||||
//PARALLEL_NESTED_LOOP2
|
//PARALLEL_NESTED_LOOP2
|
||||||
for(int i=0;i<Mergers.size();i++){
|
for(int i=0;i<Mergers.size();i++){
|
||||||
|
|
||||||
|
|
||||||
spintime-=usecond();
|
spintime-=usecond();
|
||||||
int packet_id = Mergers[i].packet_id;
|
int packet_id = Mergers[i].packet_id;
|
||||||
while(! Packets[packet_id].done ); // spin for completion
|
while(! Packets[packet_id].done ); // spin for completion
|
||||||
spintime+=usecond();
|
spintime+=usecond();
|
||||||
|
|
||||||
|
#ifndef SEND_IMMEDIATE
|
||||||
mergetime-=usecond();
|
mergetime-=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||||
}
|
}
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -182,8 +246,29 @@ PARALLEL_FOR_LOOP
|
|||||||
std::vector<int> _permute_type;
|
std::vector<int> _permute_type;
|
||||||
|
|
||||||
// npoints x Osites() of these
|
// npoints x Osites() of these
|
||||||
std::vector<std::vector<StencilEntry> > _entries;
|
// Flat vector, change layout for cache friendly.
|
||||||
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
|
Vector<StencilEntry> _entries;
|
||||||
|
|
||||||
|
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
|
||||||
|
|
||||||
|
void PrecomputeByteOffsets(void){
|
||||||
|
for(int i=0;i<_entries.size();i++){
|
||||||
|
if( _entries[i]._is_local ) {
|
||||||
|
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
|
||||||
|
} else {
|
||||||
|
_entries[i]._byte_offset =(uint64_t)&comm_buf[0]+ _entries[i]._offset*sizeof(cobj);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
||||||
|
_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
|
||||||
|
local = _entries[ent]._is_local;
|
||||||
|
perm = _entries[ent]._permute;
|
||||||
|
if (perm) ptype = _permute_type[point];
|
||||||
|
if (local) return base + _entries[ent]._byte_offset;
|
||||||
|
else return _entries[ent]._byte_offset;
|
||||||
|
}
|
||||||
|
|
||||||
// Comms buffers
|
// Comms buffers
|
||||||
std::vector<Vector<scalar_object> > u_simd_send_buf;
|
std::vector<Vector<scalar_object> > u_simd_send_buf;
|
||||||
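The stencil table is now one flat Vector indexed by point + _npoints*osite, so all entries for a given site sit next to each other, and _byte_offset caches the address arithmetic once (relative to a base pointer for local neighbours, absolute into comm_buf for halo neighbours). A minimal sketch of that flattening, independent of Grid's types and written with hypothetical names:

#include <cstdint>
#include <vector>

// Entries for one site are contiguous: the cache-friendly order when a kernel
// visits every stencil point of a site before moving on.
struct Entry { uint32_t offset; uintptr_t byte_offset; uint16_t is_local; uint16_t permute; };

struct FlatStencil {
  int npoints;
  std::vector<Entry> entries;                    // size = npoints * osites

  FlatStencil(int np, int osites) : npoints(np), entries(np * osites) {}

  Entry &at(int point, int osite) { return entries[point + npoints * osite]; }

  // Precompute byte offsets so the inner loop does no index arithmetic.
  void precompute(size_t site_bytes, uintptr_t halo_base) {
    for (Entry &e : entries)
      e.byte_offset = e.is_local ? uintptr_t(e.offset) * site_bytes
                                 : halo_base + uintptr_t(e.offset) * site_bytes;
  }
};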
@ -215,7 +300,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
: _permute_type(npoints), _comm_buf_size(npoints)
|
||||||
{
|
{
|
||||||
#ifdef TIMING_HACK
|
#ifdef TIMING_HACK
|
||||||
gathertime=0;
|
gathertime=0;
|
||||||
@ -237,12 +322,12 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
|
_entries.resize(_npoints* osites);
|
||||||
for(int ii=0;ii<npoints;ii++){
|
for(int ii=0;ii<npoints;ii++){
|
||||||
|
|
||||||
int i = ii; // reverse direction to get SIMD comms done first
|
int i = ii; // reverse direction to get SIMD comms done first
|
||||||
int point = i;
|
int point = i;
|
||||||
|
|
||||||
_entries[i].resize( osites);
|
|
||||||
|
|
||||||
int dimension = directions[i];
|
int dimension = directions[i];
|
||||||
int displacement = distances[i];
|
int displacement = distances[i];
|
||||||
@ -258,6 +343,9 @@ PARALLEL_FOR_LOOP
|
|||||||
int simd_layout = _grid->_simd_layout[dimension];
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
|
int rotate_dim = _grid->_simd_layout[dimension]>2;
|
||||||
|
|
||||||
|
assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
|
||||||
|
|
||||||
int sshift[2];
|
int sshift[2];
|
||||||
|
|
||||||
@ -290,6 +378,8 @@ PARALLEL_FOR_LOOP
|
|||||||
u_send_buf.resize(_unified_buffer_size);
|
u_send_buf.resize(_unified_buffer_size);
|
||||||
comm_buf.resize(_unified_buffer_size);
|
comm_buf.resize(_unified_buffer_size);
|
||||||
|
|
||||||
|
PrecomputeByteOffsets();
|
||||||
|
|
||||||
const int Nsimd = grid->Nsimd();
|
const int Nsimd = grid->Nsimd();
|
||||||
u_simd_send_buf.resize(Nsimd);
|
u_simd_send_buf.resize(Nsimd);
|
||||||
u_simd_recv_buf.resize(Nsimd);
|
u_simd_recv_buf.resize(Nsimd);
|
||||||
@ -305,6 +395,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
int ld = _grid->_ldimensions[dimension];
|
int ld = _grid->_ldimensions[dimension];
|
||||||
int gd = _grid->_gdimensions[dimension];
|
int gd = _grid->_gdimensions[dimension];
|
||||||
|
int ly = _grid->_simd_layout[dimension];
|
||||||
|
|
||||||
// Map to always positive shift modulo global full dimension.
|
// Map to always positive shift modulo global full dimension.
|
||||||
int shift = (shiftpm+fd)%fd;
|
int shift = (shiftpm+fd)%fd;
|
||||||
@ -335,7 +426,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int wrap = sshift/rd;
|
int wrap = sshift/rd;
|
||||||
int num = sshift%rd;
|
int num = sshift%rd;
|
||||||
if ( x< rd-num ) permute_slice=wrap;
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
else permute_slice = 1-wrap;
|
else permute_slice = (wrap+1)%ly;
|
||||||
}
|
}
|
||||||
|
|
||||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||||
@ -355,7 +446,6 @@ PARALLEL_FOR_LOOP
|
|||||||
int simd_layout = _grid->_simd_layout[dimension];
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
// assert(simd_layout==1); // Why?
|
|
||||||
assert(comm_dim==1);
|
assert(comm_dim==1);
|
||||||
int shift = (shiftpm + fd) %fd;
|
int shift = (shiftpm + fd) %fd;
|
||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
@ -440,10 +530,11 @@ PARALLEL_FOR_LOOP
|
|||||||
// Simple block stride gather of SIMD objects
|
// Simple block stride gather of SIMD objects
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
int idx=point+(lo+o+b)*_npoints;
|
||||||
_entries[point][lo+o+b]._is_local=1;
|
_entries[idx]._offset =ro+o+b;
|
||||||
_entries[point][lo+o+b]._permute=permute;
|
_entries[idx]._permute=permute;
|
||||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
_entries[idx]._is_local=1;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
o +=_grid->_slice_stride[dimension];
|
o +=_grid->_slice_stride[dimension];
|
||||||
}
|
}
|
||||||
@ -460,10 +551,11 @@ PARALLEL_FOR_LOOP
|
|||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||||
|
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
int idx = point+(lo+o+b)*_npoints;
|
||||||
_entries[point][lo+o+b]._is_local=1;
|
_entries[idx]._offset =ro+o+b;
|
||||||
_entries[point][lo+o+b]._permute=permute;
|
_entries[idx]._is_local=1;
|
||||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
_entries[idx]._permute=permute;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -486,10 +578,11 @@ PARALLEL_FOR_LOOP
|
|||||||
// Simple block stride gather of SIMD objects
|
// Simple block stride gather of SIMD objects
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
int idx=point+(so+o+b)*_npoints;
|
||||||
_entries[point][so+o+b]._is_local=0;
|
_entries[idx]._offset =offset+(bo++);
|
||||||
_entries[point][so+o+b]._permute=0;
|
_entries[idx]._is_local=0;
|
||||||
_entries[point][so+o+b]._around_the_world=wrap;
|
_entries[idx]._permute=0;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
o +=_grid->_slice_stride[dimension];
|
o +=_grid->_slice_stride[dimension];
|
||||||
}
|
}
|
||||||
@ -505,10 +598,11 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
int idx = point+(so+o+b)*_npoints;
|
||||||
_entries[point][so+o+b]._is_local=0;
|
_entries[idx]._offset =offset+(bo++);
|
||||||
_entries[point][so+o+b]._permute =0;
|
_entries[idx]._is_local=0;
|
||||||
_entries[point][so+o+b]._around_the_world=wrap;
|
_entries[idx]._permute =0;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
o +=_grid->_slice_stride[dimension];
|
o +=_grid->_slice_stride[dimension];
|
||||||
@ -517,19 +611,26 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
|
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
|
{
|
||||||
|
Mergers.resize(0);
|
||||||
|
Packets.resize(0);
|
||||||
|
HaloGather(source,compress);
|
||||||
|
this->Communicate();
|
||||||
|
CommsMerge(); // spins
|
||||||
|
}
|
||||||
|
#if 0
|
||||||
|
// Overlapping comms and compute typically slows down compute and is useless
|
||||||
|
// unless memory bandwidth greatly exceeds network
|
||||||
|
template<class compressor>
|
||||||
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
|
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
|
||||||
Mergers.resize(0);
|
Mergers.resize(0);
|
||||||
Packets.resize(0);
|
Packets.resize(0);
|
||||||
HaloGather(source,compress);
|
HaloGather(source,compress);
|
||||||
return std::thread([&] { this->Communicate(); });
|
return std::thread([&] { this->Communicate(); });
|
||||||
}
|
}
|
||||||
|
|
||||||
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
|
||||||
{
|
|
||||||
auto thr = HaloExchangeBegin(source,compress);
|
|
||||||
HaloExchangeComplete(thr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void HaloExchangeComplete(std::thread &thr)
|
void HaloExchangeComplete(std::thread &thr)
|
||||||
{
|
{
|
||||||
CommsMerge(); // spins
|
CommsMerge(); // spins
|
||||||
@ -537,21 +638,10 @@ PARALLEL_FOR_LOOP
|
|||||||
thr.join();
|
thr.join();
|
||||||
jointime+=usecond();
|
jointime+=usecond();
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
template<class compressor>
|
||||||
|
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
|
||||||
{
|
{
|
||||||
// conformable(source._grid,_grid);
|
|
||||||
assert(source._grid==_grid);
|
|
||||||
halogtime-=usecond();
|
|
||||||
|
|
||||||
assert (comm_buf.size() == _unified_buffer_size );
|
|
||||||
u_comm_offset=0;
|
|
||||||
|
|
||||||
// Gather all comms buffers
|
|
||||||
for(int point = 0 ; point < _npoints; point++) {
|
|
||||||
|
|
||||||
compress.Point(point);
|
|
||||||
|
|
||||||
int dimension = _directions[point];
|
int dimension = _directions[point];
|
||||||
int displacement = _distances[point];
|
int displacement = _distances[point];
|
||||||
|
|
||||||
@ -601,10 +691,27 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
|
{
|
||||||
|
// conformable(source._grid,_grid);
|
||||||
|
assert(source._grid==_grid);
|
||||||
|
halogtime-=usecond();
|
||||||
|
|
||||||
|
assert (comm_buf.size() == _unified_buffer_size );
|
||||||
|
u_comm_offset=0;
|
||||||
|
|
||||||
|
// Gather all comms buffers
|
||||||
|
for(int point = 0 ; point < _npoints; point++) {
|
||||||
|
compress.Point(point);
|
||||||
|
HaloGatherDir(source,compress,point);
|
||||||
|
}
|
||||||
|
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
halogtime+=usecond();
|
halogtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
|
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
|
||||||
{
|
{
|
||||||
typedef typename cobj::vector_type vector_type;
|
typedef typename cobj::vector_type vector_type;
|
||||||
@ -653,13 +760,6 @@ PARALLEL_FOR_LOOP
|
|||||||
assert (recv_from_rank != _grid->ThisRank());
|
assert (recv_from_rank != _grid->ThisRank());
|
||||||
|
|
||||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||||
/*
|
|
||||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&comm_buf[u_comm_offset],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
*/
|
|
||||||
AddPacket((void *)&u_send_buf[u_comm_offset],
|
AddPacket((void *)&u_send_buf[u_comm_offset],
|
||||||
(void *) &comm_buf[u_comm_offset],
|
(void *) &comm_buf[u_comm_offset],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
@ -672,6 +772,7 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
|
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
|
||||||
{
|
{
|
||||||
const int Nsimd = _grid->Nsimd();
|
const int Nsimd = _grid->Nsimd();
|
||||||
@ -684,6 +785,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
assert(comm_dim==1);
|
assert(comm_dim==1);
|
||||||
|
// This will not work with a rotate dim
|
||||||
assert(simd_layout==2);
|
assert(simd_layout==2);
|
||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
@ -729,6 +831,8 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
|
// FIXME
|
||||||
|
// This logic is hard coded to simd_layout ==2 and not allowing >2
|
||||||
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
|
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
|
||||||
|
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
|
27  lib/Timer.h
@@ -39,11 +39,18 @@ namespace Grid {
 // Dress the output; use std::chrono

 // C++11 time facilities better?
-double usecond(void);
+inline double usecond(void) {
+  struct timeval tv;
+#ifdef TIMERS_ON
+  gettimeofday(&tv,NULL);
+#endif
+  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
+}

 typedef std::chrono::system_clock GridClock;
 typedef std::chrono::time_point<GridClock> GridTimePoint;
 typedef std::chrono::milliseconds GridTime;
+typedef std::chrono::microseconds GridUsecs;

 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 {
@@ -55,29 +62,39 @@ class GridStopWatch {
 private:
   bool running;
   GridTimePoint start;
-  GridTime accumulator;
+  GridUsecs accumulator;
 public:
   GridStopWatch () {
     Reset();
   }
   void Start(void) {
     assert(running == false);
+#ifdef TIMERS_ON
     start = GridClock::now();
+#endif
     running = true;
   }
   void Stop(void) {
     assert(running == true);
-    accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start);
+#ifdef TIMERS_ON
+    accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
+#endif
     running = false;
   };
   void Reset(void){
     running = false;
+#ifdef TIMERS_ON
     start = GridClock::now();
-    accumulator = std::chrono::duration_cast<GridTime>(start-start);
+#endif
+    accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
   }
   GridTime Elapsed(void) {
     assert(running == false);
-    return accumulator;
+    return std::chrono::duration_cast<GridTime>( accumulator );
+  }
+  uint64_t useconds(void){
+    assert(running == false);
+    return (uint64_t) accumulator.count();
   }
 };
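With the accumulator now held in microseconds, Elapsed() still reports milliseconds while the new useconds() exposes the raw microsecond count. A usage fragment for the stopwatch as declared above; do_work() is a hypothetical placeholder and TIMERS_ON is assumed to be defined, otherwise the elapsed time stays zero:

GridStopWatch timer;
timer.Start();
do_work();                                    // hypothetical timed section
timer.Stop();
std::cout << GridLogMessage << "work took " << timer.Elapsed()     // GridTime (ms)
          << " = " << timer.useconds() << " us" << std::endl;
timer.Reset();                                // ready for the next measurement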
@ -147,6 +147,56 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
Orthogonalise();
|
Orthogonalise();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
|
||||||
|
{
|
||||||
|
// Run a Lanczos with sloppy convergence
|
||||||
|
const int Nstop = nn;
|
||||||
|
const int Nk = nn+20;
|
||||||
|
const int Np = nn+20;
|
||||||
|
const int Nm = Nk+Np;
|
||||||
|
const int MaxIt= 10000;
|
||||||
|
RealD resid = 1.0e-3;
|
||||||
|
|
||||||
|
Chebyshev<FineField> Cheb(0.5,64.0,21);
|
||||||
|
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
|
||||||
|
// IRL.lock = 1;
|
||||||
|
|
||||||
|
FineField noise(FineGrid); gaussian(RNG,noise);
|
||||||
|
FineField tmp(FineGrid);
|
||||||
|
std::vector<RealD> eval(Nm);
|
||||||
|
std::vector<FineField> evec(Nm,FineGrid);
|
||||||
|
|
||||||
|
int Nconv;
|
||||||
|
IRL.calc(eval,evec,
|
||||||
|
noise,
|
||||||
|
Nconv);
|
||||||
|
|
||||||
|
// pull back nn vectors
|
||||||
|
for(int b=0;b<nn;b++){
|
||||||
|
|
||||||
|
subspace[b] = evec[b];
|
||||||
|
|
||||||
|
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
|
||||||
|
|
||||||
|
hermop.Op(subspace[b],tmp);
|
||||||
|
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
|
||||||
|
|
||||||
|
noise = tmp - sqrt(eval[b])*subspace[b] ;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
|
||||||
|
|
||||||
|
noise = tmp + eval[b]*subspace[b] ;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
Orthogonalise();
|
||||||
|
for(int b=0;b<nn;b++){
|
||||||
|
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
||||||
|
|
||||||
RealD scale;
|
RealD scale;
|
||||||
@ -200,7 +250,7 @@ namespace Grid {
|
|||||||
////////////////////
|
////////////////////
|
||||||
Geometry geom;
|
Geometry geom;
|
||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
|
CartesianStencil<siteVector,siteVector> Stencil;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
|
||||||
|
@@ -222,6 +222,7 @@ namespace Grid {
   SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
   virtual RealD Mpc (const Field &in, Field &out) {
     Field tmp(in._grid);
+    // std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;

     _Mat.Meooe(in,tmp);
     _Mat.MooeeInv(tmp,out);
@@ -251,10 +252,10 @@ namespace Grid {
   virtual RealD Mpc (const Field &in, Field &out) {
     Field tmp(in._grid);

-    _Mat.Meooe(in,tmp);
-    _Mat.MooeeInv(tmp,out);
-    _Mat.Meooe(out,tmp);
-    _Mat.MooeeInv(tmp,out);
+    _Mat.Meooe(in,out);
+    _Mat.MooeeInv(out,tmp);
+    _Mat.Meooe(tmp,out);
+    _Mat.MooeeInv(out,tmp);

     return axpy_norm(out,-1.0,tmp,in);
   }
@@ -270,6 +271,35 @@ namespace Grid {
   }
 };

+template<class Matrix,class Field>
+class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
+ protected:
+  Matrix &_Mat;
+ public:
+  SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
+
+  virtual RealD Mpc (const Field &in, Field &out) {
+    Field tmp(in._grid);
+
+    _Mat.MooeeInv(in,out);
+    _Mat.Meooe(out,tmp);
+    _Mat.MooeeInv(tmp,out);
+    _Mat.Meooe(out,tmp);
+
+    return axpy_norm(out,-1.0,tmp,in);
+  }
+  virtual RealD MpcDag (const Field &in, Field &out){
+    Field tmp(in._grid);
+
+    _Mat.MeooeDag(in,out);
+    _Mat.MooeeInvDag(out,tmp);
+    _Mat.MeooeDag(tmp,out);
+    _Mat.MooeeInvDag(out,tmp);
+
+    return axpy_norm(out,-1.0,tmp,in);
+  }
+};
+
 /////////////////////////////////////////////////////////////
 // Base classes for functions of operators
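Reading the new SchurDiagTwoOperator::Mpc call sequence line by line, the operator it applies appears to be the following, where the checkerboard labels are my reading of the Meooe/MooeeInv blocks rather than something spelled out in the diff:

% Mpc: apply the diagonal inverse, hop, diagonal inverse, hop, then subtract from the input.
\hat M_{\rm pc}\,\psi_e \;=\; \psi_e \;-\; M_{eo}\,M_{oo}^{-1}\,M_{oe}\,M_{ee}^{-1}\,\psi_e ,
\qquad
\hat M_{\rm pc}^{\dagger} \;=\; \bigl(\hat M_{\rm pc}\bigr)^{\dagger}
\quad\text{(realised by MpcDag with the daggered blocks applied in reverse order).}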
@@ -58,12 +58,13 @@ namespace Grid {
   Field Mtmp(in._grid);
   AtoN = in;
   out = AtoN*Coeffs[0];
-  // std::cout <<"Poly in " <<norm2(in)<<std::endl;
-  // std::cout <<"0 " <<norm2(out)<<std::endl;
+  // std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
+  // std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
   for(int n=1;n<Coeffs.size();n++){
     Mtmp = AtoN;
     Linop.HermOp(Mtmp,AtoN);
     out=out+AtoN*Coeffs[n];
+    // std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
     // std::cout << n<<" " <<norm2(out)<<std::endl;
   }
 };
@@ -82,7 +83,8 @@ namespace Grid {

 public:
   void csv(std::ostream &out){
-    for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
+    RealD diff = hi-lo;
+    for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
       RealD f = approx(x);
       out<< x<<" "<<f<<std::endl;
     }
@@ -99,10 +101,24 @@ namespace Grid {

   Chebyshev(){};
   Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
+  Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
+
   ////////////////////////////////////////////////////////////////////////////////////////////////////
   // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
   ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // CJ: the one we need for Lanczos
+  void Init(RealD _lo,RealD _hi,int _order)
+  {
+    lo=_lo;
+    hi=_hi;
+    order=_order;
+
+    if(order < 2) exit(-1);
+    Coeffs.resize(order);
+    Coeffs.assign(0.,order);
+    Coeffs[order-1] = 1.;
+  };
+
   void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
   {
     lo=_lo;
@@ -182,6 +198,8 @@ namespace Grid {
   void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

     GridBase *grid=in._grid;
+    //std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
+    //<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;

     int vol=grid->gSites();

|
|||||||
#define INCLUDED_ALG_REMEZ_H
|
#define INCLUDED_ALG_REMEZ_H
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
#include <Config.h>
|
||||||
|
|
||||||
//#include <algorithms/approx/bigfloat.h>
|
#ifdef HAVE_GMP_H
|
||||||
|
#include <algorithms/approx/bigfloat.h>
|
||||||
|
#else
|
||||||
#include <algorithms/approx/bigfloat_double.h>
|
#include <algorithms/approx/bigfloat_double.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
|
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
|
||||||
#define SUM_MAX 10 // Maximum number of terms in exponential
|
#define SUM_MAX 10 // Maximum number of terms in exponential
|
||||||
|
@@ -84,7 +84,7 @@ public:
     return;
   }

-  std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
+  std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;

   GridStopWatch LinalgTimer;
   GridStopWatch MatrixTimer;
@@ -101,8 +101,8 @@ public:
   MatrixTimer.Stop();

   LinalgTimer.Start();
-  RealD qqck = norm2(mmp);
-  ComplexD dck = innerProduct(p,mmp);
+  // RealD qqck = norm2(mmp);
+  // ComplexD dck = innerProduct(p,mmp);

   a = c/d;
   b_pred = a*(a*qq-d)/c;
@@ -133,8 +133,8 @@ public:
   std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
     <<" computed residual "<<sqrt(cp/ssq)
     <<" true residual " <<true_residual
-    <<" target "<<Tolerance;
-  std::cout<<" Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
+    <<" target "<<Tolerance<<std::endl;
+  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
   std::cout<<std::endl;

   assert(true_residual/Tolerance < 1000.0);
@@ -274,7 +274,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 }
 // ugly hack
 std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-assert(0);
+// assert(0);
 }

 };
@@ -38,32 +38,34 @@ template<class Field>
 class SortEigen {
  private:

+//hacking for testing for now
+ private:
   static bool less_lmd(RealD left,RealD right){
-    return fabs(left) < fabs(right);
+    return left > right;
   }
-  static bool less_pair(std::pair<RealD,Field>& left,
-                        std::pair<RealD,Field>& right){
-    return fabs(left.first) < fabs(right.first);
+  static bool less_pair(std::pair<RealD,Field const*>& left,
+                        std::pair<RealD,Field const*>& right){
+    return left.first > (right.first);
   }

  public:

   void push(DenseVector<RealD>& lmd,
             DenseVector<Field>& evec,int N) {
+    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
+    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];

-    DenseVector<std::pair<RealD, Field> > emod;
-    typename DenseVector<std::pair<RealD, Field> >::iterator it;
-
-    for(int i=0;i<lmd.size();++i){
-      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
-    }
+    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());
+    for(int i=0;i<lmd.size();++i)
+      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);

     partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);

-    it=emod.begin();
+    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
     for(int i=0;i<N;++i){
       lmd[i]=it->first;
-      evec[i]=it->second;
+      evec[i]=*(it->second);
       ++it;
     }
   }
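The sort now orders by value descending instead of by magnitude, and it sorts (eigenvalue, pointer) pairs into a one-off copy so the heavy Field objects are moved only during the final writeback. A small standalone illustration of that pointer-pair partial_sort pattern, with std::string standing in for the heavy Field type:

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Sort (value, payload) by value, descending, touching only pointers until the
// final writeback; mirrors the pattern used in SortEigen::push above.
void push(std::vector<double> &lmd, std::vector<std::string> &evec, int N) {
  std::vector<std::string> cpy(evec);                      // payloads copied once
  std::vector<std::pair<double, const std::string *>> emod(lmd.size());
  for (size_t i = 0; i < lmd.size(); ++i) emod[i] = { lmd[i], &cpy[i] };

  std::partial_sort(emod.begin(), emod.begin() + N, emod.end(),
                    [](const auto &a, const auto &b) { return a.first > b.first; });

  for (int i = 0; i < N; ++i) {                            // write back the top N
    lmd[i]  = emod[i].first;
    evec[i] = *emod[i].second;
  }
}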
@ -29,6 +29,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_IRL_H
 #define GRID_IRL_H

+#include <string.h> //memset
+#ifdef USE_LAPACK
+#include <lapacke.h>
+#endif
 #include <algorithms/iterative/DenseMatrix.h>
 #include <algorithms/iterative/EigenSort.h>

@ -49,6 +53,7 @@ public:
 int Niter;
 int converged;

+int Nstop;   // Number of evecs checked for convergence
 int Nk;      // Number of converged sought
 int Np;      // Np -- Number of spare vecs in kryloc space
 int Nm;      // Nm -- total number of vectors

@ -57,6 +62,8 @@ public:

 SortEigen<Field> _sort;

+// GridCartesian &_fgrid;

 LinearOperatorBase<Field> &_Linop;

 OperatorFunction<Field> &_poly;

@ -67,7 +74,27 @@ public:
 void init(void){};
 void Abort(int ff, DenseVector<RealD> &evals, DenseVector<DenseVector<RealD> > &evecs);

-ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
+ImplicitlyRestartedLanczos(
+       LinearOperatorBase<Field> &Linop, // op
+       OperatorFunction<Field> & poly,   // polynmial
+       int _Nstop, // sought vecs
+       int _Nk,    // sought vecs
+       int _Nm,    // spare vecs
+       RealD _eresid, // resid in lmdue deficit
+       int _Niter) :  // Max iterations
+  _Linop(Linop),
+  _poly(poly),
+  Nstop(_Nstop),
+  Nk(_Nk),
+  Nm(_Nm),
+  eresid(_eresid),
+  Niter(_Niter)
+{
+  Np = Nm-Nk; assert(Np>0);
+};

+ImplicitlyRestartedLanczos(
+       LinearOperatorBase<Field> &Linop, // op
        OperatorFunction<Field> & poly,   // polynmial
        int _Nk,    // sought vecs
        int _Nm,    // spare vecs
@ -75,6 +102,7 @@ public:
        int _Niter) :  // Max iterations
   _Linop(Linop),
   _poly(poly),
+  Nstop(_Nk),
   Nk(_Nk),
   Nm(_Nm),
   eresid(_eresid),
@ -142,6 +170,7 @@ public:
 RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 // 7. vk+1 := wk/βk+1

+// std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
 const RealD tiny = 1.0e-20;
 if ( beta < tiny ) {
 std::cout << " beta is tiny "<<beta<<std::endl;
@ -219,15 +248,122 @@ public:
 }
 }

+#ifdef USE_LAPACK
+void diagonalize_lapack(DenseVector<RealD>& lmd,
+                        DenseVector<RealD>& lme,
+                        int N1,
+                        int N2,
+                        DenseVector<RealD>& Qt,
+                        GridBase *grid){
+  const int size = Nm;
+  // tevals.resize(size);
+  // tevecs.resize(size);
+  int NN = N1;
+  double evals_tmp[NN];
+  double evec_tmp[NN][NN];
+  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+  // double AA[NN][NN];
+  double DD[NN];
+  double EE[NN];
+  for (int i = 0; i< NN; i++)
+    for (int j = i - 1; j <= i + 1; j++)
+      if ( j < NN && j >= 0 ) {
+        if (i==j) DD[i] = lmd[i];
+        if (i==j) evals_tmp[i] = lmd[i];
+        if (j==(i-1)) EE[j] = lme[j];
+      }
+  int evals_found;
+  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+  int liwork = 3+NN*10 ;
+  int iwork[liwork];
+  double work[lwork];
+  int isuppz[2*NN];
+  char jobz = 'V'; // calculate evals & evecs
+  char range = 'I'; // calculate all evals
+  // char range = 'A'; // calculate all evals
+  char uplo = 'U'; // refer to upper half of original matrix
+  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
+  int ifail[NN];
+  int info;
+  // int total = QMP_get_number_of_nodes();
+  // int node = QMP_get_node_number();
+  // GridBase *grid = evec[0]._grid;
+  int total = grid->_Nprocessors;
+  int node = grid->_processor;
+  int interval = (NN/total)+1;
+  double vl = 0.0, vu = 0.0;
+  int il = interval*node+1 , iu = interval*(node+1);
+  if (iu > NN) iu=NN;
+  double tol = 0.0;
+  if (1) {
+    memset(evals_tmp,0,sizeof(double)*NN);
+    if ( il <= NN){
+      printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
+      LAPACK_dstegr(&jobz, &range, &NN,
+                    (double*)DD, (double*)EE,
+                    &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
+                    &tol, // tolerance
+                    &evals_found, evals_tmp, (double*)evec_tmp, &NN,
+                    isuppz,
+                    work, &lwork, iwork, &liwork,
+                    &info);
+      for (int i = iu-1; i>= il-1; i--){
+        printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
+        evals_tmp[i] = evals_tmp[i - (il-1)];
+        if (il>1) evals_tmp[i-(il-1)]=0.;
+        for (int j = 0; j< NN; j++){
+          evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+          if (il>1) evec_tmp[i-(il-1)][j]=0.;
+        }
+      }
+    }
+    {
+      // QMP_sum_double_array(evals_tmp,NN);
+      // QMP_sum_double_array((double *)evec_tmp,NN*NN);
+      grid->GlobalSumVector(evals_tmp,NN);
+      grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+    }
+  }
+  // cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+  for(int i=0;i<NN;i++){
+    for(int j=0;j<NN;j++)
+      Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
+    lmd [NN-1-i]=evals_tmp[i];
+  }
+}
+#endif

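The closing loop of diagonalize_lapack reverses the index order because dstegr reports eigenvalues in increasing order while the in-house QR path produces them in decreasing order. A minimal illustration of just that reindexing step (plain arrays, illustrative only, not Grid code):

// Illustrative only: LAPACK returns evals_tmp ascending; writing entry i into
// slot NN-1-i reproduces the descending order that the QR routine produces.
#include <iostream>

int main(void) {
  const int NN = 4;
  double evals_tmp[NN] = { -3.0, 0.5, 1.0, 2.0 }; // ascending, as from LAPACK
  double lmd[NN];
  for (int i = 0; i < NN; i++) lmd[NN-1-i] = evals_tmp[i];
  for (int i = 0; i < NN; i++) std::cout << lmd[i] << " "; // 2 1 0.5 -3 : descending
  std::cout << std::endl;
}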
 void diagonalize(DenseVector<RealD>& lmd,
                  DenseVector<RealD>& lme,
-                 int Nm2,
-                 int Nm,
-                 DenseVector<RealD>& Qt)
+                 int N2,
+                 int N1,
+                 DenseVector<RealD>& Qt,
+                 GridBase *grid)
 {
-  int Niter = 100*Nm;
+#ifdef USE_LAPACK
+  const int check_lapack=0; // just use lapack if 0, check against lapack if 1
+
+  if(!check_lapack)
+    return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
+
+  DenseVector <RealD> lmd2(N1);
+  DenseVector <RealD> lme2(N1);
+  DenseVector<RealD> Qt2(N1*N1);
+  for(int k=0; k<N1; ++k){
+    lmd2[k] = lmd[k];
+    lme2[k] = lme[k];
+  }
+  for(int k=0; k<N1*N1; ++k)
+    Qt2[k] = Qt[k];
+
+  // diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+#endif
+
+  int Niter = 100*N1;
   int kmin = 1;
-  int kmax = Nk;
+  int kmax = N2;
   // (this should be more sophisticated)

   for(int iter=0; iter<Niter; ++iter){
@ -239,7 +375,7 @@ public:
   // (Dsh: shift)

   // transformation
-  qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
+  qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);

   // Convergence criterion (redef of kmin and kamx)
   for(int j=kmax-1; j>= kmin; --j){
@ -250,6 +386,23 @@ public:
   }
   }
   Niter = iter;
+#ifdef USE_LAPACK
+  if(check_lapack){
+    const double SMALL=1e-8;
+    diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
+    DenseVector <RealD> lmd3(N2);
+    for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
+    _sort.push(lmd3,N2);
+    _sort.push(lmd2,N2);
+    for(int k=0; k<N2; ++k){
+      if (fabs(lmd2[k] - lmd3[k]) >SMALL) std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
+      // if (fabs(lme2[k] - lme[k]) >SMALL) std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
+    }
+    for(int k=0; k<N1*N1; ++k){
+      // if (fabs(Qt2[k] - Qt[k]) >SMALL) std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
+    }
+  }
+#endif
   return;

 continued:
@ -265,6 +418,7 @@ public:
   abort();
 }

+#if 1
 static RealD normalise(Field& v)
 {
   RealD nn = norm2(v);
@ -326,6 +480,7 @@ until convergence
 {

 GridBase *grid = evec[0]._grid;
+assert(grid == src._grid);

 std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
 std::cout << " -- Nm = " << Nm << std::endl;
@ -356,11 +511,21 @@ until convergence
 // (uniform vector) Why not src??
 // evec[0] = 1.0;
 evec[0] = src;
+std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
+// << src._grid << std::endl;
 normalise(evec[0]);
+std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
+// << evec[0]._grid << std::endl;

 // Initial Nk steps
 for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+// std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
+// std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
 RitzMatrix(evec,Nk);
+for(int k=0; k<Nk; ++k){
+  // std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
+  // std:: cout <<"lme " << k << " " << lme[k] << std::endl;
+}

 // Restarting loop begins
 for(int iter = 0; iter<Niter; ++iter){
@ -382,20 +547,24 @@ until convergence
 lme2[k] = lme[k+k1-1];
 }
 setUnit_Qt(Nm,Qt);
-diagonalize(eval2,lme2,Nm,Nm,Qt);
+diagonalize(eval2,lme2,Nm,Nm,Qt,grid);

 // sorting
 _sort.push(eval2,Nm);

 // Implicitly shifted QR transformations
 setUnit_Qt(Nm,Qt);
-for(int ip=k2; ip<Nm; ++ip)
+for(int ip=k2; ip<Nm; ++ip){
+  std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);

+}

 for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;

 for(int j=k1-1; j<k2+1; ++j){
 for(int k=0; k<Nm; ++k){
+B[j].checkerboard = evec[k].checkerboard;
 B[j] += Qt[k+Nm*j] * evec[k];
 }
 }
@ -418,21 +587,25 @@ until convergence
 lme2[k] = lme[k];
 }
 setUnit_Qt(Nm,Qt);
-diagonalize(eval2,lme2,Nk,Nm,Qt);
+diagonalize(eval2,lme2,Nk,Nm,Qt,grid);

 for(int k = 0; k<Nk; ++k) B[k]=0.0;

 for(int j = 0; j<Nk; ++j){
 for(int k = 0; k<Nk; ++k){
+B[j].checkerboard = evec[k].checkerboard;
 B[j] += Qt[k+j*Nm] * evec[k];
 }
+// std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 }
+// _sort.push(eval2,B,Nk);

 Nconv = 0;
 // std::cout << std::setiosflags(std::ios_base::scientific);
 for(int i=0; i<Nk; ++i){

-_poly(_Linop,B[i],v);
+// _poly(_Linop,B[i],v);
+_Linop.HermOp(B[i],v);

 RealD vnum = real(innerProduct(B[i],v)); // HermOp.
 RealD vden = norm2(B[i]);
@ -440,11 +613,13 @@ until convergence
 v -= eval2[i]*B[i];
 RealD vv = norm2(v);

+std::cout.precision(13);
 std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;

-if(vv<eresid*eresid){
+// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+if((vv<eresid*eresid) && (i == Nconv) ){
 Iconv[Nconv] = i;
 ++Nconv;
 }
@ -455,7 +630,7 @@ until convergence

 std::cout<<" #modes converged: "<<Nconv<<std::endl;

-if( Nconv>=Nk ){
+if( Nconv>=Nstop ){
 goto converged;
 }
 } // end of iter loop
@ -465,12 +640,11 @@ until convergence

 converged:
 // Sorting
-eval.clear();
-evec.clear();
+eval.resize(Nconv);
+evec.resize(Nconv,grid);
 for(int i=0; i<Nconv; ++i){
-eval.push_back(eval2[Iconv[i]]);
-evec.push_back(B[Iconv[i]]);
+eval[i] = eval2[Iconv[i]];
+evec[i] = B[Iconv[i]];
 }
 _sort.push(eval,evec,Nconv);

@ -1025,6 +1199,7 @@ static void Lock(DenseMatrix<T> &H, ///Hess mtx

 }
 }
+#endif

 };

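The revised acceptance test only counts mode i as converged when every lower mode has already converged, so Nconv measures a contiguous leading block of converged modes rather than a scattered set. A small standalone sketch of that rule (illustrative values only, not Grid code):

// Sketch: Nconv is the length of the leading contiguous block whose residual
// |H B[i] - eval[i] B[i]|^2 is below eresid^2, mirroring the new (i == Nconv) test.
#include <iostream>
#include <vector>

int main(void) {
  double eresid = 1.0e-3;
  std::vector<double> vv = {1e-8, 1e-9, 1e-2, 1e-10}; // residual norms per mode
  int Nconv = 0;
  for (int i = 0; i < (int)vv.size(); ++i) {
    if ( (vv[i] < eresid*eresid) && (i == Nconv) ) ++Nconv;
  }
  std::cout << "Nconv = " << Nconv << std::endl; // 2 : mode 3 is ignored because mode 2 failed
}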
@ -47,6 +47,10 @@ namespace Grid {
 int mmax;
 int nstep;
 int steps;
+GridStopWatch PrecTimer;
+GridStopWatch MatTimer;
+GridStopWatch LinalgTimer;

 LinearFunction<Field> &Preconditioner;

 PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
@ -68,14 +72,24 @@ namespace Grid {

 Field r(src._grid);

+PrecTimer.Reset();
+MatTimer.Reset();
+LinalgTimer.Reset();

+GridStopWatch SolverTimer;
+SolverTimer.Start();

 steps=0;
 for(int k=0;k<MaxIterations;k++){

 cp=GCRnStep(Linop,src,psi,rsq);

-if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;

 if(cp<rsq) {

+SolverTimer.Stop();

 Linop.HermOp(psi,r);
 axpy(r,-1.0,src,r);
 RealD tr = norm2(r);
@ -83,6 +97,11 @@ namespace Grid {
 << " computed residual "<<sqrt(cp/ssq)
 << " true residual " <<sqrt(tr/ssq)
 << " target " <<Tolerance <<std::endl;

+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 return;
 }

@ -90,6 +109,7 @@ namespace Grid {
 std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
 assert(0);
 }

 RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){

 RealD cp;
@ -116,24 +136,25 @@ namespace Grid {
 // initial guess x0 is taken as nonzero.
 // r0=src-A x0 = src
 //////////////////////////////////
+MatTimer.Start();
 Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
+MatTimer.Stop();
 r=src-Az;

 /////////////////////
 // p = Prec(r)
 /////////////////////
+PrecTimer.Start();
 Preconditioner(r,z);
+PrecTimer.Stop();

-std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl;
-std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl;
+MatTimer.Start();

 Linop.HermOp(z,tmp);
+MatTimer.Stop();

-std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl;
 ttmp=tmp;
 tmp=tmp-r;

-std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl;
 /*
 std::cout<<GridLogMessage<<r<<std::endl;
 std::cout<<GridLogMessage<<z<<std::endl;
@ -141,7 +162,9 @@ namespace Grid {
 std::cout<<GridLogMessage<<tmp<<std::endl;
 */

+MatTimer.Start();
 Linop.HermOpAndNorm(z,Az,zAz,zAAz);
+MatTimer.Stop();

 //p[0],q[0],qq[0]
 p[0]= z;
@ -165,16 +188,20 @@ namespace Grid {

 cp = axpy_norm(r,-a,q[peri_k],r);

-std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl;
 if((k==nstep-1)||(cp<rsq)){
 return cp;
 }

+std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;

+PrecTimer.Start();
 Preconditioner(r,z);// solve Az = r
+PrecTimer.Stop();

+MatTimer.Start();
 Linop.HermOpAndNorm(z,Az,zAz,zAAz);


 Linop.HermOp(z,tmp);
+MatTimer.Stop();
 tmp=tmp-r;
 std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;

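The timers above follow a simple reset/start/stop/elapsed cycle wrapped around the preconditioner and matrix applications. A self-contained stand-in using std::chrono (GridStopWatch itself is only assumed here to expose the same four calls used in the patch; this is not the Grid implementation):

// Stand-in sketch only: mirrors the Reset/Start/Stop/Elapsed pattern used with
// GridStopWatch; std::chrono is used so the example compiles on its own.
#include <chrono>
#include <iostream>

struct StopWatch {
  std::chrono::steady_clock::duration accum{};
  std::chrono::steady_clock::time_point t0;
  void Reset(void) { accum = std::chrono::steady_clock::duration::zero(); }
  void Start(void) { t0 = std::chrono::steady_clock::now(); }
  void Stop(void)  { accum += std::chrono::steady_clock::now() - t0; }
  double ElapsedMs(void) { return std::chrono::duration<double,std::milli>(accum).count(); }
};

int main(void) {
  StopWatch PrecTimer, MatTimer;
  PrecTimer.Reset(); MatTimer.Reset();
  for (int k = 0; k < 3; k++) {
    MatTimer.Start();  /* Linop.HermOpAndNorm(...) would run here */  MatTimer.Stop();
    PrecTimer.Start(); /* Preconditioner(r,z) would run here */       PrecTimer.Stop();
  }
  std::cout << "Matrix " << MatTimer.ElapsedMs() << " ms, Precon " << PrecTimer.ElapsedMs() << " ms" << std::endl;
}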
@ -102,6 +102,8 @@ namespace Grid {

 pickCheckerboard(Even,src_e,in);
 pickCheckerboard(Odd ,src_o,in);
+pickCheckerboard(Even,sol_e,out);
+pickCheckerboard(Odd ,sol_o,out);

 /////////////////////////////////////////////////////
 // src_o = Mdag * (source_o - Moe MeeInv source_e)
@ -115,27 +115,11 @@ public:
 for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
 return idx;
 }
-static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
-  int nd= dims.size();
-  coor.resize(nd);
-  for(int d=0;d<nd;d++){
-    coor[d] = index % dims[d];
-    index = index / dims[d];
-  }
-}
 inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
-  CoorFromIndex(coor,Oindex,_rdimensions);
+  Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
-}
-static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
-  int nd=dims.size();
-  int stride=1;
-  index=0;
-  for(int d=0;d<nd;d++){
-    index = index+stride*coor[d];
-    stride=stride*dims[d];
-  }
 }


 //////////////////////////////////////////////////////////
 // SIMD lane addressing
 //////////////////////////////////////////////////////////
@ -147,13 +131,32 @@ public:
 }
 inline void iCoorFromIindex(std::vector<int> &coor,int lane)
 {
-  CoorFromIndex(coor,lane,_simd_layout);
+  Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
 }
 inline int PermuteDim(int dimension){
 return _simd_layout[dimension]>1;
 }
 inline int PermuteType(int dimension){
 int permute_type=0;
+//
+// FIXME:
+//
+// Best way to encode this would be to present a mask
+// for which simd dimensions are rotated, and the rotation
+// size. If there is only one simd dimension rotated, this is just
+// a permute.
+//
+// Cases: PermuteType == 1,2,4,8
+// Distance should be either 0,1,2..
+//
+if ( _simd_layout[dimension] > 2 ) {
+  for(int d=0;d<_ndimension;d++){
+    if ( d != dimension ) assert ( (_simd_layout[d]==1) );
+  }
+  permute_type = RotateBit; // How to specify distance; this is not just direction.
+  return permute_type;
+}

 for(int d=_ndimension-1;d>dimension;d--){
 if (_simd_layout[d]>1 ) permute_type++;
 }
@ -163,12 +166,12 @@ public:
 // Array sizing queries
 ////////////////////////////////////////////////////////////////

-inline int iSites(void) { return _isites; };
-inline int Nsimd(void) { return _isites; };// Synonymous with iSites
-inline int oSites(void) { return _osites; };
-inline int lSites(void) { return _isites*_osites; };
-inline int gSites(void) { return _isites*_osites*_Nprocessors; };
-inline int Nd (void) { return _ndimension;};
+inline int iSites(void) const { return _isites; };
+inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
+inline int oSites(void) const { return _osites; };
+inline int lSites(void) const { return _isites*_osites; };
+inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
+inline int Nd (void) const { return _ndimension;};

 inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
 inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
@ -179,7 +182,10 @@ public:
 // Global addressing
 ////////////////////////////////////////////////////////////////
 void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
-  CoorFromIndex(gcoor,gidx,_gdimensions);
+  Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
+}
+void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
+  Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
 }
 void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
 gidx=0;
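The removed static helpers and the Lexicographic:: calls that replace them implement the usual lexicographic index/coordinate mapping: the index is a mixed-radix number with the dimension sizes as radices. A standalone sketch of the two inverse maps, copied in spirit from the deleted code:

// Sketch of the mapping the deleted CoorFromIndex/IndexFromCoor implemented and
// that the Lexicographic:: helpers are assumed to provide.
#include <cassert>
#include <vector>

static void CoorFromIndex(std::vector<int> &coor, int index, const std::vector<int> &dims) {
  coor.resize(dims.size());
  for (size_t d = 0; d < dims.size(); d++) { coor[d] = index % dims[d]; index /= dims[d]; }
}
static void IndexFromCoor(const std::vector<int> &coor, int &index, const std::vector<int> &dims) {
  int stride = 1; index = 0;
  for (size_t d = 0; d < dims.size(); d++) { index += stride*coor[d]; stride *= dims[d]; }
}

int main(void) {
  std::vector<int> dims = {4,4,4,8}, coor;
  CoorFromIndex(coor, 115, dims);         // 115 = 3 + 4*(0 + 4*(3 + 4*1)) -> coor = {3,0,3,1}
  int back; IndexFromCoor(coor, back, dims);
  assert(back == 115);                    // the two maps are inverses
}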
@ -170,9 +170,15 @@ public:
 // Use a reduced simd grid
 _simd_layout[d] = simd_layout[d];
 _rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+assert(_rdimensions[d]>0);

 // all elements of a simd vector must have same checkerboard.
-if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0);
+// If Ls vectorised, this must still be the case; e.g. dwf rb5d
+if ( _simd_layout[d]>1 ) {
+  if ( d != _checker_dim ) {
+    assert( (_rdimensions[d]&0x1) == 0 );
+  }
+}

 _osites *= _rdimensions[d];
 _isites *= _simd_layout[d];
@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_COMMS_MPI
|
#ifdef GRID_COMMS_MPI
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GRID_COMMS_SHMEM
|
||||||
|
#include <mpp/shmem.h>
|
||||||
|
#endif
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
class CartesianCommunicator {
|
class CartesianCommunicator {
|
||||||
public:
|
public:
|
||||||
@ -53,6 +56,8 @@ class CartesianCommunicator {
|
|||||||
typedef int CommsRequest_t;
|
typedef int CommsRequest_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void Init(int *argc, char ***argv);
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
CartesianCommunicator(const std::vector<int> &pdimensions_in);
|
CartesianCommunicator(const std::vector<int> &pdimensions_in);
|
||||||
|
|
||||||
@ -81,6 +86,7 @@ class CartesianCommunicator {
|
|||||||
void GlobalSumVector(RealD *,int N);
|
void GlobalSumVector(RealD *,int N);
|
||||||
|
|
||||||
void GlobalSum(uint32_t &);
|
void GlobalSum(uint32_t &);
|
||||||
|
void GlobalSum(uint64_t &);
|
||||||
|
|
||||||
void GlobalSum(ComplexF &c)
|
void GlobalSum(ComplexF &c)
|
||||||
{
|
{
|
||||||
@ -115,11 +121,10 @@ class CartesianCommunicator {
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
void RecvFrom(void *recv,
|
void SendRecvPacket(void *xmit,
|
||||||
int recv_from_rank,
|
void *recv,
|
||||||
int bytes);
|
|
||||||
void SendTo(void *xmit,
|
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
@ -31,6 +31,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
// Should error check all MPI calls.
|
// Should error check all MPI calls.
|
||||||
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
|
int flag;
|
||||||
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
|
if ( !flag ) {
|
||||||
|
MPI_Init(argc,argv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int Rank(void) {
|
||||||
|
int pe;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD,&pe);
|
||||||
|
return pe;
|
||||||
|
}
|
||||||
|
|
||||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
{
|
{
|
||||||
@ -59,6 +72,10 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
|
|||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
void CartesianCommunicator::GlobalSum(float &f){
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
@ -108,21 +125,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
SendToRecvFromComplete(reqs);
|
SendToRecvFromComplete(reqs);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::RecvFrom(void *recv,
|
|
||||||
int from,
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
MPI_Status stat;
|
MPI_Status stat;
|
||||||
int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
|
assert(sender != receiver);
|
||||||
assert(ierr==0);
|
int tag = sender;
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::SendTo(void *xmit,
|
|
||||||
int dest,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
int rank = _processor; // used for tag; must know who it comes from
|
|
||||||
int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
|
|
||||||
assert(ierr==0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Basic Halo comms primitive
|
// Basic Halo comms primitive
|
||||||
|
@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include "Grid.h"
|
#include "Grid.h"
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
|
void CartesianCommunicator::Init(int *argc, char *** arv)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
int Rank(void ){ return 0; };
|
||||||
|
|
||||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
{
|
{
|
||||||
_processors = processors;
|
_processors = processors;
|
||||||
@ -47,16 +53,13 @@ void CartesianCommunicator::GlobalSum(float &){}
|
|||||||
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
||||||
void CartesianCommunicator::GlobalSum(double &){}
|
void CartesianCommunicator::GlobalSum(double &){}
|
||||||
void CartesianCommunicator::GlobalSum(uint32_t &){}
|
void CartesianCommunicator::GlobalSum(uint32_t &){}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &){}
|
||||||
void CartesianCommunicator::GlobalSumVector(double *,int N){}
|
void CartesianCommunicator::GlobalSumVector(double *,int N){}
|
||||||
|
|
||||||
void CartesianCommunicator::RecvFrom(void *recv,
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
int recv_from_rank,
|
void *recv,
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::SendTo(void *xmit,
|
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
|
int recv_from_rank,
|
||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
|
334
lib/communicator/Communicator_shmem.cc
Normal file
334
lib/communicator/Communicator_shmem.cc
Normal file
@ -0,0 +1,334 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/communicator/Communicator_shmem.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include "Grid.h"
|
||||||
|
#include <mpp/shmem.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
// Should error check all MPI calls.
|
||||||
|
#define SHMEM_VET(addr)
|
||||||
|
|
||||||
|
#define SHMEM_VET_DEBUG(addr) { \
|
||||||
|
if ( ! shmem_addr_accessible(addr,_processor) ) {\
|
||||||
|
std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
|
||||||
|
BACKTRACEFILE(); \
|
||||||
|
}\
|
||||||
|
}
|
||||||
|
int Rank(void) {
|
||||||
|
return shmem_my_pe();
|
||||||
|
}
|
||||||
|
typedef struct HandShake_t {
|
||||||
|
uint64_t seq_local;
|
||||||
|
uint64_t seq_remote;
|
||||||
|
} HandShake;
|
||||||
|
|
||||||
|
static Vector< HandShake > XConnections;
|
||||||
|
static Vector< HandShake > RConnections;
|
||||||
|
|
||||||
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
|
shmem_init();
|
||||||
|
XConnections.resize(shmem_n_pes());
|
||||||
|
RConnections.resize(shmem_n_pes());
|
||||||
|
for(int pe =0 ; pe<shmem_n_pes();pe++){
|
||||||
|
XConnections[pe].seq_local = 0;
|
||||||
|
XConnections[pe].seq_remote= 0;
|
||||||
|
RConnections[pe].seq_local = 0;
|
||||||
|
RConnections[pe].seq_remote= 0;
|
||||||
|
}
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
|
{
|
||||||
|
_ndimension = processors.size();
|
||||||
|
std::vector<int> periodic(_ndimension,1);
|
||||||
|
|
||||||
|
_Nprocessors=1;
|
||||||
|
_processors = processors;
|
||||||
|
_processor_coor.resize(_ndimension);
|
||||||
|
|
||||||
|
_processor = shmem_my_pe();
|
||||||
|
|
||||||
|
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
|
||||||
|
|
||||||
|
for(int i=0;i<_ndimension;i++){
|
||||||
|
_Nprocessors*=_processors[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int Size = shmem_n_pes();
|
||||||
|
|
||||||
|
|
||||||
|
assert(Size==_Nprocessors);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||||
|
static long long source ;
|
||||||
|
static long long dest ;
|
||||||
|
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
// int nreduce=1;
|
||||||
|
// int pestart=0;
|
||||||
|
// int logStride=0;
|
||||||
|
|
||||||
|
source = u;
|
||||||
|
dest = 0;
|
||||||
|
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
shmem_barrier_all(); // necessary?
|
||||||
|
u = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||||
|
static long long source ;
|
||||||
|
static long long dest ;
|
||||||
|
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
// int nreduce=1;
|
||||||
|
// int pestart=0;
|
||||||
|
// int logStride=0;
|
||||||
|
|
||||||
|
source = u;
|
||||||
|
dest = 0;
|
||||||
|
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
shmem_barrier_all(); // necessary?
|
||||||
|
u = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
|
static float source ;
|
||||||
|
static float dest ;
|
||||||
|
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
source = f;
|
||||||
|
dest =0.0;
|
||||||
|
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
f = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||||
|
{
|
||||||
|
static float source ;
|
||||||
|
static float dest = 0 ;
|
||||||
|
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
if ( shmem_addr_accessible(f,_processor) ){
|
||||||
|
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
dest =0.0;
|
||||||
|
source = f[i];
|
||||||
|
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
f[i] = dest;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
|
{
|
||||||
|
static double source;
|
||||||
|
static double dest ;
|
||||||
|
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
source = d;
|
||||||
|
dest = 0;
|
||||||
|
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
d = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||||
|
{
|
||||||
|
static double source ;
|
||||||
|
static double dest ;
|
||||||
|
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
if ( shmem_addr_accessible(d,_processor) ){
|
||||||
|
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
source = d[i];
|
||||||
|
dest =0.0;
|
||||||
|
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
d[i] = dest;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||||
|
{
|
||||||
|
std::vector<int> coor = _processor_coor;
|
||||||
|
|
||||||
|
assert(std::abs(shift) <_processors[dim]);
|
||||||
|
|
||||||
|
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
|
||||||
|
Lexicographic::IndexFromCoor(coor,source,_processors);
|
||||||
|
|
||||||
|
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
|
||||||
|
Lexicographic::IndexFromCoor(coor,dest,_processors);
|
||||||
|
|
||||||
|
}
|
||||||
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
int rank;
|
||||||
|
Lexicographic::IndexFromCoor(coor,rank,_processors);
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
Lexicographic::CoorFromIndex(coor,rank,_processors);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
SHMEM_VET(xmit);
|
||||||
|
SHMEM_VET(recv);
|
||||||
|
std::vector<CommsRequest_t> reqs(0);
|
||||||
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
|
SendToRecvFromComplete(reqs);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
static uint64_t seq;
|
||||||
|
|
||||||
|
assert(recv!=xmit);
|
||||||
|
volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
|
||||||
|
volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
|
||||||
|
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
|
||||||
|
printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
|
||||||
|
// Check he has posted a receive
|
||||||
|
while(SendSeq->seq_remote == SendSeq->seq_local);
|
||||||
|
|
||||||
|
printf("Sender receive %d posted\n",sender,receiver);
|
||||||
|
|
||||||
|
// Advance our send count
|
||||||
|
seq = ++(SendSeq->seq_local);
|
||||||
|
|
||||||
|
// Send this packet
|
||||||
|
SHMEM_VET(recv);
|
||||||
|
shmem_putmem(recv,xmit,bytes,receiver);
|
||||||
|
shmem_fence();
|
||||||
|
|
||||||
|
printf("Sender sent payload %d\n",seq);
|
||||||
|
//Notify him we're done
|
||||||
|
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
|
||||||
|
shmem_fence();
|
||||||
|
printf("Sender ringing door bell %d\n",seq);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
|
||||||
|
printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
|
||||||
|
// Post a receive
|
||||||
|
seq = ++(RecvSeq->seq_local);
|
||||||
|
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
|
||||||
|
|
||||||
|
printf("Receiver Opening letter box %d\n",seq);
|
||||||
|
|
||||||
|
|
||||||
|
// Now wait until he has advanced our reception counter
|
||||||
|
while(RecvSeq->seq_remote != RecvSeq->seq_local);
|
||||||
|
|
||||||
|
printf("Receiver Got the mail %d\n",seq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
SHMEM_VET(xmit);
|
||||||
|
SHMEM_VET(recv);
|
||||||
|
// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
|
||||||
|
shmem_putmem(recv,xmit,bytes,dest);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
// shmem_quiet(); // I'm done
|
||||||
|
shmem_barrier_all();// He's done too
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::Barrier(void)
|
||||||
|
{
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
static uint32_t word;
|
||||||
|
uint32_t *array = (uint32_t *) data;
|
||||||
|
assert( (bytes % 4)==0);
|
||||||
|
int words = bytes/4;
|
||||||
|
|
||||||
|
if ( shmem_addr_accessible(data,_processor) ){
|
||||||
|
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
word = array[w];
|
||||||
|
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
|
||||||
|
if ( shmem_my_pe() != root ) {
|
||||||
|
array[w] = word;
|
||||||
|
}
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
static uint32_t word;
|
||||||
|
uint32_t *array = (uint32_t *) data;
|
||||||
|
assert( (bytes % 4)==0);
|
||||||
|
int words = bytes/4;
|
||||||
|
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
word = array[w];
|
||||||
|
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
|
||||||
|
if ( shmem_my_pe() != root ) {
|
||||||
|
array[w]= word;
|
||||||
|
}
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -35,7 +35,7 @@ class SimpleCompressor {
|
|||||||
public:
|
public:
|
||||||
void Point(int) {};
|
void Point(int) {};
|
||||||
|
|
||||||
vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
|
vobj operator() (const vobj &arg) {
|
||||||
return arg;
|
return arg;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -56,24 +56,24 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
int stride=rhs._grid->_slice_stride[dimension];
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*stride;
|
||||||
int bo = n*rhs._grid->_slice_block[dimension];
|
int bo = n*e2;
|
||||||
buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
int bo=0;
|
int bo=0;
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*stride;
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
buffer[off+bo++]=compress(rhs._odata[so+o+b]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -97,16 +97,16 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
int n1=rhs._grid->_slice_stride[dimension];
|
||||||
|
int n2=rhs._grid->_slice_block[dimension];
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o=n*rhs._grid->_slice_stride[dimension];
|
int o = n*n1;
|
||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*n2;
|
||||||
|
cobj temp =compress(rhs._odata[so+o+b]);
|
||||||
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
|
||||||
extract<cobj>(temp,pointers,offset);
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -121,7 +121,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
cobj temp =compress(rhs._odata[so+o+b]);
|
||||||
extract<cobj>(temp,pointers,offset);
|
extract<cobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -243,13 +243,13 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
int stride = rhs._grid->_slice_stride[dimension];
|
||||||
if(cbmask == 0x3 ){
|
if(cbmask == 0x3 ){
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
int o =n*stride+b;
|
||||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
}
|
}
|
||||||
@ -259,7 +259,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
int o =n*stride+b;
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
@ -285,11 +285,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block [dimension];
|
int e2=rhs._grid->_slice_block [dimension];
|
||||||
|
int stride = rhs._grid->_slice_stride[dimension];
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension];
|
int o =n*stride;
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||||
@ -323,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
|
|||||||
int rd = grid->_rdimensions[dimension];
|
int rd = grid->_rdimensions[dimension];
|
||||||
int ld = grid->_ldimensions[dimension];
|
int ld = grid->_ldimensions[dimension];
|
||||||
int gd = grid->_gdimensions[dimension];
|
int gd = grid->_gdimensions[dimension];
|
||||||
|
int ly = grid->_simd_layout[dimension];
|
||||||
|
|
||||||
// Map to always positive shift modulo global full dimension.
|
// Map to always positive shift modulo global full dimension.
|
||||||
shift = (shift+fd)%fd;
|
shift = (shift+fd)%fd;
|
||||||
@ -331,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
|
|||||||
// the permute type
|
// the permute type
|
||||||
int permute_dim =grid->PermuteDim(dimension);
|
int permute_dim =grid->PermuteDim(dimension);
|
||||||
int permute_type=grid->PermuteType(dimension);
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
int permute_type_dist;
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
@ -342,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
|
|||||||
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
// FIXME : This must change where we have a
|
||||||
|
// Rotate slice.
|
||||||
|
|
||||||
|
// Document how this works ; why didn't I do this when I first wrote it...
|
||||||
|
// wrap is whether sshift > rd.
|
||||||
|
// num is sshift mod rd.
|
||||||
|
//
|
||||||
int permute_slice=0;
|
int permute_slice=0;
|
||||||
if(permute_dim){
|
if(permute_dim){
|
||||||
int wrap = sshift/rd;
|
int wrap = sshift/rd;
|
||||||
int num = sshift%rd;
|
int num = sshift%rd;
|
||||||
|
|
||||||
if ( x< rd-num ) permute_slice=wrap;
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
else permute_slice = 1-wrap;
|
else permute_slice = (wrap+1)%ly;
|
||||||
|
|
||||||
|
if ( (ly>2) && (permute_slice) ) {
|
||||||
|
assert(permute_type & RotateBit);
|
||||||
|
permute_type_dist = permute_type|permute_slice;
|
||||||
|
} else {
|
||||||
|
permute_type_dist = permute_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
|
}
|
||||||
|
|
||||||
|
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
|
||||||
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
|
||||||
|
|
||||||
|
@ -191,8 +191,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
int words = sizeof(vobj)/sizeof(vector_type);
|
int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
|
||||||
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
std::vector<scalar_object *> pointers(Nsimd); //
|
std::vector<scalar_object *> pointers(Nsimd); //
|
||||||
|
@@ -55,7 +55,13 @@ extern int GridCshiftPermuteMap[4][16];
 // Basic expressions used in Expression Template
 ////////////////////////////////////////////////

-class LatticeBase {};
+class LatticeBase
+{
+public:
+virtual ~LatticeBase(void) = default;
+GridBase *_grid;
+};

 class LatticeExpressionBase {};

 template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Aligned allocator??
@@ -88,8 +94,6 @@ template<class vobj>
 class Lattice : public LatticeBase
 {
 public:

-GridBase *_grid;
 int checkerboard;
 Vector<vobj> _odata;
@@ -177,8 +181,8 @@ PARALLEL_FOR_LOOP
 }
 //GridFromExpression is tricky to do
 template<class Op,class T1>
-Lattice(const LatticeUnaryExpression<Op,T1> & expr): _grid(nullptr){
+Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
+_grid = nullptr;
 GridFromExpression(_grid,expr);
 assert(_grid!=nullptr);

@@ -199,7 +203,8 @@ PARALLEL_FOR_LOOP
 }
 };
 template<class Op,class T1, class T2>
-Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr): _grid(nullptr){
+Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
+_grid = nullptr;
 GridFromExpression(_grid,expr);
 assert(_grid!=nullptr);

@@ -220,7 +225,8 @@ PARALLEL_FOR_LOOP
 }
 };
 template<class Op,class T1, class T2, class T3>
-Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr): _grid(nullptr){
+Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
+_grid = nullptr;
 GridFromExpression(_grid,expr);
 assert(_grid!=nullptr);

@@ -240,7 +246,8 @@ PARALLEL_FOR_LOOP
 // Constructor requires "grid" passed.
 // what about a default grid?
 //////////////////////////////////////////////////////////////////
-Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
+Lattice(GridBase *grid) : _odata(grid->oSites()) {
+_grid = grid;
 // _odata.reserve(_grid->oSites());
 // _odata.resize(_grid->oSites());
 // std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
@@ -248,6 +255,8 @@ PARALLEL_FOR_LOOP
 checkerboard=0;
 }

+virtual ~Lattice(void) = default;

 template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 PARALLEL_FOR_LOOP
 for(int ss=0;ss<_grid->oSites();ss++){
@@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj,class sobj>
-void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){

 GridBase *grid=l._grid;

@@ -152,7 +152,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 assert(grid!=NULL);

 // FIXME
-std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
+// std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;

 const int Nd = grid->_ndimension;
 const int Nsimd = grid->Nsimd();
@@ -178,7 +178,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 // sum over reduced dimension planes, breaking out orthog dir

 for(int ss=0;ss<grid->oSites();ss++){
-GridBase::CoorFromIndex(coor,ss,grid->_rdimensions);
+Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 int r = coor[orthogdim];
 lvSum[r]=lvSum[r]+Data._odata[ss];
 }
@@ -75,7 +75,7 @@ namespace Grid {

 std::seed_seq src;

-fixedSeed(std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
+fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};

 result_type operator () (void){

@@ -122,6 +122,7 @@ namespace Grid {
 std::vector<RngEngine> _generators;
 std::vector<std::uniform_real_distribution<RealD>> _uniform;
 std::vector<std::normal_distribution<RealD>> _gaussian;
+std::vector<std::discrete_distribution<int32_t>> _bernoulli;

 void GetState(std::vector<RngStateType> & saved,int gen) {
 saved.resize(RngStateCount);
@@ -161,6 +162,7 @@ namespace Grid {
 _generators.resize(1);
 _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
 _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
 _seeded=0;
 }

@@ -242,7 +244,7 @@ namespace Grid {
 std::random_device rd;
 Seed(rd);
 }
-void SeedFixedIntegers(std::vector<int> &seeds){
+void SeedFixedIntegers(const std::vector<int> &seeds){
 fixedSeed src(seeds);
 Seed(src);
 }
@@ -266,6 +268,7 @@ namespace Grid {
 _generators.resize(_vol);
 _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
 _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
 _seeded=0;
 }

@@ -354,7 +357,7 @@ PARALLEL_FOR_LOOP
 std::random_device rd;
 Seed(rd);
 }
-void SeedFixedIntegers(std::vector<int> &seeds){
+void SeedFixedIntegers(const std::vector<int> &seeds){
 fixedSeed src(seeds);
 Seed(src);
 }
@@ -369,13 +372,21 @@ PARALLEL_FOR_LOOP
 rng.fill(l,rng._gaussian);
 }

+template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
+rng.fill(l,rng._bernoulli);
+}

 template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
 rng.fill(l,rng._uniform);
 }

 template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
 rng.fill(l,rng._gaussian);
 }

+template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
+rng.fill(l,rng._bernoulli);
+}

 }
 #endif
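Note: the new _bernoulli member is a std::discrete_distribution initialised with the two weights {1,1}, so each draw returns 0 or 1 with equal probability. A minimal standalone sketch of that behaviour; the engine and seed are illustrative only, not the library's RngEngine:

// Coin-flip behaviour of std::discrete_distribution<int32_t>{1,1}.
#include <cstdio>
#include <random>

int main(void) {
  std::mt19937 engine(42);                              // any engine; fixed seed for reproducibility
  std::discrete_distribution<int32_t> bernoulli{1, 1};  // equal weights -> values 0 or 1
  int counts[2] = {0, 0};
  for (int i = 0; i < 10000; i++) counts[bernoulli(engine)]++;
  std::printf("zeros=%d ones=%d\n", counts[0], counts[1]);
  return 0;
}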
@@ -115,9 +115,9 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 int sc;
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);
-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 for(int i=0;i<nbasis;i++) {

@@ -160,9 +160,9 @@ PARALLEL_FOR_LOOP
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);

-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 // z = A x + y
 fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
@@ -225,9 +225,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);

-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];

@@ -311,9 +311,9 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);

-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 for(int i=0;i<nbasis;i++) {
 if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
@@ -325,6 +325,126 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,

 }

+// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
+// Simd layouts need not match since we use peek/poke Local
+template<class vobj,class vvobj>
+void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
+{
+typedef typename vobj::scalar_object sobj;
+typedef typename vvobj::scalar_object ssobj;
+
+sobj s;
+ssobj ss;
+
+GridBase *ig = in._grid;
+GridBase *og = out._grid;
+
+int ni = ig->_ndimension;
+int no = og->_ndimension;
+
+assert(ni == no);
+
+for(int d=0;d<no;d++){
+assert(ig->_processors[d] == og->_processors[d]);
+assert(ig->_ldimensions[d] == og->_ldimensions[d]);
+}
+
+PARALLEL_FOR_LOOP
+for(int idx=0;idx<ig->lSites();idx++){
+std::vector<int> lcoor(ni);
+ig->LocalIndexToLocalCoor(idx,lcoor);
+peekLocalSite(s,in,lcoor);
+ss=s;
+pokeLocalSite(ss,out,lcoor);
+}
+}
+
+
+template<class vobj>
+void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
+{
+typedef typename vobj::scalar_object sobj;
+sobj s;
+
+GridBase *lg = lowDim._grid;
+GridBase *hg = higherDim._grid;
+int nl = lg->_ndimension;
+int nh = hg->_ndimension;
+
+assert(nl+1 == nh);
+assert(orthog<nh);
+assert(orthog>=0);
+assert(hg->_processors[orthog]==1);
+
+int dl; dl = 0;
+for(int d=0;d<nh;d++){
+if ( d != orthog) {
+assert(lg->_processors[dl] == hg->_processors[d]);
+assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+dl++;
+}
+}
+
+// the above should guarantee that the operations are local
+PARALLEL_FOR_LOOP
+for(int idx=0;idx<lg->lSites();idx++){
+std::vector<int> lcoor(nl);
+std::vector<int> hcoor(nh);
+lg->LocalIndexToLocalCoor(idx,lcoor);
+dl=0;
+hcoor[orthog] = slice;
+for(int d=0;d<nh;d++){
+if ( d!=orthog ) {
+hcoor[d]=lcoor[dl++];
+}
+}
+peekLocalSite(s,lowDim,lcoor);
+pokeLocalSite(s,higherDim,hcoor);
+}
+}
+
+template<class vobj>
+void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog)
+{
+typedef typename vobj::scalar_object sobj;
+sobj s;
+
+GridBase *lg = lowDim._grid;
+GridBase *hg = higherDim._grid;
+int nl = lg->_ndimension;
+int nh = hg->_ndimension;
+
+assert(nl+1 == nh);
+assert(orthog<nh);
+assert(orthog>=0);
+assert(hg->_processors[orthog]==1);
+
+int dl; dl = 0;
+for(int d=0;d<nh;d++){
+if ( d != orthog) {
+assert(lg->_processors[dl] == hg->_processors[d]);
+assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+dl++;
+}
+}
+// the above should guarantee that the operations are local
+PARALLEL_FOR_LOOP
+for(int idx=0;idx<lg->lSites();idx++){
+std::vector<int> lcoor(nl);
+std::vector<int> hcoor(nh);
+lg->LocalIndexToLocalCoor(idx,lcoor);
+dl=0;
+hcoor[orthog] = slice;
+for(int d=0;d<nh;d++){
+if ( d!=orthog ) {
+hcoor[d]=lcoor[dl++];
+}
+}
+peekLocalSite(s,higherDim,hcoor);
+pokeLocalSite(s,lowDim,lcoor);
+}
+
+}
+
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
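Note: both InsertSlice and ExtractSlice rely on the same coordinate mapping, embedding the lower-dimensional local coordinate into the higher-dimensional one with the orthogonal direction pinned to the slice index. A standalone C++ sketch of just that mapping; the values of nh, orthog, slice and lcoor are invented for illustration:

// Embedding an (nh-1)-dimensional coordinate into an nh-dimensional one.
#include <cstdio>
#include <vector>

int main(void) {
  const int nh = 4, orthog = 3, slice = 2;   // assumed dimensions and slice choice
  std::vector<int> lcoor = {1, 2, 3};        // nl = nh-1 local coordinate
  std::vector<int> hcoor(nh);
  int dl = 0;
  hcoor[orthog] = slice;                     // pin the orthogonal direction
  for (int d = 0; d < nh; d++) {
    if (d != orthog) hcoor[d] = lcoor[dl++]; // copy the remaining directions in order
  }
  std::printf("hcoor = {%d,%d,%d,%d}\n", hcoor[0], hcoor[1], hcoor[2], hcoor[3]);
  return 0;
}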
@@ -146,7 +146,7 @@ class BinaryIO {
 csum = 0;
 std::vector<int> lcoor;
 for(int l=0;l<grid->lSites();l++){
-grid->CoorFromIndex(lcoor,l,grid->_ldimensions);
+Lexicographic::CoorFromIndex(lcoor,l,grid->_ldimensions);
 peekLocalSite(siteObj,lat,lcoor);
 munge(siteObj,fileObj,csum);
 }
@@ -168,6 +168,7 @@ class BinaryIO {
 GridBase *grid = Umu._grid;

 std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
+GridStopWatch timer; timer.Start();

 int ieee32big = (format == std::string("IEEE32BIG"));
 int ieee32 = (format == std::string("IEEE32"));
@@ -182,6 +183,7 @@ class BinaryIO {

 Umu = zero;
 uint32_t csum=0;
+uint64_t bytes=0;
 fobj file_object;
 sobj munged;

@@ -194,7 +196,7 @@ class BinaryIO {

 if ( grid->IsBoss() ) {
 fin.read((char *)&file_object,sizeof(file_object));
+bytes += sizeof(file_object);
 if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
 if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
 if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
@@ -205,6 +207,10 @@ class BinaryIO {
 // The boss who read the file has their value poked
 pokeSite(munged,Umu,site);
 }}}}
+timer.Stop();
+std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }

@@ -224,13 +230,14 @@ class BinaryIO {
 // Serialise through node zero
 //////////////////////////////////////////////////
 std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
+GridStopWatch timer; timer.Start();

 std::ofstream fout;
 if ( grid->IsBoss() ) {
 fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 fout.seekp(offset);
 }
+uint64_t bytes=0;
 uint32_t csum=0;
 fobj file_object;
 sobj unmunged;
@@ -253,9 +260,14 @@ class BinaryIO {
 if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));

+// NB could gather an xstrip as an optimisation.
 fout.write((char *)&file_object,sizeof(file_object));
+bytes+=sizeof(file_object);
 }
 }}}}
+timer.Stop();
+std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }
@@ -265,6 +277,7 @@ class BinaryIO {
 typedef typename GridSerialRNG::RngStateType RngStateType;
 const int RngStateCount = GridSerialRNG::RngStateCount;


 GridBase *grid = parallel._grid;
 int gsites = grid->_gsites;

@@ -310,7 +323,7 @@ class BinaryIO {
 Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 fout.write((char *)&saved[0],bytes);
 }
+grid->Broadcast(0,(void *)&csum,sizeof(csum));
 return csum;
 }
 static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
@@ -360,6 +373,8 @@ class BinaryIO {
 Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 }

+grid->Broadcast(0,(void *)&csum,sizeof(csum));

 return csum;
 }
@@ -426,6 +441,9 @@ class BinaryIO {
 std::cout << std::endl;
 }

+GridStopWatch timer; timer.Start();
+uint64_t bytes=0;

 int myrank = grid->ThisRank();
 int iorank = grid->RankFromProcessorCoor(ioproc);

@@ -439,9 +457,9 @@ class BinaryIO {
 // available (how short sighted is that?)
 //////////////////////////////////////////////////////////
 Umu = zero;
-uint32_t csum=0;
+static uint32_t csum=0;
 fobj fileObj;
-sobj siteObj;
+static sobj siteObj; // Static to place in symmetric region for SHMEM

 // need to implement these loops in Nd independent way with a lexico conversion
 for(int tlex=0;tlex<slice_vol;tlex++){
@@ -451,7 +469,7 @@ class BinaryIO {
 std::vector<int> lsite(nd);
 std::vector<int> iosite(nd);

-grid->CoorFromIndex(tsite,tlex,range);
+Lexicographic::CoorFromIndex(tsite,tlex,range);

 for(int d=0;d<nd;d++){
 lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
@@ -472,6 +490,7 @@ class BinaryIO {

 fin.seekg(offset+g_idx*sizeof(fileObj));
 fin.read((char *)&fileObj,sizeof(fileObj));
+bytes+=sizeof(fileObj);

 if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
@@ -480,22 +499,28 @@ class BinaryIO {

 munge(fileObj,siteObj,csum);

-if ( rank != myrank ) {
-grid->SendTo((void *)&siteObj,rank,sizeof(siteObj));
-} else {
-pokeLocalSite(siteObj,Umu,lsite);
 }

-} else {
-if ( myrank == rank ) {
-grid->RecvFrom((void *)&siteObj,iorank,sizeof(siteObj));
-pokeLocalSite(siteObj,Umu,lsite);
+// Possibly do transport through pt2pt
+if ( rank != iorank ) {
+if ( (myrank == rank) || (myrank==iorank) ) {
+grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
 }
 }
+// Poke at destination
+if ( myrank == rank ) {
+pokeLocalSite(siteObj,Umu,lsite);
+}
 grid->Barrier(); // necessary?
 }

 grid->GlobalSum(csum);
+grid->GlobalSum(bytes);
+grid->Barrier();

+timer.Stop();
+std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }
@@ -530,7 +555,7 @@ class BinaryIO {

 for(int d=0;d<grid->_ndimension;d++) {

-if ( d==0 ) parallel[d] = 0;
+if ( d!= grid->_ndimension-1 ) parallel[d] = 0;

 if (parallel[d]) {
 range[d] = grid->_ldimensions[d];
@@ -559,6 +584,9 @@ class BinaryIO {
 std::cout << std::endl;
 }

+GridStopWatch timer; timer.Start();
+uint64_t bytes=0;

 int myrank = grid->ThisRank();
 int iorank = grid->RankFromProcessorCoor(ioproc);

@@ -577,9 +605,9 @@ class BinaryIO {

 uint32_t csum=0;
 fobj fileObj;
-sobj siteObj;
+static sobj siteObj; // static for SHMEM target; otherwise dynamic allocate with AlignedAllocator


+// should aggregate a whole chunk and then write.
 // need to implement these loops in Nd independent way with a lexico conversion
 for(int tlex=0;tlex<slice_vol;tlex++){

@@ -588,7 +616,7 @@ class BinaryIO {
 std::vector<int> lsite(nd);
 std::vector<int> iosite(nd);

-grid->CoorFromIndex(tsite,tlex,range);
+Lexicographic::CoorFromIndex(tsite,tlex,range);

 for(int d=0;d<nd;d++){
 lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
@@ -606,13 +634,21 @@ class BinaryIO {
 ////////////////////////////////
 // iorank writes from the seek
 ////////////////////////////////
-if (myrank == iorank) {

-if ( rank != myrank ) {
-grid->RecvFrom((void *)&siteObj,rank,sizeof(siteObj));
-} else {
+// Owner of data peeks it
 peekLocalSite(siteObj,Umu,lsite);

+// Pair of nodes may need to do pt2pt send
+if ( rank != iorank ) { // comms is necessary
+if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+// Send to IOrank
+grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
 }
+}

+grid->Barrier(); // necessary?

+if (myrank == iorank) {

 munge(siteObj,fileObj,csum);

@@ -623,17 +659,16 @@ class BinaryIO {

 fout.seekp(offset+g_idx*sizeof(fileObj));
 fout.write((char *)&fileObj,sizeof(fileObj));
+bytes+=sizeof(fileObj);
-} else {
-if ( myrank == rank ) {
-peekLocalSite(siteObj,Umu,lsite);
-grid->SendTo((void *)&siteObj,iorank,sizeof(siteObj));
 }
 }
-grid->Barrier(); // necessary// or every 16 packets to rate throttle??
-}

 grid->GlobalSum(csum);
+grid->GlobalSum(bytes);

+timer.Stop();
+std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }
@@ -213,37 +213,38 @@ class NerscIO : public BinaryIO {
 static inline void truncate(std::string file){
 std::ofstream fout(file,std::ios::out);
 }

+#define dump_nersc_header(field, s)\
+s << "BEGIN_HEADER" << std::endl;\
+s << "HDR_VERSION = " << field.hdr_version << std::endl;\
+s << "DATATYPE = " << field.data_type << std::endl;\
+s << "STORAGE_FORMAT = " << field.storage_format << std::endl;\
+for(int i=0;i<4;i++){\
+s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;\
+}\
+s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;\
+s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl;\
+for(int i=0;i<4;i++){\
+s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;\
+}\
+\
+s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;\
+s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;\
+s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;\
+s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;\
+s << "CREATOR = " << field.creator << std::endl;\
+s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;\
+s << "CREATION_DATE = " << field.creation_date << std::endl;\
+s << "ARCHIVE_DATE = " << field.archive_date << std::endl;\
+s << "FLOATING_POINT = " << field.floating_point << std::endl;\
+s << "END_HEADER" << std::endl;

 static inline unsigned int writeHeader(NerscField &field,std::string file)
 {
 std::ofstream fout(file,std::ios::out|std::ios::in);

 fout.seekp(0,std::ios::beg);
-fout << "BEGIN_HEADER" << std::endl;
-fout << "HDR_VERSION = " << field.hdr_version << std::endl;
-fout << "DATATYPE = " << field.data_type << std::endl;
-fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;
-
-for(int i=0;i<4;i++){
-fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
-}
-// just to keep the space and write it later
-fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
-fout << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl;
-for(int i=0;i<4;i++){
-fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
-}
-
-fout << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;
-
-fout << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;
-fout << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;
-fout << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;
-fout << "CREATOR = " << field.creator << std::endl;
-fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
-fout << "CREATION_DATE = " << field.creation_date << std::endl;
-fout << "ARCHIVE_DATE = " << field.archive_date << std::endl;
-fout << "FLOATING_POINT = " << field.floating_point << std::endl;
-fout << "END_HEADER" << std::endl;
+dump_nersc_header(field, fout);
 field.data_start = fout.tellp();
 return field.data_start;
 }
@@ -353,7 +354,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
 (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
 }
-} else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
+} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
 if ( ieee32 || ieee32big ) {
 //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
@@ -372,6 +373,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,

 assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
 assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );

 assert(csum == header.checksum );

 std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
@@ -419,6 +421,7 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
 std::string file1 = file+"para";
 int offset1 = writeHeader(header,file1);
 int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
+//int csum1=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);


 std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
@@ -429,11 +432,12 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu

 } else {
 header.floating_point = std::string("IEEE64BIG");
-header.data_type = std::string("4D_SU3_GAUGE_3X3");
+header.data_type = std::string("4D_SU3_GAUGE_3x3");
 NerscSimpleUnmunger<fobj3D,sobj> munge;
 BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
 offset = writeHeader(header,file);
-csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
+// csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
+csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 }

 std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
@@ -507,6 +511,8 @@ static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel
 // munger is a function of <floating point, Real, data_type>
 uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);

+std::cerr<<" Csum "<< csum << " "<< header.checksum <<std::endl;

 assert(csum == header.checksum );

 std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
@@ -90,7 +90,7 @@ namespace QCD {
 template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
 template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;

-template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;

 // Spin matrix
@@ -383,7 +383,6 @@ namespace QCD {
 //////////////////////////////////////////////
 // Poke scalars
 //////////////////////////////////////////////

 template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
 {
 pokeIndex<SpinIndex>(lhs,rhs,i);
@@ -407,6 +406,40 @@ namespace QCD {
 pokeIndex<LorentzIndex>(lhs,rhs,i);
 }

+//////////////////////////////////////////////
+// Fermion <-> propagator assignements
+//////////////////////////////////////////////
+template <class Prop, class Ferm>
+void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
+{
+for(int j = 0; j < Ns; ++j)
+{
+auto pjs = peekSpin(p, j, s);
+auto fj = peekSpin(f, j);
+
+for(int i = 0; i < Nc; ++i)
+{
+pokeColour(pjs, peekColour(fj, i), i, c);
+}
+pokeSpin(p, pjs, j, s);
+}
+}
+
+template <class Prop, class Ferm>
+void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
+{
+for(int j = 0; j < Ns; ++j)
+{
+auto pjs = peekSpin(p, j, s);
+auto fj = peekSpin(f, j);
+
+for(int i = 0; i < Nc; ++i)
+{
+pokeColour(fj, peekColour(pjs, i, c), i);
+}
+pokeSpin(f, fj, j);
+}
+}
+
 //////////////////////////////////////////////
 // transpose array and scalar
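Note: FermToProp above copies a fermion (a spin-colour vector at each site) into the (s,c) column of a propagator, and PropToFerm is its inverse. A standalone C++ sketch of the index bookkeeping only; the flat arrays and their layout are stand-ins for the tensor types used in the code, not the library's storage:

// Index layout sketch: a propagator is a (Ns*Nc) x (Ns*Nc) matrix per site;
// FermToProp fills the column labelled by (s,c) from a (Ns*Nc) fermion vector.
#include <cstdio>
#include <vector>

int main(void) {
  const int Ns = 4, Nc = 3;                         // spin and colour ranges, as in the loops above
  std::vector<double> ferm(Ns * Nc, 1.0);           // stand-in fermion at one site
  std::vector<double> prop(Ns * Nc * Ns * Nc, 0.0); // stand-in propagator at one site
  const int s = 1, c = 2;                           // column being filled (illustrative)
  for (int j = 0; j < Ns; j++)
    for (int i = 0; i < Nc; i++)
      prop[((j * Nc + i) * Ns + s) * Nc + c] = ferm[j * Nc + i];
  std::printf("filled column (s=%d,c=%d) with %zu entries\n", s, c, ferm.size());
  return 0;
}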
@@ -113,6 +113,8 @@ typedef SymanzikGaugeAction<ConjugateGimplD> ConjugateSymanzikGaugeAction
 template class A<GparityWilsonImplF>; \
 template class A<GparityWilsonImplD>;

+#define GparityFermOpTemplateInstantiate(A)

 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
@@ -208,6 +210,14 @@ typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;

+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
+typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
+typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
+typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
+typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;

 }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
@@ -527,6 +527,7 @@ namespace QCD {
 }

 FermOpTemplateInstantiate(CayleyFermion5D);
+GparityFermOpTemplateInstantiate(CayleyFermion5D);

 }}
@@ -130,7 +130,7 @@ namespace Grid {

 typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 typedef WilsonImplParams ImplParams;
-typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

 ImplParams Params;

@@ -142,6 +142,10 @@ namespace Grid {
 mult(&phi(),&U(mu),&chi());
 }

+template<class ref>
+inline void loadLinkElement(Simd & reg,ref &memory){
+reg = memory;
+}
 inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 {
 conformable(Uds._grid,GaugeGrid);
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP

 };

+///////
+// Single flavour four spinors with colour index, 5d redblack
+///////
+template<class S,int Nrepresentation=Nc>
+class DomainWallRedBlack5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
+public:
+
+typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
+
+INHERIT_GIMPL_TYPES(Gimpl);
+
+template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+template<typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+template<typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
+template<typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
+template<typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+
+typedef iImplSpinor <Simd> SiteSpinor;
+typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
+typedef Lattice<SiteSpinor> FermionField;
+
+// Make the doubled gauge field a *scalar*
+typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
+typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
+typedef iImplGaugeLink <typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
+
+typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+
+typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+typedef WilsonImplParams ImplParams;
+typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
+
+ImplParams Params;
+
+DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {};
+
+bool overlapCommsCompute(void) { return false; };
+
+template<class ref>
+inline void loadLinkElement(Simd & reg,ref &memory){
+vsplat(reg,memory);
+}
+inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+{
+SiteGaugeLink UU;
+for(int i=0;i<Nrepresentation;i++){
+for(int j=0;j<Nrepresentation;j++){
+vsplat(UU()()(i,j),U(mu)()(i,j));
+}
+}
+mult(&phi(),&UU(),&chi());
+}
+
+inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+{
+SiteScalarGaugeField ScalarUmu;
+SiteDoubledGaugeField ScalarUds;
+
+GaugeLinkField U (Umu._grid);
+GaugeField Uadj(Umu._grid);
+for(int mu=0;mu<Nd;mu++){
+U = PeekIndex<LorentzIndex>(Umu,mu);
+U = adj(Cshift(U,mu,-1));
+PokeIndex<LorentzIndex>(Uadj,U,mu);
+}
+
+for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
+std::vector<int> lcoor;
+GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
+
+peekLocalSite(ScalarUmu,Umu,lcoor);
+for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
+
+peekLocalSite(ScalarUmu,Uadj,lcoor);
+for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
+
+pokeLocalSite(ScalarUds,Uds,lcoor);
+}
+
+}
+
+inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+assert(0);
+}
+
+inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+assert(0);
+}
+
+};
+
 ////////////////////////////////////////////////////////////////////////////////////////
 // Flavour doubled spinors; is Gparity the only? what about C*?
 ////////////////////////////////////////////////////////////////////////////////////////
@@ -205,7 +303,7 @@ PARALLEL_FOR_LOOP
 typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;

 typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
-typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

 typedef GparityWilsonImplParams ImplParams;

@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
 typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
 typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

+typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
+typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
+typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double

 typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
 typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
 typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
@@ -48,14 +48,16 @@ namespace Grid {
 GridCartesian &FourDimGrid,
 GridRedBlackCartesian &FourDimRedBlackGrid,
 RealD _mass,RealD _M5,
-RealD scale) :
+// RealD scale):
+RealD scale,const ImplParams &p= ImplParams()) :

 // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
 MobiusFermion<Impl>(_Umu,
 FiveDimGrid,
 FiveDimRedBlackGrid,
 FourDimGrid,
-FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
+FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
+// FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
 {
 }
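Note: the comment in the hunk above records the arithmetic behind the Mobius parameters: from b+c = scale and b-c = 1 one gets b = (scale+1)/2 and c = (scale-1)/2, which is exactly what the 0.5*(scale+1.0) and 0.5*(scale-1.0) arguments pass down. A standalone C++ check with an invented scale value:

// Quick numerical check of the b,c parameterisation of the scaled Shamir kernel.
#include <cstdio>

int main(void) {
  const double scale = 2.0;             // illustrative scale factor
  const double b = 0.5 * (scale + 1.0); // b+c = scale
  const double c = 0.5 * (scale - 1.0); // b-c = 1
  std::printf("b=%g c=%g b+c=%g b-c=%g\n", b, c, b + c, b - c);
  return 0;
}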
@ -48,12 +48,7 @@ namespace QCD {
|
|||||||
mu=p;
|
mu=p;
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
return spinproject(in);
|
|
||||||
}
|
|
||||||
|
|
||||||
SiteHalfSpinor spinproject(const SiteSpinor &in)
|
|
||||||
{
|
|
||||||
SiteHalfSpinor ret;
|
SiteHalfSpinor ret;
|
||||||
int mudag=mu;
|
int mudag=mu;
|
||||||
if (!dag) {
|
if (!dag) {
|
||||||
@ -92,6 +87,173 @@ namespace QCD {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
// optimised versions
|
||||||
|
/////////////////////////
|
||||||
|
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonXpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjXp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonYpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjYp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonZpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjZp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonTpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjTp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonXmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjXm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonYmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjYm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonZmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjZm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonTmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjTm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Fast comms buffer manipulation which should inline right through (avoid direction-
// dependent logic that prevents inlining)
template<class vobj,class cobj>
class WilsonStencil : public CartesianStencil<vobj,cobj> {
public:

  WilsonStencil(GridBase *grid,
                int npoints,
                int checkerboard,
                const std::vector<int> &directions,
                const std::vector<int> &distances)
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
  { };

  template < class compressor>
  std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
    this->Mergers.resize(0);
    this->Packets.resize(0);
    this->HaloGatherOpt(source,compress);
    return std::thread([&] { this->Communicate(); });
  }

  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
  {
    auto thr = this->HaloExchangeOptBegin(source,compress);
    this->HaloExchangeOptComplete(thr);
  }

  void HaloExchangeOptComplete(std::thread &thr)
  {
    this->CommsMerge(); // spins
    this->jointime-=usecond();
    thr.join();
    this->jointime+=usecond();
  }

  template < class compressor>
  void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
  {
    // conformable(source._grid,_grid);
    assert(source._grid==this->_grid);
    this->halogtime-=usecond();

    assert (this->comm_buf.size() == this->_unified_buffer_size );
    this->u_comm_offset=0;

    // The dagger hop gathers in the natural direction order; the non-dagger
    // hop pairs each projector with the opposite gather direction.
    int dag = compress.dag;
    static std::vector<int> dirs(Nd*2);
    for(int mu=0;mu<Nd;mu++){
      if ( dag ) {
        dirs[mu]   =mu;
        dirs[mu+4] =mu+Nd;
      } else {
        dirs[mu]   =mu+Nd;
        dirs[mu+Nd]=mu;
      }
    }

    WilsonXpCompressor<cobj,vobj> XpCompress;
    this->HaloGatherDir(source,XpCompress,dirs[0]);

    WilsonYpCompressor<cobj,vobj> YpCompress;
    this->HaloGatherDir(source,YpCompress,dirs[1]);

    WilsonZpCompressor<cobj,vobj> ZpCompress;
    this->HaloGatherDir(source,ZpCompress,dirs[2]);

    WilsonTpCompressor<cobj,vobj> TpCompress;
    this->HaloGatherDir(source,TpCompress,dirs[3]);

    WilsonXmCompressor<cobj,vobj> XmCompress;
    this->HaloGatherDir(source,XmCompress,dirs[4]);

    WilsonYmCompressor<cobj,vobj> YmCompress;
    this->HaloGatherDir(source,YmCompress,dirs[5]);

    WilsonZmCompressor<cobj,vobj> ZmCompress;
    this->HaloGatherDir(source,ZmCompress,dirs[6]);

    WilsonTmCompressor<cobj,vobj> TmCompress;
    this->HaloGatherDir(source,TmCompress,dirs[7]);

    assert(this->u_comm_offset==this->_unified_buffer_size);
    this->halogtime+=usecond();
  }

};
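The Begin/Complete split above is what buys communication/computation overlap: the gather runs up front, the MPI traffic runs on its own thread, and the caller joins only when the halo is actually needed. A minimal standalone sketch of the same pattern with std::thread and placeholder phases (none of these functions are Grid API; they only mark where the real gather, comms and compute would go):

#include <thread>
#include <vector>

// Hypothetical stand-ins for the gather, comms and compute phases.
void gather_faces(std::vector<double> &sendbuf) { /* pack surface sites      */ }
void communicate (std::vector<double> &sendbuf) { /* send/receive and wait   */ }
void compute_interior()                         { /* stencil on bulk sites   */ }
void compute_exterior()                         { /* stencil on surface data */ }

int main(void) {
  std::vector<double> sendbuf(1024, 0.0);

  gather_faces(sendbuf);                             // like HaloGatherOpt
  std::thread comms([&] { communicate(sendbuf); });  // like HaloExchangeOptBegin
  compute_interior();                                // overlapped local work
  comms.join();                                      // like HaloExchangeOptComplete
  compute_exterior();                                // needs the received halo
  return 0;
}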
}} // namespace close
#endif
@@ -64,7 +64,9 @@ namespace QCD {
template<class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
  Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
  GaugeField HUmu(_Umu._grid);
  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
}
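Worth spelling out why the factor appears here: ImportGauge now pre-scales the doubled links by -0.5, and the matching result*(-0.5) factors are dropped from the site kernels later in this diff, so the scaling is applied once per link instead of once per output site. In the usual convention (the projector signs depend on the gamma basis, hence the hedged signs) the Wilson hopping term this factor belongs to reads

  D_{\rm hop}\,\psi(x) = -\tfrac{1}{2}\sum_{\mu}\Big[(1\mp\gamma_\mu)\,U_\mu(x)\,\psi(x+\hat\mu) + (1\pm\gamma_\mu)\,U^\dagger_\mu(x-\hat\mu)\,\psi(x-\hat\mu)\Big],

so absorbing the -1/2 into the stored field, U_mu -> -U_mu/2, lets each kernel stream its spin-reconstructed sum directly into the output.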
@@ -286,121 +288,27 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
                                       const FermionField &in, FermionField &out,int dag)
{
  if ( Impl::overlapCommsCompute () ) {
    DhopInternalCommsOverlapCompute(st,U,in,out,dag);
  } else {
    DhopInternalCommsThenCompute(st,U,in,out,dag);
  }
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
                                                       const FermionField &in, FermionField &out,int dag) {

  assert((dag==DaggerNo) ||(dag==DaggerYes));

  Compressor compressor(dag);
  st.HaloExchange(in,compressor);

  if ( dag == DaggerYes ) {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    }
  } else {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
      }
    }
  }
};
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
                                                          const FermionField &in, FermionField &out,int dag) {

  assert((dag==DaggerNo) ||(dag==DaggerYes));

  Compressor compressor(dag);

  auto handle = st.HaloExchangeBegin(in,compressor);

  bool local    = true;
  bool nonlocal = false;
  if ( dag == DaggerYes ) {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  } else {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  }

  st.HaloExchangeComplete(handle);

  local    = false;
  nonlocal = true;
  if ( dag == DaggerYes ) {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  } else {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  }

};

FermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion);

}}
@@ -114,12 +114,6 @@ namespace Grid {
  void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
                    const FermionField &in, FermionField &out,int dag) ;

  void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
                                    const FermionField &in, FermionField &out,int dag) ;
  void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
                                       const FermionField &in, FermionField &out,int dag) ;

  // Constructor
  WilsonFermion(GaugeField &_Umu,
                GridCartesian &Fgrid,
@@ -38,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;

// 5d lattice for DWF.
template<class Impl>
@@ -67,10 +65,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._checker_dim==1);

  // Dimension zero of the five-d is the Ls direction
@@ -99,16 +95,74 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

  // Allocate the required comms buffer
  ImportGauge(_Umu);
  alltime=0;
  commtime=0;
  jointime=0;
  dslashtime=0;
  dslash1time=0;
}

template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
                                       GridCartesian         &FiveDimGrid,
                                       GridRedBlackCartesian &FiveDimRedBlackGrid,
                                       GridCartesian         &FourDimGrid,
                                       RealD _M5,const ImplParams &p) :
  Kernels(p),
  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimGrid),
  UmuOdd (_FourDimGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimGrid)
{
  int nsimd = Simd::Nsimd();

  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
  assert(FourDimGrid._ndimension==4);

  // Dimension zero of the five-d is the Ls direction
  Ls=FiveDimGrid._fdimensions[0];
  assert(FiveDimGrid._processors[0]  ==1);
  assert(FiveDimGrid._simd_layout[0] ==nsimd);

  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
  assert(FiveDimRedBlackGrid._processors[0] ==1);
  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

  // Other dimensions must match the decomposition of the four-D fields
  for(int d=0;d<4;d++){
    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);

    assert(FourDimGrid._simd_layout[d]==1);
    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

    assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
    assert(FiveDimGrid._processors[d+1]  ==FourDimGrid._processors[d]);
    assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
  }

  {
    GaugeField HUmu(_Umu._grid);
    HUmu = _Umu*(-0.5);
    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    UmuEven=Umu;// Really want a reference.
    UmuOdd =Umu;
  }
}
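A rough way to picture the layout these assertions encode: the fifth dimension is kept on a single rank and spread entirely across SIMD lanes, so the per-SIMD-word extent in s shrinks by nsimd. A toy sketch, assuming the reduced extent is full extent / (ranks x SIMD lanes), which is what the LLs = in._grid->_rdimensions[0] line in DhopInternal below appears to rely on; the numbers are made up:

#include <cassert>
#include <cstdio>

int main(void) {
  const int Ls = 16, nsimd = 8, processors = 1;   // assumed example values
  assert(Ls % (processors * nsimd) == 0);
  int rdim = Ls / (processors * nsimd);           // s-sites per SIMD word per rank
  std::printf("reduced Ls extent = %d\n", rdim);  // prints 2
  return 0;
}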
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
  Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
  GaugeField HUmu(_Umu._grid);
  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
}
@@ -232,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
}

template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
  std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
  std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
  std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
  std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
  std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
  std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
  std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
                                        const FermionField &A,
@@ -277,280 +307,32 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
{
  if ( Impl::overlapCommsCompute () ) {
    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
  } else {
    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
  }
}

template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
                                                         DoubledGaugeField & U,
                                                         const FermionField &in, FermionField &out,int dag)
{
  // assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int LLs = in._grid->_rdimensions[0];

  int threads = GridThread::GetThreads();
  st.HaloExchange(in,compressor);
  int HT = GridThread::GetHyperThreads();
  int cores = GridThread::GetCores();
  int nwork = U._grid->oSites();

  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  st.HaloExchangeComplete(handle);
  commtime +=usecond();

  jointime -=usecond();
  jointime +=usecond();

  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
#pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        {
          int sd;
          for(sd=0;sd<Ls;sd++){
            int sU=ss;
            int sF = sd+Ls*sU;
            int sF=LLs*sU;
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
          }
        }
      }
    }
  } else {
    if( this->AsmOptDslash ) {
      // for(int i=0;i<1;i++){
      // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      // PerformanceCounter Counter(i);
      // Counter.Start();
#pragma omp parallel for
      for(int t=0;t<threads;t++){
        int hyperthread = t%HT;
        int core = t/HT;
        int sswork, swork,soff,ssoff, sU,sF;
        GridThread::GetWork(nwork,core,sswork,ssoff,cores);
        GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
        for(int ss=0;ss<sswork;ss++){
          for(int s=soff;s<soff+swork;s++){
            sU=ss+ ssoff;
            if ( LebesgueOrder::UseLebesgueOrder ) {
              sU = lo.Reorder(sU);
            }
            sF = s+Ls*sU;
            Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
          }
        }
      }
      // Counter.Stop();
      // Counter.Report();
      // }
    } else if( this->HandOptDslash ) {
      /*
#pragma omp parallel for schedule(static)
      for(int t=0;t<threads;t++){
        int hyperthread = t%HT;
        int core = t/HT;
        int sswork, swork,soff,ssoff, sU,sF;
        GridThread::GetWork(nwork,core,sswork,ssoff,cores);
        GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
        for(int ss=0;ss<sswork;ss++){
          sU=ss+ ssoff;
          for(int s=soff;s<soff+swork;s++){
            sF = s+Ls*sU;
            Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
          }
        }
      }
      */
#pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
        int sF=LLs*sU;
          int sF = s+Ls*sU;
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
          Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
        }
      }
    }
  }
  dslashtime +=usecond();
  alltime+=usecond();
}

template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
                                                            DoubledGaugeField & U,
                                                            const FermionField &in, FermionField &out,int dag)
{
  // assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();

  int calls;
  int updates;
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int threads = GridThread::GetThreads();
  int HT = GridThread::GetHyperThreads();
  int cores = GridThread::GetCores();
  int nwork = U._grid->oSites();

  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  commtime +=usecond();

  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
  bool local = true;
  bool nonlocal = false;
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        {
          int sd;
          for(sd=0;sd<Ls;sd++){
            int sU=ss;
            int sF = sd+Ls*sU;
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
          }
        }
      }
    }
  } else {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    }
  }
  dslashtime +=usecond();

  jointime -=usecond();
  st.HaloExchangeComplete(handle);
  jointime +=usecond();

  local = false;
  nonlocal = true;
  dslash1time -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        {
          int sd;
          for(sd=0;sd<Ls;sd++){
            int sU=ss;
            int sF = sd+Ls*sU;
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
          }
        }
      }
    }
  } else {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    }
  }
  dslash1time +=usecond();
  alltime+=usecond();

}
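The rewritten loops hand a whole Ls block to one kernel call, which works because the fifth dimension is innermost in the fermion layout: the 4d outer site sU owns the contiguous index range [Ls*sU, Ls*sU+Ls). A standalone illustration of that indexing with plain ints (toy sizes and names, not Grid fields):

#include <cstdio>
#include <vector>

int main(void) {
  const int Ls = 4, fourD_sites = 3;                 // assumed toy sizes
  std::vector<int> fermion(Ls * fourD_sites);

  // Fifth dimension innermost: sF = s + Ls*sU, so one 4d site sU owns a
  // contiguous block and a kernel can be handed (sF0 = Ls*sU, Ls) directly.
  for (int sU = 0; sU < fourD_sites; sU++)
    for (int s = 0; s < Ls; s++)
      fermion[s + Ls * sU] = 100 * sU + s;

  int sU = 1, sF0 = Ls * sU;                         // like sF = LLs*sU above
  for (int s = 0; s < Ls; s++)
    std::printf("sU=%d s=%d -> sF=%d value=%d\n", sU, s, sF0 + s, fermion[sF0 + s]);
  return 0;
}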
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
@@ -593,6 +375,9 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
}

FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;

}}

@@ -1,3 +1,4 @@

/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

@@ -48,8 +49,6 @@ namespace Grid {
class WilsonFermion5DStatic {
public:
  // S-direction is INNERMOST and takes no part in the parity.
  static int AsmOptDslash;  // these are a temporary hack
  static int HandOptDslash; // these are a temporary hack
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  const int npoint = 8;
@@ -61,11 +60,7 @@ namespace Grid {
public:
  INHERIT_IMPL_TYPES(Impl);
  typedef WilsonKernels<Impl> Kernels;
  double alltime;
  double jointime;
  double commtime;
  double dslashtime;
  double dslash1time;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -86,6 +81,7 @@ namespace Grid {
  virtual void MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
  virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

  // These can be overridden by fancy 5d chiral action
  virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@@ -120,19 +116,6 @@ namespace Grid {
                    FermionField &out,
                    int dag);

  void DhopInternalCommsThenCompute(StencilImpl & st,
                                    LebesgueOrder &lo,
                                    DoubledGaugeField &U,
                                    const FermionField &in,
                                    FermionField &out,
                                    int dag);
  void DhopInternalCommsOverlapCompute(StencilImpl & st,
                                       LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
                                       FermionField &out,
                                       int dag);

  // Constructors
  WilsonFermion5D(GaugeField &_Umu,
                  GridCartesian &FiveDimGrid,
@@ -141,14 +124,21 @@ namespace Grid {
                  GridRedBlackCartesian &FourDimRedBlackGrid,
                  double _M5,const ImplParams &p= ImplParams());

  // Constructors
  WilsonFermion5D(int simd,
                  GaugeField &_Umu,
                  GridCartesian &FiveDimGrid,
                  GridRedBlackCartesian &FiveDimRedBlackGrid,
                  GridCartesian &FourDimGrid,
                  double _M5,const ImplParams &p= ImplParams());

  // DoubleStore
  void ImportGauge(const GaugeField &_Umu);

  void Report(void);
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
protected:
public:

  // Add these to the support from Wilson
  GridBase *_FourDimGrid;
@@ -31,440 +31,410 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {

int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;

template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};

// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
                                           std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                           int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
  if ( AsmOpt ) {

    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);

  } else {

    for(int site=0;site<Ns;site++) {
      for(int s=0;s<Ls;s++) {
        if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
        else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
        sF++;
      }
      sU++;
    }

  }
}

template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
                                              std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                              int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
                                              int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
  // No asm implementation yet.
  // if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
  // else
  for(int site=0;site<Ns;site++) {
    for(int s=0;s<Ls;s++) {
      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
      sF++;
    }
    sU++;
  }
}
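DiracOptDhopSite is now a thin dispatcher: two static flags pick an assembly, hand-unrolled, or generic per-site routine, and the Ls/Ns arguments let one call sweep a block of sites. A compact sketch of the same flag-plus-block-loop pattern, with made-up routine names standing in for the asm/hand/generic kernels:

#include <cstdio>

struct KernelsStatic { static int AsmOpt; static int HandOpt; };
int KernelsStatic::AsmOpt  = 0;
int KernelsStatic::HandOpt = 1;

static void asm_block(int sF)    { std::printf("asm block from %d\n", sF); }
static void hand_site(int sF)    { std::printf("hand site %d\n", sF); }
static void generic_site(int sF) { std::printf("generic site %d\n", sF); }

// Dispatch over Ns outer sites, each carrying Ls inner (fifth-dimension) sites.
void dhop_site(int sF, int sU, int Ls, int Ns) {
  if (KernelsStatic::AsmOpt) {
    asm_block(sF);                       // asm variant handles the whole block
  } else {
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
        if (KernelsStatic::HandOpt) hand_site(sF); else generic_site(sF);
        sF++;
      }
      sU++;                              // next 4d site, mirrors the original loop
    }
  }
}

int main(void) { dhop_site(0, 0, 2, 2); return 0; }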
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////

template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
                                                     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                                     int sF,int sU,const FermionField &in, FermionField &out)
{
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

  int num = 0;

  result=zero;

  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

  if (local && SE->_is_local ) {
  if (SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
    chi=buf[SE->_offset];
  spReconXp(result,Uchi);
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
    accumReconXp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
  accumReconYp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
  accumReconZp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
  accumReconTp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
  accumReconXm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
  accumReconYm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
  accumReconZm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
  accumReconTm(result,Uchi);
    num++;
  }

  if ( local ) {
  vstream(out._odata[sF],result);
    vstream(out._odata[sF],result*(-0.5));
  } else if ( num ) {
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
  }
};
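The single-pass generic kernel above hinges on one pointer choice per direction: chi_p either points at the freshly projected local half-spinor or straight into the comms buffer the halo exchange filled, and everything downstream reads only through chi_p. A stripped-down sketch of that idiom, with doubles standing in for half-spinors and invented function names:

#include <cstdio>

struct Entry { bool is_local; int offset; };

// Either "project" from the local field or reuse the value the halo exchange
// already deposited in buf; downstream code only ever reads through chi_p.
double consume(const Entry &SE, const double *field, const double *buf) {
  double chi;
  const double *chi_p;
  if (SE.is_local) {
    chi   = 0.5 * field[SE.offset];   // stand-in for spProj of a local site
    chi_p = &chi;
  } else {
    chi_p = &buf[SE.offset];          // halo data arrives already projected
  }
  return 2.0 * (*chi_p);              // stand-in for multLink + accumRecon
}

int main(void) {
  double field[2] = {1.0, 3.0}, buf[2] = {7.0, 9.0};
  Entry local{true, 1}, remote{false, 0};
  std::printf("%f %f\n", consume(local, field, buf), consume(remote, field, buf));
  return 0;
}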
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
                                                  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                                  int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
                                                  int sF,int sU,const FermionField &in, FermionField &out)
{
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

  int num = 0;

  result=zero;

  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
    chi=buf[SE->_offset];
  spReconXp(result,Uchi);
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
    accumReconXp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
  accumReconYp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
  accumReconZp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
  accumReconTp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
  accumReconXm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
  accumReconYm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
  accumReconZm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
  accumReconTm(result,Uchi);
    num++;
  }

  if ( local ) {
  vstream(out._odata[sF],result);
    vstream(out._odata[sF],result*(-0.5));
  } else if ( num ) {
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
  }
};
template<class Impl>
|
template<class Impl>
|
||||||
@ -593,19 +563,13 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
|||||||
spReconTm(result,Uchi);
|
spReconTm(result,Uchi);
|
||||||
}
|
}
|
||||||
|
|
||||||
vstream(out._odata[sF],result*(-0.5));
|
vstream(out._odata[sF],result);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
|
|
||||||
{
|
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonKernels);
|
FermOpTemplateInstantiate(WilsonKernels);
|
||||||
|
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
@ -38,37 +38,56 @@ namespace Grid {
|
|||||||
// Helper routines that implement Wilson stencil for a single site.
|
// Helper routines that implement Wilson stencil for a single site.
|
||||||
// Common to both the WilsonFermion and WilsonFermion5D
|
// Common to both the WilsonFermion and WilsonFermion5D
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
class WilsonKernelsStatic {
|
||||||
|
public:
|
||||||
|
// S-direction is INNERMOST and takes no part in the parity.
|
||||||
|
static int AsmOpt; // these are a temporary hack
|
||||||
|
static int HandOpt; // these are a temporary hack
|
||||||
|
};
|
||||||
|
|
||||||
template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
|
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef FermionOperator<Impl> Base;
|
typedef FermionOperator<Impl> Base;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
|
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Specialised variants
|
||||||
|
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
|
||||||
|
|
||||||
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
public:
|
||||||
|
|
||||||
WilsonKernels(const ImplParams &p= ImplParams());
|
WilsonKernels(const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
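The new WilsonKernelsStatic base exposes AsmOpt and HandOpt as static flags (marked above as a temporary hack) so a run can steer the Dhop site call towards the generic, assembler or hand-unrolled variants declared in this class. The actual wiring of that choice lives in the kernel sources and is not part of this hunk; the snippet below is only a toy sketch of the shape such a dispatch takes, and a reminder that static data members still need an out-of-class definition somewhere:

#include <cstdio>

// Illustrative only: stand-in names, not Grid's real dispatch.
struct KernelsStatic {
  static int AsmOpt;
  static int HandOpt;
};
int KernelsStatic::AsmOpt  = 0;   // definitions required in some .cc file
int KernelsStatic::HandOpt = 0;

static void genericSite() { std::puts("generic kernel"); }
static void asmSite()     { std::puts("assembler kernel"); }
static void handSite()    { std::puts("hand-unrolled kernel"); }

static void dhopSite() {
  if      (KernelsStatic::AsmOpt)  asmSite();
  else if (KernelsStatic::HandOpt) handSite();
  else                             genericSite();
}

int main() {
  dhopSite();                     // generic by default
  KernelsStatic::HandOpt = 1;
  dhopSite();                     // hand-unrolled variant selected at runtime
  return 0;
}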
|
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
|
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
|
||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
@ -26,320 +28,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
#if defined(AVX512) || defined (IMCI)
|
|
||||||
|
|
||||||
#include <simd/Avx512Asm.h>
|
|
||||||
|
|
||||||
#undef VLOAD
|
|
||||||
#undef VSTORE
|
|
||||||
#undef VMUL
|
|
||||||
#undef VMADD
|
|
||||||
#undef ZEND
|
|
||||||
#undef ZLOAD
|
|
||||||
#undef ZMUL
|
|
||||||
#undef ZMADD
|
|
||||||
#undef VZERO
|
|
||||||
#undef VTIMESI
|
|
||||||
#undef VTIMESMINUSI
|
|
||||||
|
|
||||||
#define VZERO(A) VZEROf(A)
|
|
||||||
#define VMOV(A,B) VMOVf(A,B)
|
|
||||||
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
|
|
||||||
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
|
|
||||||
|
|
||||||
#define VADD(A,B,C) VADDf(A,B,C)
|
|
||||||
#define VSUB(A,B,C) VSUBf(A,B,C)
|
|
||||||
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
|
|
||||||
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
|
|
||||||
|
|
||||||
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
|
|
||||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
|
|
||||||
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
|
|
||||||
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
|
|
||||||
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
|
|
||||||
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
|
|
||||||
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
|
|
||||||
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
|
|
||||||
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
|
|
||||||
|
|
||||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
|
|
||||||
|
|
||||||
#define VPERM0(A,B) VPERM0f(A,B)
|
|
||||||
#define VPERM1(A,B) VPERM1f(A,B)
|
|
||||||
#define VPERM2(A,B) VPERM2f(A,B)
|
|
||||||
#define VPERM3(A,B) VPERM3f(A,B)
|
|
||||||
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
|
|
||||||
|
|
||||||
#define ZEND1(A,B,C) ZEND1f(A,B,C)
|
|
||||||
#define ZEND2(A,B,C) ZEND2f(A,B,C)
|
|
||||||
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
|
|
||||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
|
||||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
|
||||||
|
|
||||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
|
||||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
|
||||||
|
|
||||||
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
|
|
||||||
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
|
|
||||||
|
|
||||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
|
||||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Default to no assembler implementation
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
uint64_t now;
|
assert(0);
|
||||||
uint64_t first ;
|
}
|
||||||
int offset,local,perm, ptype;
|
|
||||||
const SiteHalfSpinor *pbuf = & buf[0];
|
#if defined(AVX512)
|
||||||
const SiteSpinor *plocal = & in._odata[0];
|
|
||||||
void *pf;
|
|
||||||
int osites = in._grid->oSites();
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
///////////////////////////////////////////////////////////
|
||||||
|
// If we are AVX512 specialise the single precision routine
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
|
||||||
//#define STAMP(i) timers[i] = __rdtsc() ;
|
#include <simd/Intel512wilson.h>
|
||||||
#define STAMP(i) //timers[i] = __rdtsc() ;
|
#include <simd/Intel512single.h>
|
||||||
|
|
||||||
MASK_REGS;
|
static Vector<vComplexF> signs;
|
||||||
|
|
||||||
first = __rdtsc();
|
int setupSigns(void ){
|
||||||
|
Vector<vComplexF> bother(2);
|
||||||
|
signs = bother;
|
||||||
|
vrsign(signs[0]);
|
||||||
|
visign(signs[1]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
static int signInit = setupSigns();
|
||||||
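The AVX512 branch keeps a two-entry static Vector<vComplexF> of sign masks, filled once by setupSigns() via the namespace-scope signInit initialiser, so that the +i and -i multiplies in the spin projectors can be done as a shuffle plus a sign-bit XOR (see the "times i => shuffle and xor the real part sign bit" comments in WilsonKernelsAsmBody.h below). A toy, self-contained sketch of that one-time initialisation idiom, with std::vector<float> standing in for Grid's Vector<vComplexF>:

#include <cstdio>
#include <vector>

// Namespace-scope static filled once, before main(), by its initialiser.
// (A function-local static would sidestep initialisation-order questions;
// this sketch just mirrors the shape of the code in the diff.)
static std::vector<float> signs;

static int setupSigns() {
  signs = {+1.0f, -1.0f};          // stand-ins for the vrsign/visign masks
  return 1;
}
static int signInit = setupSigns();

int main() {
  std::printf("signs[0]=%g signs[1]=%g (signInit=%d)\n",
              signs[0], signs[1], signInit);
  return 0;
}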
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
|
|
||||||
#if 0
|
template<>
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
|
#undef VMOVIDUP
|
||||||
|
#undef VMOVRDUP
|
||||||
|
#undef MAYBEPERM
|
||||||
|
#undef MULT_2SPIN
|
||||||
|
#define MAYBEPERM(A,B)
|
||||||
|
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||||
|
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||||
|
template<>
|
||||||
|
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
LOAD64(%r9,pf);
|
|
||||||
__asm__(
|
|
||||||
VPREFETCH(0,%r9)
|
|
||||||
VPREFETCH(1,%r9)
|
|
||||||
VPREFETCH(2,%r9)
|
|
||||||
VPREFETCH(3,%r9)
|
|
||||||
VPREFETCH(4,%r9)
|
|
||||||
VPREFETCH(5,%r9)
|
|
||||||
VPREFETCH(6,%r9)
|
|
||||||
VPREFETCH(7,%r9)
|
|
||||||
VPREFETCH(8,%r9)
|
|
||||||
VPREFETCH(9,%r9)
|
|
||||||
VPREFETCH(10,%r9)
|
|
||||||
VPREFETCH(11,%r9) );
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Xm
|
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
offset = SE->_offset;
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
local = SE->_is_local;
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXM(Xm,pf);
|
|
||||||
}
|
|
||||||
XM_RECON;
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYM(Ym,pf);
|
|
||||||
}
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZM(Zm,pf);
|
|
||||||
}
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTM(Tm,pf);
|
|
||||||
}
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTP(Tp,pf);
|
|
||||||
}
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZP(Zp,pf);
|
|
||||||
}
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
|
|
||||||
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYP(Yp,pf);
|
|
||||||
}
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
perm = SE->_permute;
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
|
|
||||||
// PREFETCH_R(A);
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXP(Xp,pf);
|
|
||||||
}
|
|
||||||
XP_RECON_ACCUM;
|
|
||||||
|
|
||||||
debug:
|
|
||||||
SAVE_RESULT(&out._odata[ss]);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
template class WilsonKernels<WilsonImplF>;
|
|
||||||
template class WilsonKernels<WilsonImplD>;
|
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
}}
|
}}
|
||||||
#endif
|
|
||||||
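The file closes with explicit instantiations (template void WilsonKernels<...>::DiracOptAsmDhopSite(...)) so that the assembler bodies are compiled once, in this translation unit, for each implementation type, while other files only need the declaration. A toy, self-contained example of the explicit-instantiation idiom with stand-in names:

#include <cstdio>

// The definition lives here; naming the specialisations below forces their
// object code to be emitted in this translation unit so other files can link.
template <class Impl>
struct Kernels {
  void dhopSite(int s);
};

template <class Impl>
void Kernels<Impl>::dhopSite(int s) { std::printf("site %d\n", s); }

// Explicit instantiation of a single member for two implementation types.
template void Kernels<float>::dhopSite(int);
template void Kernels<double>::dhopSite(int);

int main() {
  Kernels<double> k;
  k.dhopSite(7);
  return 0;
}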
|
164
lib/qcd/action/fermion/WilsonKernelsAsmBody.h
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
uint64_t basea, baseb;
|
||||||
|
uint64_t basex;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
basex = basea;
|
||||||
|
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
PREFETCH_CHIMU(basex);
|
||||||
|
SAVE_RESULT(&out._odata[ss]);
|
||||||
|
|
||||||
|
|
||||||
|
ss++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
|
}
|
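WilsonKernelsAsmBody.h above is meant to be #included as the body of each assembler specialisation. It sweeps Ns four-dimensional sites of Ls fifth-dimension slices each, stepping the fermion index ss on every slice and the gauge index sU once per 4d site, and it alternates the stencil lookups between basea and baseb so the base address and prefetch for the next direction can be issued while the current direction is being multiplied. A toy, self-contained sketch of just the index bookkeeping, with stand-in names:

#include <cstdio>

// Mirrors the loop shape of WilsonKernelsAsmBody.h: Ns outer sites, Ls inner
// slices, fermion index per slice, gauge index per site. Toy code only.
static void dhopSiteBlock(int ss, int sU, int Ls, int Ns) {
  for (int site = 0; site < Ns; site++) {
    for (int s = 0; s < Ls; s++) {
      std::printf("fermion site %d uses gauge site %d\n", ss, sU);
      ss++;                        // 5d (fermion) index: one step per slice
    }
    sU++;                          // 4d (gauge) index: one step per site
  }
}

int main() {
  dhopSiteBlock(/*ss=*/0, /*sU=*/0, /*Ls=*/4, /*Ns=*/2);
  return 0;
}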
@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
Chi_11 = ref()(1)(1);\
|
Chi_11 = ref()(1)(1);\
|
||||||
Chi_12 = ref()(1)(2);
|
Chi_12 = ref()(1)(2);
|
||||||
|
|
||||||
|
// To splat or not to splat depends on the implementation
|
||||||
#define MULT_2SPIN(A)\
|
#define MULT_2SPIN(A)\
|
||||||
auto & ref(U._odata[sU](A)); \
|
auto & ref(U._odata[sU](A)); \
|
||||||
U_00 = ref()(0,0);\
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
U_10 = ref()(1,0);\
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
U_20 = ref()(2,0);\
|
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||||
U_01 = ref()(0,1);\
|
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||||
U_11 = ref()(1,1); \
|
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||||
U_21 = ref()(2,1);\
|
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||||
UChi_00 = U_00*Chi_00;\
|
UChi_00 = U_00*Chi_00;\
|
||||||
UChi_10 = U_00*Chi_10;\
|
UChi_10 = U_00*Chi_10;\
|
||||||
UChi_01 = U_10*Chi_00;\
|
UChi_01 = U_10*Chi_00;\
|
||||||
@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi_11+= U_11*Chi_11;\
|
UChi_11+= U_11*Chi_11;\
|
||||||
UChi_02+= U_21*Chi_01;\
|
UChi_02+= U_21*Chi_01;\
|
||||||
UChi_12+= U_21*Chi_11;\
|
UChi_12+= U_21*Chi_11;\
|
||||||
U_00 = ref()(0,2);\
|
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||||
U_10 = ref()(1,2);\
|
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||||
U_20 = ref()(2,2);\
|
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||||
UChi_00+= U_00*Chi_02;\
|
UChi_00+= U_00*Chi_02;\
|
||||||
UChi_10+= U_00*Chi_12;\
|
UChi_10+= U_00*Chi_12;\
|
||||||
UChi_01+= U_10*Chi_02;\
|
UChi_01+= U_10*Chi_02;\
|
||||||
@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi_02+= U_20*Chi_02;\
|
UChi_02+= U_20*Chi_02;\
|
||||||
UChi_12+= U_20*Chi_12;
|
UChi_12+= U_20*Chi_12;
|
||||||
|
|
||||||
|
|
||||||
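In this hunk the MULT_2SPIN macro stops assigning gauge-link elements directly and instead routes them through Impl::loadLinkElement, so that, per the "To splat or not to splat" comment, an implementation can either copy an already vectorised link element or broadcast ("splat") a scalar one across the SIMD lanes, which is presumably what the DomainWallRedBlack5d layout wants. A toy, self-contained illustration of that indirection, with std::array standing in for a SIMD register and the types being stand-ins rather than Grid's:

#include <array>
#include <cstdio>

using SimdVec = std::array<float, 4>;

// One implementation copies a vectorised link element as-is ...
struct VectorLinkImpl {
  static void loadLinkElement(SimdVec &reg, const SimdVec &src) { reg = src; }
};

// ... another broadcasts a scalar link value across all lanes.
struct ScalarLinkImpl {
  static void loadLinkElement(SimdVec &reg, float src) { reg.fill(src); }
};

int main() {
  SimdVec u{};
  VectorLinkImpl::loadLinkElement(u, SimdVec{1.f, 2.f, 3.f, 4.f}); // copy
  ScalarLinkImpl::loadLinkElement(u, 0.5f);                        // splat
  std::printf("u[0]=%g u[3]=%g\n", u[0], u[3]);
  return 0;
}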
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_00,Chi_00);\
|
permute##dir(Chi_00,Chi_00);\
|
||||||
permute##dir(Chi_01,Chi_01);\
|
permute##dir(Chi_01,Chi_01);\
|
||||||
@ -309,546 +311,10 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
|
||||||
{
|
|
||||||
// std::cout << "Hand op Dhop "<<std::endl;
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
|
||||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
|
||||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
|
||||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
|
||||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
|
||||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
|
||||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
|
||||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
|
||||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd Chi_01;
|
|
||||||
REGISTER Simd Chi_02;
|
|
||||||
|
|
||||||
REGISTER Simd Chi_10;
|
|
||||||
REGISTER Simd Chi_11;
|
|
||||||
REGISTER Simd Chi_12; // 14 left
|
|
||||||
|
|
||||||
REGISTER Simd UChi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd UChi_01;
|
|
||||||
REGISTER Simd UChi_02;
|
|
||||||
|
|
||||||
REGISTER Simd UChi_10;
|
|
||||||
REGISTER Simd UChi_11;
|
|
||||||
REGISTER Simd UChi_12; // 8 left
|
|
||||||
|
|
||||||
REGISTER Simd U_00; // two rows of U matrix
|
|
||||||
REGISTER Simd U_10;
|
|
||||||
REGISTER Simd U_20;
|
|
||||||
REGISTER Simd U_01;
|
|
||||||
REGISTER Simd U_11;
|
|
||||||
REGISTER Simd U_21; // 2 reg left.
|
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
|
||||||
#define Chimu_01 Chi_01
|
|
||||||
#define Chimu_02 Chi_02
|
|
||||||
#define Chimu_10 Chi_10
|
|
||||||
#define Chimu_11 Chi_11
|
|
||||||
#define Chimu_12 Chi_12
|
|
||||||
#define Chimu_20 UChi_00
|
|
||||||
#define Chimu_21 UChi_01
|
|
||||||
#define Chimu_22 UChi_02
|
|
||||||
#define Chimu_30 UChi_10
|
|
||||||
#define Chimu_31 UChi_11
|
|
||||||
#define Chimu_32 UChi_12
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
int offset, ptype;
|
|
||||||
int num = 0;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xp);
|
|
||||||
XP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Yp
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Yp);
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zp);
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tp);
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xm);
|
|
||||||
XM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Ym);
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zm);
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tm);
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
SiteSpinor & ref (out._odata[ss]);
|
|
||||||
if ( Local ) {
|
|
||||||
vstream(ref()(0)(0),result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
} else if ( num ) {
|
|
||||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
|
||||||
{
|
|
||||||
// std::cout << "Hand op Dhop "<<std::endl;
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
|
||||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
|
||||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
|
||||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
|
||||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
|
||||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
|
||||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
|
||||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
|
||||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd Chi_01;
|
|
||||||
REGISTER Simd Chi_02;
|
|
||||||
|
|
||||||
REGISTER Simd Chi_10;
|
|
||||||
REGISTER Simd Chi_11;
|
|
||||||
REGISTER Simd Chi_12; // 14 left
|
|
||||||
|
|
||||||
REGISTER Simd UChi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd UChi_01;
|
|
||||||
REGISTER Simd UChi_02;
|
|
||||||
|
|
||||||
REGISTER Simd UChi_10;
|
|
||||||
REGISTER Simd UChi_11;
|
|
||||||
REGISTER Simd UChi_12; // 8 left
|
|
||||||
|
|
||||||
REGISTER Simd U_00; // two rows of U matrix
|
|
||||||
REGISTER Simd U_10;
|
|
||||||
REGISTER Simd U_20;
|
|
||||||
REGISTER Simd U_01;
|
|
||||||
REGISTER Simd U_11;
|
|
||||||
REGISTER Simd U_21; // 2 reg left.
|
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
|
||||||
#define Chimu_01 Chi_01
|
|
||||||
#define Chimu_02 Chi_02
|
|
||||||
#define Chimu_10 Chi_10
|
|
||||||
#define Chimu_11 Chi_11
|
|
||||||
#define Chimu_12 Chi_12
|
|
||||||
#define Chimu_20 UChi_00
|
|
||||||
#define Chimu_21 UChi_01
|
|
||||||
#define Chimu_22 UChi_02
|
|
||||||
#define Chimu_30 UChi_10
|
|
||||||
#define Chimu_31 UChi_11
|
|
||||||
#define Chimu_32 UChi_12
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
int offset, ptype;
|
|
||||||
int num = 0;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xp);
|
|
||||||
XM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Yp
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Yp);
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zp);
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tp);
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xm);
|
|
||||||
XP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Ym);
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zm);
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tm);
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
SiteSpinor & ref (out._odata[ss]);
|
|
||||||
if ( Local ) {
|
|
||||||
vstream(ref()(0)(0),result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
} else if ( num ) {
|
|
||||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
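The two functions deleted above are the bool Local/Nonlocal flavour of the hand-unrolled kernels: with Local set they overwrote the site with the contributions from locally available neighbours, with Nonlocal set they accumulated the off-node contributions on top, and the int return reported whether anything was written, presumably so a driver could overlap the halo exchange with the local pass. A toy, self-contained sketch of that two-pass call pattern, with stand-in arguments rather than Grid's stencil and field types:

#include <cstdio>

// Pass 1 (Local): overwrite with the on-node sum. Pass 2 (Nonlocal): add the
// halo sum once it has arrived. The body is a stand-in; the real kernels test
// SE->_is_local per direction and apply the -0.5 factor seen in the deleted
// code.
static int handDhopSite(bool Local, bool Nonlocal, double localSum,
                        double haloSum, double *out) {
  int wrote = 0;
  if (Local)    { *out  = -0.5 * localSum; wrote = 1; }
  if (Nonlocal) { *out += -0.5 * haloSum;  wrote = 1; }
  return wrote;
}

int main() {
  double site = 0.0;
  handDhopSite(true,  false, 6.0, 0.0, &site);  // local pass while comms fly
  // ... wait for the halo exchange to complete ...
  handDhopSite(false, true,  0.0, 2.0, &site);  // fold in the off-node terms
  std::printf("site = %g\n", site);             // -0.5*6 - 0.5*2 = -4
  return 0;
}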
|
|
||||||
/*
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
@ -1073,89 +539,346 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
|||||||
|
|
||||||
{
|
{
|
||||||
SiteSpinor & ref (out._odata[ss]);
|
SiteSpinor & ref (out._odata[ss]);
|
||||||
vstream(ref()(0)(0),result_00*(-0.5));
|
vstream(ref()(0)(0),result_00);
|
||||||
vstream(ref()(0)(1),result_01*(-0.5));
|
vstream(ref()(0)(1),result_01);
|
||||||
vstream(ref()(0)(2),result_02*(-0.5));
|
vstream(ref()(0)(2),result_02);
|
||||||
vstream(ref()(1)(0),result_10*(-0.5));
|
vstream(ref()(1)(0),result_10);
|
||||||
vstream(ref()(1)(1),result_11*(-0.5));
|
vstream(ref()(1)(1),result_11);
|
||||||
vstream(ref()(1)(2),result_12*(-0.5));
|
vstream(ref()(1)(2),result_12);
|
||||||
vstream(ref()(2)(0),result_20*(-0.5));
|
vstream(ref()(2)(0),result_20);
|
||||||
vstream(ref()(2)(1),result_21*(-0.5));
|
vstream(ref()(2)(1),result_21);
|
||||||
vstream(ref()(2)(2),result_22*(-0.5));
|
vstream(ref()(2)(2),result_22);
|
||||||
vstream(ref()(3)(0),result_30*(-0.5));
|
vstream(ref()(3)(0),result_30);
|
||||||
vstream(ref()(3)(1),result_31*(-0.5));
|
vstream(ref()(3)(1),result_31);
|
||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
vstream(ref()(3)(2),result_32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
// std::cout << "Hand op Dhop "<<std::endl;
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
REGISTER Simd result_00; // 12 regs on knc
|
||||||
|
REGISTER Simd result_01;
|
||||||
|
REGISTER Simd result_02;
|
||||||
|
|
||||||
|
REGISTER Simd result_10;
|
||||||
|
REGISTER Simd result_11;
|
||||||
|
REGISTER Simd result_12;
|
||||||
|
|
||||||
|
REGISTER Simd result_20;
|
||||||
|
REGISTER Simd result_21;
|
||||||
|
REGISTER Simd result_22;
|
||||||
|
|
||||||
|
REGISTER Simd result_30;
|
||||||
|
REGISTER Simd result_31;
|
||||||
|
REGISTER Simd result_32; // 20 left
|
||||||
|
|
||||||
|
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||||
|
REGISTER Simd Chi_01;
|
||||||
|
REGISTER Simd Chi_02;
|
||||||
|
|
||||||
|
REGISTER Simd Chi_10;
|
||||||
|
REGISTER Simd Chi_11;
|
||||||
|
REGISTER Simd Chi_12; // 14 left
|
||||||
|
|
||||||
|
REGISTER Simd UChi_00; // two spinor; 6 regs
|
||||||
|
REGISTER Simd UChi_01;
|
||||||
|
REGISTER Simd UChi_02;
|
||||||
|
|
||||||
|
REGISTER Simd UChi_10;
|
||||||
|
REGISTER Simd UChi_11;
|
||||||
|
REGISTER Simd UChi_12; // 8 left
|
||||||
|
|
||||||
|
REGISTER Simd U_00; // two rows of U matrix
|
||||||
|
REGISTER Simd U_10;
|
||||||
|
REGISTER Simd U_20;
|
||||||
|
REGISTER Simd U_01;
|
||||||
|
REGISTER Simd U_11;
|
||||||
|
REGISTER Simd U_21; // 2 reg left.
|
||||||
|
|
||||||
|
#define Chimu_00 Chi_00
|
||||||
|
#define Chimu_01 Chi_01
|
||||||
|
#define Chimu_02 Chi_02
|
||||||
|
#define Chimu_10 Chi_10
|
||||||
|
#define Chimu_11 Chi_11
|
||||||
|
#define Chimu_12 Chi_12
|
||||||
|
#define Chimu_20 UChi_00
|
||||||
|
#define Chimu_21 UChi_01
|
||||||
|
#define Chimu_22 UChi_02
|
||||||
|
#define Chimu_30 UChi_10
|
||||||
|
#define Chimu_31 UChi_11
|
||||||
|
#define Chimu_32 UChi_12
|
||||||
|
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
// Xp
|
||||||
|
SE=st.GetEntry(ptype,Xp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
XP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Xp);
|
||||||
|
}
|
||||||
|
XP_RECON;
|
||||||
|
|
||||||
|
// Yp
|
||||||
|
SE=st.GetEntry(ptype,Yp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
YP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Yp);
|
||||||
|
}
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
|
||||||
|
// Zp
|
||||||
|
SE=st.GetEntry(ptype,Zp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
ZP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Zp);
|
||||||
|
}
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Tp
|
||||||
|
SE=st.GetEntry(ptype,Tp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
TP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Tp);
|
||||||
|
}
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Xm
|
||||||
|
SE=st.GetEntry(ptype,Xm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
XM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Xm);
|
||||||
|
}
|
||||||
|
XM_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Ym
|
||||||
|
SE=st.GetEntry(ptype,Ym,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
YM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Ym);
|
||||||
|
}
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Zm
|
||||||
|
SE=st.GetEntry(ptype,Zm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
ZM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Zm);
|
||||||
|
}
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Tm
|
||||||
|
SE=st.GetEntry(ptype,Tm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
TM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Tm);
|
||||||
|
}
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
{
|
||||||
|
SiteSpinor & ref (out._odata[ss]);
|
||||||
|
vstream(ref()(0)(0),result_00);
|
||||||
|
vstream(ref()(0)(1),result_01);
|
||||||
|
vstream(ref()(0)(2),result_02);
|
||||||
|
vstream(ref()(1)(0),result_10);
|
||||||
|
vstream(ref()(1)(1),result_11);
|
||||||
|
vstream(ref()(1)(2),result_12);
|
||||||
|
vstream(ref()(2)(0),result_20);
|
||||||
|
vstream(ref()(2)(1),result_21);
|
||||||
|
vstream(ref()(2)(2),result_22);
|
||||||
|
vstream(ref()(3)(0),result_30);
|
||||||
|
vstream(ref()(3)(1),result_31);
|
||||||
|
vstream(ref()(3)(2),result_32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// Specialise Gparity to simple implementation
|
// Specialise Gparity to simple implementation
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
assert(0);
|
||||||
//check consistency of return types between these functions and the ones in WilsonKernels.cc
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
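Just above, the G-parity hand kernels are reduced to assert(0) stubs where the old code forwarded to the generic DiracOptDhopSite: as the "Need Nc=3 though" comment notes, the hand-unrolled code is written for the plain Nc=3 Wilson layout, so selecting the hand path for a G-parity implementation would now abort unless the dispatch avoids it. A toy, hypothetical sketch of one way a caller could keep unsupported implementations off that path at compile time (the trait name and types are invented for illustration, not Grid's):

#include <cstdio>

struct WilsonLikeImpl  { static constexpr bool hasHandKernel = true;  };
struct GparityLikeImpl { static constexpr bool hasHandKernel = false; };

// Route only implementations that advertise a hand kernel to the fast path.
template <class Impl>
void dhopSite() {
  if (Impl::hasHandKernel) std::puts("hand-unrolled path");
  else                     std::puts("generic path");
}

int main() {
  dhopSite<WilsonLikeImpl>();
  dhopSite<GparityLikeImpl>();
  return 0;
}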
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
|
// Need Nc=3 though //
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
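The G-parity specialisations above now return void and simply assert, because no hand-unrolled kernel exists yet for the two-flavour G-parity layout; callers have to use the generic DiracOptDhopSite path instead. A minimal standalone sketch of this specialise-and-assert dispatch pattern (illustrative types and names only, not Grid's real kernel signatures):

#include <cassert>
#include <iostream>

// Sketch only: mirrors the "specialise and assert" pattern above with made-up types.
template <class Impl>
struct Kernels {
  static void DhopSite()     { std::cout << "generic kernel\n"; }  // always available
  static void HandDhopSite() { DhopSite(); }                       // hand-unrolled entry point
};

struct GparityImpl {};

// No hand-unrolled kernel for this implementation: abort if it is called directly.
template <>
void Kernels<GparityImpl>::HandDhopSite() { assert(0); }

int main() {
  Kernels<int>::HandDhopSite();            // falls through to the generic kernel
  // Kernels<GparityImpl>::HandDhopSite(); // would assert
}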
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 #define INHERIT_GIMPL_TYPES(GImpl) \
   typedef typename GImpl::Simd Simd;\
   typedef typename GImpl::GaugeLinkField GaugeLinkField;\
-  typedef typename GImpl::GaugeField GaugeField;
+  typedef typename GImpl::GaugeField GaugeField;\
+  typedef typename GImpl::SiteGaugeField SiteGaugeField;\
+  typedef typename GImpl::SiteGaugeLink SiteGaugeLink;

 //
 template<class S,int Nrepresentation=Nc>
@@ -92,13 +92,13 @@ public:

     // Create integrator, including the smearing policy
     // Smearing policy
-    std::cout << GridLogMessage << " Creating the Stout class\n";
+    std::cout << GridLogDebug << " Creating the Stout class\n";
-    double rho = 0.1;  // smearing parameter
+    double rho = 0.1;  // smearing parameter, now hardcoded
     int Nsmear = 1;    // number of smearing levels
     Smear_Stout<Gimpl> Stout(rho);
-    std::cout << GridLogMessage << " Creating the SmearedConfiguration class\n";
+    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
     SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
-    std::cout << GridLogMessage << " done\n";
+    std::cout << GridLogDebug << " done\n";
     //////////////
     typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> > IntegratorType;// change here to change the algorithm
     IntegratorParameters MDpar(20);
@@ -116,27 +116,27 @@ public:

     if ( StartType == HotStart ) {
       // Hot start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       sRNG.SeedFixedIntegers(SerSeed);
       pRNG.SeedFixedIntegers(ParSeed);
       SU3::HotConfiguration(pRNG, U);
     } else if ( StartType == ColdStart ) {
       // Cold start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       sRNG.SeedFixedIntegers(SerSeed);
       pRNG.SeedFixedIntegers(ParSeed);
       SU3::ColdConfiguration(pRNG, U);
     } else if ( StartType == TepidStart ) {
       // Tepid start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       sRNG.SeedFixedIntegers(SerSeed);
       pRNG.SeedFixedIntegers(ParSeed);
       SU3::TepidConfiguration(pRNG, U);
     } else if ( StartType == CheckpointStart ) {
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       // CheckpointRestart
       Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
@@ -61,6 +61,31 @@ namespace Grid {
     " "
   };

+  SpinMatrix makeGammaProd(const unsigned int i)
+  {
+    SpinMatrix g;
+
+    g = 1.;
+    if (i & 0x1)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaX);
+    }
+    if (i & 0x2)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaY);
+    }
+    if (i & 0x4)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaZ);
+    }
+    if (i & 0x8)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaT);
+    }
+
+    return g;
+  }

   // void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){
   //   vHalfSpinColourVector hspin;
   //   spProjXp(hspin,in);
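makeGammaProd decodes a 4-bit index into a product of gamma matrices, one bit per direction (bit 0 selects GammaX, bit 1 GammaY, bit 2 GammaZ, bit 3 GammaT), following the Chroma convention declared in the header below. A standalone sketch of the same bit decoding, building a label instead of a Grid SpinMatrix (illustrative only):

#include <iostream>
#include <string>

// Illustration of the bitmask decomposition used by makeGammaProd above.
std::string gammaProdLabel(unsigned int i) {
  std::string g = "1";
  if (i & 0x1) g += "*GammaX";
  if (i & 0x2) g += "*GammaY";
  if (i & 0x4) g += "*GammaZ";
  if (i & 0x8) g += "*GammaT";
  return g;
}

int main() {
  for (unsigned int i = 0; i < 16; ++i)
    std::cout << i << " -> " << gammaProdLabel(i) << "\n"; // e.g. 15 -> 1*GammaX*GammaY*GammaZ*GammaT
}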
@@ -83,6 +83,9 @@ namespace QCD {

   };

+  // Make gamma products (Chroma convention)
+  SpinMatrix makeGammaProd(const unsigned int i);
+
   /* Gx
    *  0 0  0  i
    *  0 0  i  0
@@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
     LatticeMatrix Umu(out._grid);
     for(int mu=0;mu<Nd;mu++){
       LieRandomize(pRNG,Umu,0.01);
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
     }
   }
   static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
     LatticeMatrix Umu(out._grid);
     Umu=1.0;
     for(int mu=0;mu<Nd;mu++){
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
     }
   }

@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 {
   return new GridRedBlackCartesian(FourDimGrid);
 }
+GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
+{
+  std::vector<int> simd(4,1);
+  return makeFourDimGrid(latt,simd,mpi);
+}
 GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 {
   int N4=FourDimGrid->_ndimension;
@@ -58,6 +62,7 @@ GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
   return new GridCartesian(latt5,simd5,mpi5);
 }
+

 GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 {
   int N4=FourDimGrid->_ndimension;
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
   return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
 }

+GridCartesian *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  int N4    = FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int> mpi5 (1,1);
+
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+    mpi5.push_back(FourDimGrid->_processors[d]);
+  }
+  return new GridCartesian(latt5,simd5,mpi5);
+}
+
+GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  int N4    = FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+  int cbd   = 0;
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int> mpi5 (1,1);
+  std::vector<int> cb5  (1,1);
+
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+    mpi5.push_back(FourDimGrid->_processors[d]);
+    cb5.push_back(1);
+  }
+  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
+}
+
 }}
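The new makeFiveDimDWFGrid / makeFiveDimDWFRedBlackGrid factories prepend Ls as dimension 0 and put all SIMD lanes into that fifth dimension (simd5 = {Nsimd,1,1,1,1}), while reusing the four-dimensional MPI decomposition unchanged. A standalone sketch of how the layout vectors are assembled, using plain std::vector arithmetic and no Grid types (illustrative only):

#include <cassert>
#include <iostream>
#include <vector>

// Mirrors the layout logic of makeFiveDimDWFGrid above: Ls becomes dimension 0,
// all SIMD lanes sit in that dimension, and the 4d MPI decomposition is reused.
std::vector<std::vector<int>> fiveDimDWFLayout(int Ls, int nsimd,
                                               const std::vector<int>& latt4,
                                               const std::vector<int>& mpi4) {
  assert(latt4.size() == 4 && mpi4.size() == 4);
  std::vector<int> latt5(1, Ls), simd5(1, nsimd), mpi5(1, 1);
  for (int d = 0; d < 4; ++d) {
    latt5.push_back(latt4[d]);
    simd5.push_back(1);
    mpi5.push_back(mpi4[d]);
  }
  return {latt5, simd5, mpi5};
}

int main() {
  auto v = fiveDimDWFLayout(16, 8, {8, 8, 8, 8}, {1, 1, 1, 2});
  for (auto& x : v) { for (int e : x) std::cout << e << " "; std::cout << "\n"; }
}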
@@ -35,9 +35,14 @@ class SpaceTimeGrid {

   static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
   static GridRedBlackCartesian *makeFourDimRedBlackGrid (const GridCartesian *FourDimGrid);

   static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
   static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);

+  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
+  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
+  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
+
 };

 }}
@@ -101,15 +101,15 @@ namespace Grid {
     // average over all x,y,z,t and over all planes of plaquette
     //////////////////////////////////////////////////
     static RealD avgPlaquette(const GaugeLorentz &Umu){
       RealD sumplaq = sumPlaquette(Umu);
       double vol = Umu._grid->gSites();
       double faces = (1.0*Nd*(Nd-1))/2.0;
       return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
     }

+    //////////////////////////////////////////////////
+    // average over traced single links
+    //////////////////////////////////////////////////
     static RealD linkTrace(const GaugeLorentz &Umu){
       std::vector<GaugeMat> U(4,Umu._grid);

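avgPlaquette normalises the summed plaquette by the global volume, the number of planes Nd(Nd-1)/2 and Nc, so that a unit gauge field gives exactly 1. A quick standalone check of that normalisation (lattice size chosen arbitrarily):

#include <iostream>

// Normalisation used in avgPlaquette above: for U = 1 every plaquette traces to Nc,
// so sumplaq = vol * faces * Nc and the average comes out as exactly 1.
int main() {
  const int Nd = 4, Nc = 3;
  const double vol     = 8.0 * 8 * 8 * 8;                 // example 8^4 lattice
  const double faces   = 1.0 * Nd * (Nd - 1) / 2.0;       // 6 plaquette planes in 4d
  const double sumplaq = vol * faces * Nc;                // value for a unit gauge field
  std::cout << sumplaq / vol / faces / Nc << "\n";        // prints 1
}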
@@ -126,47 +126,6 @@ namespace Grid {
       return p.real()/vol/4.0/3.0;
     };
-    //////////////////////////////////////////////////
-    // the sum over all staples on each site
-    //////////////////////////////////////////////////
-    static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
-
-      GridBase *grid = Umu._grid;
-
-      std::vector<GaugeMat> U(4,grid);
-      for(int d=0;d<Nd;d++){
-        U[d] = PeekIndex<LorentzIndex>(Umu,d);
-      }
-      staple = zero;
-
-      for(int nu=0;nu<Nd;nu++){
-
-        if(nu != mu) {
-
-          // mu
-          // ^
-          // |__>  nu
-
-          //    __
-          //      |
-          //    __|
-          //
-          staple+=Gimpl::ShiftStaple(Gimpl::CovShiftForward (U[nu],nu,
-                                     Gimpl::CovShiftBackward(U[mu],mu,
-                                     Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
-
-          //  __
-          // |
-          // |__
-          //
-          //
-          staple+=Gimpl::ShiftStaple(Gimpl::CovShiftBackward(U[nu],nu,
-                                     Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
-        }
-      }
-    }
-
     //////////////////////////////////////////////////
     // the sum over all staples on each site in direction mu,nu
@@ -210,6 +169,51 @@ namespace Grid {
       }
     }

+    //////////////////////////////////////////////////
+    // the sum over all staples on each site
+    //////////////////////////////////////////////////
+    static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
+
+      GridBase *grid = Umu._grid;
+
+      std::vector<GaugeMat> U(Nd,grid);
+      for(int d=0;d<Nd;d++){
+        U[d] = PeekIndex<LorentzIndex>(Umu,d);
+      }
+      staple = zero;
+      GaugeMat tmp(grid);
+
+      for(int nu=0;nu<Nd;nu++){
+
+        if(nu != mu) {
+
+          // mu
+          // ^
+          // |__>  nu
+
+          //    __
+          //      |
+          //    __|
+          //
+          staple+=Gimpl::ShiftStaple(
+                    Gimpl::CovShiftForward (U[nu],nu,
+                    Gimpl::CovShiftBackward(U[mu],mu,
+                    Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
+
+          //  __
+          // |
+          // |__
+          //
+          //
+          staple+=Gimpl::ShiftStaple(
+                    Gimpl::CovShiftBackward(U[nu],nu,
+                    Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
+        }
+      }
+    }
+
     //////////////////////////////////////////////////
     // the sum over all staples on each site in direction mu,nu, upper part
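The reinstated Staple accumulates, for every nu != mu, the upper and lower "C"-shaped products around the link U_mu(x), i.e. 2*(Nd-1) = 6 terms per link in four dimensions. A symbolic standalone sketch of the path content, with strings in place of SU(3) matrices (the real code builds the same paths with CovShiftForward/CovShiftBackward as above):

#include <iostream>
#include <string>
#include <vector>

// Enumerate the staple paths around the link U_mu(x) for mu = 0 in 4d.
int main() {
  const int Nd = 4, mu = 0;
  std::vector<std::string> terms;
  for (int nu = 0; nu < Nd; ++nu) {
    if (nu == mu) continue;
    // upper staple: U_nu(x+mu) U_mu^dag(x+nu) U_nu^dag(x)
    terms.push_back("U_" + std::to_string(nu) + "(x+mu) U_" + std::to_string(mu) +
                    "^dag(x+nu) U_" + std::to_string(nu) + "^dag(x)");
    // lower staple: U_nu^dag(x+mu-nu) U_mu^dag(x-nu) U_nu(x-nu)
    terms.push_back("U_" + std::to_string(nu) + "^dag(x+mu-nu) U_" + std::to_string(mu) +
                    "^dag(x-nu) U_" + std::to_string(nu) + "(x-nu)");
  }
  std::cout << terms.size() << " staple terms for mu=0:\n";   // 6 terms
  for (auto& t : terms) std::cout << "  " << t << "\n";
}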
@@ -247,7 +251,6 @@ namespace Grid {



-
     //////////////////////////////////////////////////////
     // Similar to above for rectangle is required
     //////////////////////////////////////////////////////
@@ -276,11 +279,12 @@ namespace Grid {
       }
     }
   }
+
     //////////////////////////////////////////////////
     // sum over all x,y,z,t and over all planes of plaquette
     //////////////////////////////////////////////////
     static RealD sumRectangle(const GaugeLorentz &Umu){
-      std::vector<GaugeMat> U(4,Umu._grid);
+      std::vector<GaugeMat> U(Nd,Umu._grid);

       for(int mu=0;mu<Nd;mu++){
         U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -406,7 +410,7 @@ namespace Grid {
     static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
       GridBase *grid = Umu._grid;

-      std::vector<GaugeMat> U(4,grid);
+      std::vector<GaugeMat> U(Nd,grid);
       for(int d=0;d<Nd;d++){
        U[d] = PeekIndex<LorentzIndex>(Umu,d);
       }
@@ -32,6 +32,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <type_traits>

 namespace Grid {
+  // helper function to read space-separated values
+  template <typename T>
+  std::vector<T> strToVec(const std::string s)
+  {
+    std::istringstream sstr(s);
+    T                  buf;
+    std::vector<T>     v;
+
+    while(!sstr.eof())
+    {
+      sstr >> buf;
+      v.push_back(buf);
+    }
+
+    return v;
+  }
+
+  // output to streams for vectors
+  template < class T >
+  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
+  {
+    os << "[";
+    for (auto &x: v)
+    {
+      os << x << " ";
+    }
+    if (v.size() > 0)
+    {
+      os << "\b";
+    }
+    os << "]";
+
+    return os;
+  }
+
   class Serializable {};

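strToVec splits a space-separated string into a vector of T via an istringstream, and the vector operator<< prints it back in bracketed form. A standalone copy of the helper with a usage example (the input string is invented for illustration):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Standalone copy of the strToVec helper added above.
template <typename T>
std::vector<T> strToVec(const std::string s) {
  std::istringstream sstr(s);
  T buf;
  std::vector<T> v;
  while (!sstr.eof()) { sstr >> buf; v.push_back(buf); }
  return v;
}

int main() {
  auto v = strToVec<int>("4 4 4 8");       // e.g. a lattice geometry read from XML
  for (int x : v) std::cout << x << " ";   // prints: 4 4 4 8
  std::cout << "\n";
}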
@@ -138,23 +172,6 @@ namespace Grid {
     r.read(s, output);
   }

-  template < class T >
-  inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
-  {
-    os << "[";
-    for (auto &x: v)
-    {
-      os << x << " ";
-    }
-    if (v.size() > 0)
-    {
-      os << "\b";
-    }
-    os << "]";
-
-    return os;
-  }
-
   // Writer template implementation ////////////////////////////////////////////
   template <typename T>
   Writer<T>::Writer(void)
@@ -120,7 +120,7 @@ THE SOFTWARE.
  \
  \
   template <typename T>\
-  static void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
+  static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
     push(WR,s);\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
     pop(WR);\
@@ -128,14 +128,14 @@ THE SOFTWARE.
  \
  \
   template <typename T>\
-  static void read(Reader<T> &RD,const std::string &s, cname &obj){ \
+  static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \
     push(RD,s);\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
     pop(RD);\
   } \
  \
  \
-  friend std::ostream & operator << (std::ostream &os, const cname &obj ) { \
+  friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
     os<<"class "<<#cname<<" {"<<std::endl;\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__)) \
     os<<"}"; \
@@ -165,7 +165,7 @@ namespace Grid {
   class EnumIO<name> {\
   public:\
     template <typename T>\
-    static void write(Writer<T> &WR,const std::string &s, const name &obj){ \
+    static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \
       switch (obj) {\
       GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
       default: Grid::write(WR,s,#undefname); break;\
@@ -173,7 +173,7 @@ namespace Grid {
     }\
  \
     template <typename T>\
-    static void read(Reader<T> &RD,const std::string &s, name &obj){ \
+    static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \
       std::string buf;\
       Grid::read(RD, s, buf);\
       if (buf == #undefname) {obj = name::undefname;}\
@@ -182,7 +182,7 @@ namespace Grid {
     }\
   };\
  \
-  std::ostream & operator << (std::ostream &os, const name &obj ) { \
+  inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
     switch (obj) {\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
     default: os << #undefname; break;\
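The macro-generated write/read/operator<< bodies above live in a header that is included from many translation units, so they are now marked inline to avoid multiple-definition errors at link time. A minimal standalone illustration of the rule (single file, with the multi-file scenario described in the comment; names are hypothetical):

#include <iostream>

// If this function lived in a header included by several .cc files, 'inline' is
// what lets every translation unit carry a copy of the definition without
// violating the one-definition rule at link time. That is why the serialisation
// macros above now emit 'static inline' / 'inline'.
inline void print_banner() { std::cout << "hello from a header\n"; }

int main() { print_banner(); }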
@@ -80,6 +80,20 @@ void XmlReader::pop(void)
   node_ = node_.parent();
 }

+bool XmlReader::nextElement(const std::string &s)
+{
+  if (node_.next_sibling(s.c_str()))
+  {
+    node_ = node_.next_sibling(s.c_str());
+
+    return true;
+  }
+  else
+  {
+    return false;
+  }
+}
+
 template <>
 void XmlReader::readDefault(const string &s, string &output)
 {
@@ -68,6 +68,7 @@ namespace Grid
   virtual ~XmlReader(void) = default;
   void push(const std::string &s);
   void pop(void);
+  bool nextElement(const std::string &s);
   template <typename U>
   void readDefault(const std::string &s, U &output);
   template <typename U>
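XmlReader::nextElement advances the current node to its next sibling of the given name and reports whether one exists, which makes it possible to iterate over repeated XML elements. The call signature (node_.next_sibling(s.c_str())) suggests the reader wraps a pugixml node; assuming that, the equivalent raw pugixml sibling loop looks like the sketch below (the XML content is invented for illustration):

#include <iostream>
#include <pugixml.hpp>

// Iterate over repeated <beta> elements the same way nextElement("beta") would:
// start at the first child of that name and keep taking next_sibling("beta").
int main() {
  pugi::xml_document doc;
  doc.load_string("<params><beta>5.6</beta><beta>5.7</beta><beta>5.8</beta></params>");

  for (pugi::xml_node n = doc.child("params").child("beta"); n; n = n.next_sibling("beta")) {
    std::cout << n.child_value() << "\n";   // 5.6  5.7  5.8
  }
}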
 lib/simd/Avx512Asm.h (1139 lines changed; file diff suppressed because it is too large)
@@ -410,22 +410,22 @@ namespace Optimization {
   struct Permute{

     static inline __m256 Permute0(__m256 in){
-      return _mm256_permute2f128_ps(in,in,0x01);
+      return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
     };
     static inline __m256 Permute1(__m256 in){
-      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
     };
     static inline __m256 Permute2(__m256 in){
-      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
     };
     static inline __m256 Permute3(__m256 in){
       return in;
     };

     static inline __m256d Permute0(__m256d in){
-      return _mm256_permute2f128_pd(in,in,0x01);
+      return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
     };
-    static inline __m256d Permute1(__m256d in){
+    static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
       return _mm256_shuffle_pd(in,in,0x5);
     };
     static inline __m256d Permute2(__m256d in){
@@ -437,6 +437,111 @@ namespace Optimization {

     };

+#if defined (AVX2) || defined (AVXFMA4)
+#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256)  _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
+#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
+#endif
+
+#if defined (AVX1)
+
+#define _mm256_alignr_epi32(ret,a,b,n) {                               \
+    __m128 aa, bb;                                                     \
+                                                                       \
+    aa  = _mm256_extractf128_ps(a,1);                                  \
+    bb  = _mm256_extractf128_ps(b,1);                                  \
+    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);   \
+    ret = _mm256_insertf128_ps(ret,aa,1);                              \
+                                                                       \
+    aa  = _mm256_extractf128_ps(a,0);                                  \
+    bb  = _mm256_extractf128_ps(b,0);                                  \
+    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);   \
+    ret = _mm256_insertf128_ps(ret,aa,0);                              \
+  }
+
+#define _mm256_alignr_epi64(ret,a,b,n) {                               \
+    __m128d aa, bb;                                                    \
+                                                                       \
+    aa  = _mm256_extractf128_pd(a,1);                                  \
+    bb  = _mm256_extractf128_pd(b,1);                                  \
+    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);  \
+    ret = _mm256_insertf128_pd(ret,aa,1);                              \
+                                                                       \
+    aa  = _mm256_extractf128_pd(a,0);                                  \
+    bb  = _mm256_extractf128_pd(b,0);                                  \
+    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);  \
+    ret = _mm256_insertf128_pd(ret,aa,0);                              \
+  }
+
+#endif
+
+  inline std::ostream & operator << (std::ostream& stream, const __m256 a)
+  {
+    const float *p=(const float *)&a;
+    stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
+    return stream;
+  };
+  inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
+  {
+    const double *p=(const double *)&a;
+    stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
+    return stream;
+  };
+
+  struct Rotate{
+
+    static inline __m256 rotate(__m256 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m256d rotate(__m256d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      default: assert(0);
+      }
+    }
+
+    template<int n>
+    static inline __m256 tRotate(__m256 in){
+      __m256 tmp = Permute::Permute0(in);
+      __m256 ret;
+      if ( n > 3 ) {
+        _mm256_alignr_epi32(ret,in,tmp,n);
+      } else {
+        _mm256_alignr_epi32(ret,tmp,in,n);
+      }
+      // std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
+      return ret;
+    };
+
+    template<int n>
+    static inline __m256d tRotate(__m256d in){
+      __m256d tmp = Permute::Permute0(in);
+      __m256d ret;
+      if ( n > 1 ) {
+        _mm256_alignr_epi64(ret,in,tmp,n);
+      } else {
+        _mm256_alignr_epi64(ret,tmp,in,n);
+      }
+      // std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
+      return ret;
+    };
+
+  };
+
   //Complex float Reduce
   template<>
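Rotate::rotate(in,n) cyclically shifts the SIMD lanes so that lane i of the result holds lane (i+n) mod N of the input (this is spelled out element by element in the generic u128f implementation later in the diff); the switch over tRotate<n> exists only because the alignr intrinsics take the shift count as a compile-time immediate. A scalar sketch of the same operation for eight lanes (standalone, no intrinsics):

#include <array>
#include <iostream>

// Lane i of the result is lane (i+n) mod 8 of the input: the scalar meaning of
// Rotate::rotate for an 8-wide single-precision vector.
std::array<float, 8> rotate8(const std::array<float, 8>& in, int n) {
  std::array<float, 8> out{};
  for (int i = 0; i < 8; ++i) out[i] = in[(i + n) % 8];
  return out;
}

int main() {
  std::array<float, 8> v{0, 1, 2, 3, 4, 5, 6, 7};
  for (float x : rotate8(v, 3)) std::cout << x << " ";  // 3 4 5 6 7 0 1 2
  std::cout << "\n";
}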
@@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <immintrin.h>


+namespace Grid{
 namespace Optimization {

   struct Vsplat{
@@ -246,26 +246,30 @@ namespace Optimization {
   struct TimesMinusI{
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
-      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
+      //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
+      //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
-      return _mm512_shuffle_pd(tmp,tmp,0x55);
+      //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
+      //return _mm512_shuffle_pd(tmp,tmp,0x55);
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
+      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
     }
   };

   struct TimesI{
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
-      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
-      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
+      return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
     }
   };

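The corrected TimesI/TimesMinusI first swap the real and imaginary lanes of the input (instead of an uninitialised temporary) and then negate one half of each pair: for multiplication by i the negation lands on the even (real) lanes, hence masks 0x5555/0x55, while for multiplication by -i it lands on the odd (imaginary) lanes, masks 0xaaaa/0xaa. A scalar cross-check of that logic against std::complex:

#include <complex>
#include <iostream>

// i*z  swaps (re,im) -> (-im, re); -i*z swaps (re,im) -> (im, -re).
int main() {
  std::complex<double> z(3.0, 4.0);

  std::complex<double> ti(-z.imag(), z.real());   // times  i
  std::complex<double> tmi(z.imag(), -z.real());  // times -i

  std::cout << (ti  == z * std::complex<double>(0,  1)) << "\n";  // 1
  std::cout << (tmi == z * std::complex<double>(0, -1)) << "\n";  // 1
}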
@@ -305,6 +309,54 @@ namespace Optimization {

   };

+  struct Rotate{
+
+    static inline __m512 rotate(__m512 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      case 8 : return tRotate<8>(in);break;
+      case 9 : return tRotate<9>(in);break;
+      case 10: return tRotate<10>(in);break;
+      case 11: return tRotate<11>(in);break;
+      case 12: return tRotate<12>(in);break;
+      case 13: return tRotate<13>(in);break;
+      case 14: return tRotate<14>(in);break;
+      case 15: return tRotate<15>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m512d rotate(__m512d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0);
+      }
+    }
+
+    template<int n> static inline __m512 tRotate(__m512 in){
+      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
+    };
+
+    template<int n> static inline __m512d tRotate(__m512d in){
+      return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
+    };
+
+  };
+
   //////////////////////////////////////////////
   // Some Template specialization
@@ -345,7 +397,7 @@ namespace Optimization {

   //////////////////////////////////////////////////////////////////////////////////////
   // Here assign types
-namespace Grid {
+
   typedef __m512  SIMD_Ftype;  // Single precision type
   typedef __m512d SIMD_Dtype; // Double precision type
   typedef __m512i SIMD_Itype; // Integer type
@@ -35,6 +35,7 @@ Author: neo <cossu@post.kek.jp>
 // Time-stamp: <2015-06-09 14:28:02 neo>
 //----------------------------------------------------------------------

+namespace Grid {
 namespace Optimization {

   template<class vtype>
@@ -54,51 +55,67 @@ namespace Optimization {

   struct Vsplat{
     //Complex float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(float a, float b){
+      u128f out;
+      out.f[0] = a;
+      out.f[1] = b;
+      out.f[2] = a;
+      out.f[3] = b;
+      return out;
     }
     // Real float
-    inline float operator()(float a){
-      return 0;
+    inline u128f operator()(float a){
+      u128f out;
+      out.f[0] = a;
+      out.f[1] = a;
+      out.f[2] = a;
+      out.f[3] = a;
+      return out;
     }
     //Complex double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(double a, double b){
+      u128d out;
+      out.f[0] = a;
+      out.f[1] = b;
+      return out;
     }
     //Real double
-    inline double operator()(double a){
-      return 0;
+    inline u128d operator()(double a){
+      u128d out;
+      out.f[0] = a;
+      out.f[1] = a;
+      return out;
     }
     //Integer
     inline int operator()(Integer a){
-      return 0;
+      return a;
     }
   };

   struct Vstore{
     //Float
-    inline void operator()(float a, float* F){
+    inline void operator()(u128f a, float* F){
+      memcpy(F,a.f,4*sizeof(float));
     }
     //Double
-    inline void operator()(double a, double* D){
+    inline void operator()(u128d a, double* D){
+      memcpy(D,a.f,2*sizeof(double));
     }
     //Integer
     inline void operator()(int a, Integer* I){
+      I[0] = a;
     }

   };

   struct Vstream{
     //Float
-    inline void operator()(float * a, float b){
+    inline void operator()(float * a, u128f b){
+      memcpy(a,b.f,4*sizeof(float));
     }
     //Double
-    inline void operator()(double * a, double b){
+    inline void operator()(double * a, u128d b){
+      memcpy(a,b.f,2*sizeof(double));
     }

@@ -106,24 +123,40 @@ namespace Optimization {

   struct Vset{
     // Complex float
-    inline float operator()(Grid::ComplexF *a){
-      return 0;
+    inline u128f operator()(Grid::ComplexF *a){
+      u128f out;
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      out.f[2] = a[1].real();
+      out.f[3] = a[1].imag();
+      return out;
     }
     // Complex double
-    inline double operator()(Grid::ComplexD *a){
-      return 0;
+    inline u128d operator()(Grid::ComplexD *a){
+      u128d out;
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      return out;
     }
     // Real float
-    inline float operator()(float *a){
-      return 0;
+    inline u128f operator()(float *a){
+      u128f out;
+      out.f[0] = a[0];
+      out.f[1] = a[1];
+      out.f[2] = a[2];
+      out.f[3] = a[3];
+      return out;
     }
     // Real double
-    inline double operator()(double *a){
-      return 0;
+    inline u128d operator()(double *a){
+      u128d out;
+      out.f[0] = a[0];
+      out.f[1] = a[1];
+      return out;
     }
     // Integer
     inline int operator()(Integer *a){
-      return 0;
+      return a[0];
     }

@@ -145,130 +178,279 @@ namespace Optimization {
   /////////////////////////////////////////////////////
   struct Sum{
     //Complex/Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      out.f[2] = a.f[2] + b.f[2];
+      out.f[3] = a.f[3] + b.f[3];
+      return out;
     }
     //Complex/Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      return out;
     }
     //Integer
     inline int operator()(int a, int b){
-      return 0;
+      return a + b;
     }
   };

   struct Sub{
     //Complex/Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      out.f[2] = a.f[2] - b.f[2];
+      out.f[3] = a.f[3] - b.f[3];
+      return out;
     }
     //Complex/Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      return out;
     }
     //Integer
     inline int operator()(int a, int b){
-      return 0;
+      return a-b;
    }
   };

   struct MultComplex{
     // Complex float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
+      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
+      return out;
     }
     // Complex double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      return out;
     }
   };

   struct Mult{
-    inline float mac(float a, float b,double c){
-      return 0;
-    }
-    inline double mac(double a, double b,double c){
-      return 0;
-    }
+    //CK: Appear unneeded
+    // inline float mac(float a, float b,double c){
+    //   return 0;
+    // }
+    // inline double mac(double a, double b,double c){
+    //   return 0;
+    // }

     // Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      out.f[2] = a.f[2]*b.f[2];
+      out.f[3] = a.f[3]*b.f[3];
+      return out;
     }
     // Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      return out;
     }
     // Integer
     inline int operator()(int a, int b){
-      return 0;
+      return a*b;
     }
   };

   struct Conj{
     // Complex single
-    inline float operator()(float in){
-      return 0;
+    inline u128f operator()(u128f in){
+      u128f out;
+      out.f[0] =  in.f[0];
+      out.f[1] = -in.f[1];
+      out.f[2] =  in.f[2];
+      out.f[3] = -in.f[3];
+      return out;
     }
     // Complex double
-    inline double operator()(double in){
-      return 0;
+    inline u128d operator()(u128d in){
+      u128d out;
+      out.f[0] =  in.f[0];
+      out.f[1] = -in.f[1];
+      return out;
     }
     // do not define for integer input
   };

   struct TimesMinusI{
     //Complex single
-    inline float operator()(float in, float ret){
-      return 0;
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] =  in.f[1];
+      out.f[1] = -in.f[0];
+      out.f[2] =  in.f[3];
+      out.f[3] = -in.f[2];
+      return out;
     }
     //Complex double
-    inline double operator()(double in, double ret){
-      return 0;
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] =  in.f[1];
+      out.f[1] = -in.f[0];
+      return out;
     }
   };

   struct TimesI{
     //Complex single
-    inline float operator()(float in, float ret){
-      return 0;
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] = -in.f[1];
+      out.f[1] =  in.f[0];
+      out.f[2] = -in.f[3];
+      out.f[3] =  in.f[2];
+      return out;
     }
     //Complex double
-    inline double operator()(double in, double ret){
-      return 0;
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] = -in.f[1];
+      out.f[1] =  in.f[0];
+      return out;
     }
   };

|
//////////////////////////////////////////////
|
||||||
// Some Template specialization
|
// Some Template specialization
|
||||||
|
struct Permute{
|
||||||
|
//We just have to mirror the permutes of Grid_sse4.h
|
||||||
|
static inline u128f Permute0(u128f in){ //AB CD -> CD AB
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[2];
|
||||||
|
out.f[1] = in.f[3];
|
||||||
|
out.f[2] = in.f[0];
|
||||||
|
out.f[3] = in.f[1];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128f Permute1(u128f in){ //AB CD -> BA DC
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
out.f[2] = in.f[3];
|
||||||
|
out.f[3] = in.f[2];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128f Permute2(u128f in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128f Permute3(u128f in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline u128d Permute0(u128d in){ //AB -> BA
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128d Permute1(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128d Permute2(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128d Permute3(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
template < typename vtype >
|
template < typename vtype >
|
||||||
void permute(vtype &a, vtype b, int perm) {
|
void permute(vtype &a, vtype b, int perm) {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct Rotate{
|
||||||
|
|
||||||
|
static inline u128f rotate(u128f in,int n){
|
||||||
|
u128f out;
|
||||||
|
switch(n){
|
||||||
|
case 0:
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = in.f[1];
|
||||||
|
out.f[2] = in.f[2];
|
||||||
|
out.f[3] = in.f[3];
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[2];
|
||||||
|
out.f[2] = in.f[3];
|
||||||
|
out.f[3] = in.f[0];
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
out.f[0] = in.f[2];
|
||||||
|
out.f[1] = in.f[3];
|
||||||
|
out.f[2] = in.f[0];
|
||||||
|
out.f[3] = in.f[1];
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
out.f[0] = in.f[3];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
out.f[2] = in.f[1];
|
||||||
|
out.f[3] = in.f[2];
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
static inline u128d rotate(u128d in,int n){
|
||||||
|
u128d out;
|
||||||
|
switch(n){
|
||||||
|
case 0:
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = in.f[1];
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
|
inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
|
||||||
return 0;
|
return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
|
||||||
}
|
}
|
||||||
//Real float Reduce
|
//Real float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
|
inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
|
||||||
return 0;
|
return in.f[0] + in.f[1] + in.f[2] + in.f[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//Complex double Reduce
|
//Complex double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
|
inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
|
||||||
return 0;
|
return Grid::ComplexD(in.f[0],in.f[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Real double Reduce
|
//Real double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
|
inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
|
||||||
return 0;
|
return in.f[0] + in.f[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
@ -282,10 +464,9 @@ namespace Optimization {
|
|||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Here assign types
|
// Here assign types
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
typedef float SIMD_Ftype; // Single precision type
|
typedef Optimization::u128f SIMD_Ftype; // Single precision type
|
||||||
typedef double SIMD_Dtype; // Double precision type
|
typedef Optimization::u128d SIMD_Dtype; // Double precision type
|
||||||
typedef int SIMD_Itype; // Integer type
|
typedef int SIMD_Itype; // Integer type
|
||||||
|
|
||||||
// prefetch utilities
|
// prefetch utilities
|
||||||
|
@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
#include <zmmintrin.h>
|
||||||
|
|
||||||
|
namespace Grid{
|
||||||
namespace Optimization {
|
namespace Optimization {
|
||||||
|
|
||||||
struct Vsplat{
|
struct Vsplat{
|
||||||
@@ -316,6 +318,54 @@ namespace Optimization {

   };

+  struct Rotate{
+
+    static inline __m512 rotate(__m512 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+
+      case 8 : return tRotate<8>(in);break;
+      case 9 : return tRotate<9>(in);break;
+      case 10: return tRotate<10>(in);break;
+      case 11: return tRotate<11>(in);break;
+      case 12: return tRotate<12>(in);break;
+      case 13: return tRotate<13>(in);break;
+      case 14: return tRotate<14>(in);break;
+      case 15: return tRotate<15>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m512d rotate(__m512d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0);
+      }
+    }
+
+    template<int n> static inline __m512 tRotate(__m512 in){
+      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
+    };
+
+    template<int n> static inline __m512d tRotate(__m512d in){
+      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
+    };
+
+  };

   //////////////////////////////////////////////
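(A scalar model of what the new Rotate struct computes; illustrative only. With both operands of _mm512_alignr_epi32 set to the same register, lane i of the output is lane (i+n) mod 16 of the input, and the 2*n in the double-precision tRotate converts a rotation by n 64-bit lanes into the equivalent 32-bit shift count.)

  // out[i] = in[(i + n) % 16]  -- the effect of Rotate::rotate(in,n) on 16 float lanes
  void rotate_model_f32(const float in[16], float out[16], int n) {
    for (int i = 0; i < 16; i++) out[i] = in[(i + n) % 16];
  }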
@@ -358,7 +408,7 @@ namespace Optimization {

  //////////////////////////////////////////////////////////////////////////////////////
  // Here assign types
-namespace Grid {
   typedef __m512  SIMD_Ftype;  // Single precision type
   typedef __m512d SIMD_Dtype;  // Double precision type
   typedef __m512i SIMD_Itype;  // Integer type
@@ -267,10 +267,10 @@ namespace Optimization {
   struct Permute{

     static inline __m128 Permute0(__m128 in){
-      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
     };
     static inline __m128 Permute1(__m128 in){
-      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
     };
     static inline __m128 Permute2(__m128 in){
       return in;
@@ -279,7 +279,7 @@ namespace Optimization {
       return in;
     };

-    static inline __m128d Permute0(__m128d in){
+    static inline __m128d Permute0(__m128d in){ //AB -> BA
       return _mm_shuffle_pd(in,in,0x1);
     };
     static inline __m128d Permute1(__m128d in){
@@ -294,6 +294,32 @@ namespace Optimization {

   };

+  struct Rotate{
+
+    static inline __m128 rotate(__m128 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m128d rotate(__m128d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      default: assert(0);
+      }
+    }
+
+#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
+#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
+
+    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
+    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
+
+  };
   //////////////////////////////////////////////
   // Some Template specialization

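(Same idea on SSE4: _mm_alignr_epi8 shifts the concatenation in:in right by whole bytes, so n 32-bit lanes become 4*n bytes and the %16 keeps the count inside one 128-bit register. An illustrative check -- the values and the namespace qualification are for the example only:)

  #include <smmintrin.h>
  #include <cassert>
  void check_sse_rotate() {
    __m128 v = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
    __m128 r = Optimization::Rotate::rotate(v, 1);   // expect lanes {1,2,3,0}
    float out[4]; _mm_storeu_ps(out, r);
    assert(out[0] == 1.f && out[3] == 0.f);
  }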
@@ -299,16 +299,44 @@ namespace Grid {
     }
     friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
     {
-      if (perm==3) permute3(y,b);
-      else if (perm==2) permute2(y,b);
-      else if (perm==1) permute1(y,b);
-      else if (perm==0) permute0(y,b);
+      if ( perm & RotateBit ) {
+        int dist = perm&0xF;
+        y=rotate(b,dist);
+        return;
+      }
+      switch(perm){
+      case 3: permute3(y,b); break;
+      case 2: permute2(y,b); break;
+      case 1: permute1(y,b); break;
+      case 0: permute0(y,b); break;
+      default: assert(0);
+      }
     }

   };// end of Grid_simd class definition

+  ////////////////////////////////////////////////////////////////////
+  // General rotate
+  ////////////////////////////////////////////////////////////////////
+  template <class S, class V, IfNotComplex<S> =0>
+  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
+  {
+    nrot = nrot % Grid_simd<S,V>::Nsimd();
+    Grid_simd<S,V> ret;
+    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
+    ret.v = Optimization::Rotate::rotate(b.v,nrot);
+    return ret;
+  }
+  template <class S, class V, IfComplex<S> =0>
+  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
+  {
+    nrot = nrot % Grid_simd<S,V>::Nsimd();
+    Grid_simd<S,V> ret;
+    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
+    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
+    return ret;
+  }

   ///////////////////////
   // Splat
   ///////////////////////
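(A hedged reading of the dispatch above: RotateBit and Nsimd() come from elsewhere in Grid and are not shown in this excerpt. When the rotate bit is set, the low four bits of perm carry the rotation distance; otherwise perm selects one of the fixed permute levels. For complex Grid_simd the register interleaves re/im, so rotating by one complex element means rotating by two scalar lanes -- hence the 2*nrot in the IfComplex overload.)

  // Illustrative helper, not part of the library API:
  inline int encode_rotate(int dist) { return RotateBit | (dist & 0xF); }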
@@ -339,6 +367,9 @@ namespace Grid {
   template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret)      { vsplat(ret,S(0.0,0.0)); }// use xor?
   template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}

+  template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
+  template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}

   // if not complex overload here
   template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
   template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
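(Why splats of (1,-1) and (-1,1) are useful: combined with the re/im swap that the Permute/VSHUF machinery already provides, an element-wise sign pattern turns a swap into multiplication by +/- i. A scalar sketch, illustrative only:)

  #include <complex>
  inline std::complex<double> times_i(std::complex<double> z) {
    // swap(re,im) then apply the (-1,+1) sign pattern: (-im, re) == i*z
    return std::complex<double>(-z.imag(), z.real());
  }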
197  lib/simd/Intel512avx.h  Normal file
@@ -0,0 +1,197 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Avx512Asm.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H

////////////////////////////////////////////////////////////
// Knights Landing specials
////////////////////////////////////////////////////////////

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)

#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)

#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)

#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMULMEMf(O,P,B,Biirr) \
  VMULMEMf(O,P,C,Ciirr) \
  VMULf(tmp,B,Briir) \
  VMULf(tmp,C,Criir)

#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMd(O,P,tmp) \
  VMULMEMd(O,P,B,Biirr) \
  VMULMEMd(O,P,C,Ciirr) \
  VMULd(tmp,B,Briir) \
  VMULd(tmp,C,Criir)

#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMADDMEMf(O,P,B,Biirr) \
  VMADDMEMf(O,P,C,Ciirr) \
  VMADDf(tmp,B,Briir) \
  VMADDf(tmp,C,Criir)

#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
  VSHUFMEMd(O,P,tmp) \
  VMADDMEMd(O,P,B,Biirr) \
  VMADDMEMd(O,P,C,Ciirr) \
  VMADDd(tmp,B,Briir) \
  VMADDd(tmp,C,Criir)

// Merges accumulation for complex dot chain; less efficient under avx512
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
  "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"

#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
  "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"

#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
  "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"

#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
  "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
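(The ZEND pair finishes a split-accumulator complex multiply: one register gathers the "rr"/"ii" partial products, the other the "ri"/"ir" ones, and the masked add/sub merges them -- the trailing comment reads "ri+ir ; rr-ii". A scalar model of the final combine, with the lane bookkeeping stripped away; the operand layout assumed here is an interpretation, not taken from the diff:)

  #include <complex>
  inline std::complex<double> zend_model(double rr, double ii, double ri, double ir) {
    return std::complex<double>(rr - ii, ri + ir);   // re, im of the complex product
  }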
#define VMOVRDUPd(OFF,A,DEST)    "vpshufd $0x44," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST)    "vpshufd $0xee," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST)  "vmovsldup " #OFF "*64(" #PTR "), " #DEST  ";\n"
#define VMOVIDUPf(OFF,PTR,DEST)  "vmovshdup " #OFF "*64(" #PTR "), " #DEST  ";\n"

#define VRDUPd(SRC,DEST)  "vpshufd $0x44," #SRC"," #DEST  ";\n" // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST)  "vmovsldup " #SRC ", " #DEST  ";\n"
#define VIDUPd(SRC,DEST)  "vpshufd $0xee," #SRC"," #DEST  ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST)  "vmovshdup " #SRC ", " #DEST  ";\n"

#define VBCASTRDUPd(OFF,A,DEST)   "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST  ";\n"
#define VBCASTIDUPd(OFF,A,DEST)   "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST  ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"

#define VMADDSUBf(A,B,accum)      "vfmaddsub231ps " #A "," #B "," #accum  ";\n"
#define VMADDSUBd(A,B,accum)      "vfmaddsub231pd " #A "," #B "," #accum  ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum  ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum  ";\n"

#define VMADDSUBRDUPf(O,P,B,accum)  "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
#define VMADDSUBIDUPf(O,P,B,accum)  "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
#define VMULRDUPf(O,P,B,accum)      "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
#define VMULIDUPf(O,P,B,accum)      "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"

#define VMADDSUBRDUPd(O,P,B,accum)  "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
#define VMADDSUBIDUPd(O,P,B,accum)  "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
#define VMULRDUPd(O,P,B,accum)      "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
#define VMULIDUPd(O,P,B,accum)      "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
/*
 * TimesI is used only in the XP recon
 * Could zero the regs and use RECON_ACCUM
 */

#define VTIMESI0f(A,DEST, Z)  VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z)  "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z)  "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESI0d(A,DEST, Z)  VSHUFd(A,DEST)
#define VTIMESI1d(A,DEST, Z)  "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2d(A,DEST, Z)  "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESMINUSI0f(A,DEST,Z)  VSHUFf(A,DEST)
#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESMINUSI0d(A,DEST,Z)  VSHUFd(A,DEST)
#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#if 0

#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp)  "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp)  "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp)  "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp)  "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#else

// o_p must point to floating 1.0f/d
//
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp)  VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp)  VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)

// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp)  VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp)  VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)

#endif

#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"

#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)

#endif
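(The VMADDSUB* macros wrap vfmaddsub231, the workhorse for interleaved complex arithmetic: even lanes subtract the accumulator, odd lanes add it. A scalar model of the standard Intel semantics, for orientation only:)

  // dst[i] = a[i]*b[i] - dst[i]  for even i
  //        = a[i]*b[i] + dst[i]  for odd  i
  void fmaddsub_model(const float* a, const float* b, float* dst, int n) {
    for (int i = 0; i < n; i++)
      dst[i] = (i % 2 == 0) ? a[i]*b[i] - dst[i] : a[i]*b[i] + dst[i];
  }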
141  lib/simd/Intel512common.h  Normal file
@@ -0,0 +1,141 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
#ifndef GRID_ASM_INTEL_COMMON_512_H
#define GRID_ASM_INTEL_COMMON_512_H

////////////////////////////////////////////////////////////////////////////////////////////////////
// Opcodes common
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \
  __asm__ ("mov $0xAAAA, %%eax \n"\
           "kmovw %%eax, %%k6 \n"\
           "mov $0x5555, %%eax \n"\
           "kmovw %%eax, %%k7 \n" : : : "%eax");
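(MASK_REGS pins two opmask registers for the whole kernel: k6 = 0xAAAA selects the odd lanes and k7 = 0x5555 the even lanes of a 16-lane register; only the low 8 bits matter for doubles. Assuming re,im are interleaved with the real part in the even lane -- which is how the ZEND comments read -- k6 masks writes to the imaginary parts and k7 to the real parts. A compile-time check of the bit patterns, illustrative only:)

  static_assert(0xAAAA == 0b1010101010101010, "k6: odd lanes");
  static_assert(0x5555 == 0b0101010101010101, "k7: even lanes");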
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"

#define VTIMESIf(A,DEST, Z) \
  VTIMESI0f(A,DEST, Z) \
  VTIMESI1f(A,DEST, Z) \
  VTIMESI2f(A,DEST, Z)

#define VTIMESId(A,DEST, Z) \
  VTIMESI0d(A,DEST, Z) \
  VTIMESI1d(A,DEST, Z) \
  VTIMESI2d(A,DEST, Z)

#define VTIMESMINUSIf(A,DEST, Z) \
  VTIMESMINUSI0f(A,DEST, Z) \
  VTIMESMINUSI1f(A,DEST, Z) \
  VTIMESMINUSI2f(A,DEST, Z)

#define VTIMESMINUSId(A,DEST, Z) \
  VTIMESMINUSI0d(A,DEST, Z) \
  VTIMESMINUSI1d(A,DEST, Z) \
  VTIMESMINUSI2d(A,DEST, Z)

#define VACCTIMESIf(A,ACC,tmp) \
  VACCTIMESI0f(A,ACC,tmp) \
  VACCTIMESI1f(A,ACC,tmp) \
  VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESId(A,ACC,tmp) \
  VACCTIMESI0d(A,ACC,tmp) \
  VACCTIMESI1d(A,ACC,tmp) \
  VACCTIMESI2d(A,ACC,tmp)

#define VACCTIMESMINUSIf(A,ACC,tmp) \
  VACCTIMESMINUSI0f(A,ACC,tmp) \
  VACCTIMESMINUSI1f(A,ACC,tmp) \
  VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSId(A,ACC,tmp) \
  VACCTIMESMINUSI0d(A,ACC,tmp) \
  VACCTIMESMINUSI1d(A,ACC,tmp) \
  VACCTIMESMINUSI2d(A,ACC,tmp)

#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
#define LOAD64(A,ptr)   LOAD64i(A,ptr)

#define VMOVf(A,DEST)   "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST)   "vmovapd " #A ", " #DEST ";\n"

#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#define VEVICT(O,A)

//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
//  "clevict0 "#O"*64("#A");\n"

#define VLOADf(OFF,PTR,DEST)   "vmovaps " #OFF "*64(" #PTR "), " #DEST  ";\n"
#define VLOADd(OFF,PTR,DEST)   "vmovapd " #OFF "*64(" #PTR "), " #DEST  ";\n"

#define VADDf(A,B,DEST)        "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST)        "vaddpd " #A "," #B "," #DEST ";\n"

#define VSUBf(A,B,DEST)        "vsubps " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST)        "vsubpd " #A "," #B "," #DEST ";\n"

#define VADDMEMf(O,A,B,DEST)   "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST)   "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"

#define VSUBMEMf(O,A,B,DEST)   "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST)   "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"

#define VMULf(A,B,DEST)        "vmulps " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST)        "vmulpd " #A "," #B "," #DEST ";\n"

#define VMADDf(A,B,DEST)       "vfmadd231ps " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST)       "vfmadd231pd " #A "," #B "," #DEST ";\n"

#define VMULMEMf(O,A,B,DEST)   "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST)   "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"

#define VMADDMEMf(O,A,B,DEST)  "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST)  "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)

#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"

// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST)         "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST)         "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST)  "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST)  "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1

#define TRAP " int3 ;\n"

#endif
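(How these pieces are meant to compose: the V* macros expand to assembler string literals, so a kernel builds one __asm__ block by plain string concatenation after LOAD64 has put the data pointer in a scratch register. A toy sketch using only macros defined above -- the macro name and the register choices are illustrative, not part of the library:)

  #define TOY_AXPY_LINE(PTR)            \
    LOAD64(%r8,PTR)                     \
    __asm__ (                           \
      VZEROf(%zmm1)                     \
      VLOADf(0,%r8,%zmm0)               \
      VMADDMEMf(1,%r8,%zmm0,%zmm1)      \
      VSTOREf(2,%r8,%zmm1)              \
    );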
154  lib/simd/Intel512double.h  Normal file
@@ -0,0 +1,154 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
// No guard can be multiply included as undef clearage
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A)             VZEROd(A)
#define VMOV(A,B)            VMOVd(A,B)
#define VLOAD(OFF,PTR,DEST)  VLOADd(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC)  VSTOREd(OFF,PTR,SRC)

#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C)          VADDd(A,B,C)
#define VSUB(A,B,C)          VSUBd(A,B,C)
#define VMUL(Uri,Uir,Chi)    VMULd(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi)   VMADDd(Uri,Uir,Chi)

#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C)       VTIMESId(A,B,C)
#define VTIMESI0(A,B,C)      VTIMESI0d(A,B,C)
#define VTIMESI1(A,B,C)      VTIMESI1d(A,B,C)
#define VTIMESI2(A,B,C)      VTIMESI2d(A,B,C)

#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C)  VTIMESMINUSId(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)

#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C)    VACCTIMESId(A,B,C)
#define VACCTIMESI0(A,B,C)   VACCTIMESI0d(A,B,C)
#define VACCTIMESI1(A,B,C)   VACCTIMESI1d(A,B,C)
#define VACCTIMESI2(A,B,C)   VACCTIMESI2d(A,B,C)

#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C)  VACCTIMESMINUSId(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)

#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)

#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)

#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0d(A,B)
#define VPERM1(A,B) VPERM1d(A,B)
#define VPERM2(A,B) VPERM2d(A,B)
#define VPERM3(A,B) VPERM3d(A,B)

#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
#define VADDMEM(O,A,B,C)     VADDMEMd(O,A,B,C)
#define VSUBMEM(O,A,B,C)     VSUBMEMd(O,A,B,C)

#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C)      VMOVIDUPd(A,B,C)
#define VMOVRDUP(A,B,C)      VMOVRDUPd(A,B,C)
#define VMADDSUB(A,B,accum)  VMADDSUBd(A,B,accum)
#define VSHUF(A,B)           VSHUFd(A,B)

#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP

#define ZEND1(A,B,C)     ZEND1d(A,B,C)
#define ZEND2(A,B,C)     ZEND2d(A,B,C)
#define ZLOAD(A,B,C,D)   ZLOADd(A,B,C,D)
#define ZMUL(A,B,C,D,E)  ZMULd(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST)           VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST)           VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum)  VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum)     VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum)      VMULMEMd(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum)     VMULIDUPd(O,P,B,accum)
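(Intel512single.h, further below, is the same mapping with the ...f expansions, and neither header has an include guard on purpose: a kernel translation unit re-includes them to flip the generic macro names between precisions. A sketch of the intended usage; the include paths are the ones Intel512wilson.h uses later in this diff, and the surrounding kernel code is only indicated:)

  #include <simd/Intel512common.h>
  #include <simd/Intel512avx.h>

  #include <simd/Intel512single.h>   // VADD -> VADDf, VMUL -> VMULf, ...
  // ... emit the single-precision kernel here ...

  #include <simd/Intel512double.h>   // the same names now expand to the ...d forms
  // ... emit the double-precision kernel here ...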
127  lib/simd/Intel512imci.h  Normal file
@@ -0,0 +1,127 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H

////////////////////////////////////////////////////////////
// Knights Corner specials
////////////////////////////////////////////////////////////

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)

#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)

#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)

#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMULMEMf(O,P,B,Biirr) \
  VMULMEMf(O,P,C,Ciirr) \
  VMULf(tmp,B,Briir) \
  VMULf(tmp,C,Criir)

#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMd(O,P,tmp) \
  VMULMEMd(O,P,B,Biirr) \
  VMULMEMd(O,P,C,Ciirr) \
  VMULd(tmp,B,Briir) \
  VMULd(tmp,C,Criir)

#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMADDMEMf(O,P,B,Biirr) \
  VMADDMEMf(O,P,C,Ciirr) \
  VMADDf(tmp,B,Briir) \
  VMADDf(tmp,C,Criir)

#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
  VSHUFMEMd(O,P,tmp) \
  VMADDMEMd(O,P,B,Biirr) \
  VMADDMEMd(O,P,C,Ciirr) \
  VMADDd(tmp,B,Briir) \
  VMADDd(tmp,C,Criir)

#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"

#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"

#define VTIMESI0f(A,DEST, Z)
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESI0d(A,DEST, Z)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESMINUSI0f(A,DEST,Z)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESMINUSI0d(A,DEST,Z)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VACCTIMESI0f(A,ACC,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

#define VACCTIMESI0d(A,ACC,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

#define VACCTIMESMINUSI0f(A,ACC,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

// Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1

#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"

#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)

#endif
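(The immediates quoted in the //((1<<6)|...) comments above do work out; a compile-time verification, illustrative only:)

  static_assert(((1<<6)|(0<<4)|(3<<2)|(2)) == 0x4e, "as annotated above");
  static_assert(((2<<6)|(3<<4)|(0<<2)|(1)) == 0xb1, "as annotated above");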
155  lib/simd/Intel512single.h  Normal file
@@ -0,0 +1,155 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
// No guard can be multiply included as undef clearge of macros
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A)             VZEROf(A)
#define VMOV(A,B)            VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST)  VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC)  VSTOREf(OFF,PTR,SRC)

#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C)          VADDf(A,B,C)
#define VSUB(A,B,C)          VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi)    VMULf(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi)   VMADDf(Uri,Uir,Chi)

#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C)       VTIMESIf(A,B,C)
#define VTIMESI0(A,B,C)      VTIMESI0f(A,B,C)
#define VTIMESI1(A,B,C)      VTIMESI1f(A,B,C)
#define VTIMESI2(A,B,C)      VTIMESI2f(A,B,C)

#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C)  VTIMESMINUSIf(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)

#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C)    VACCTIMESIf(A,B,C)
#define VACCTIMESI0(A,B,C)   VACCTIMESI0f(A,B,C)
#define VACCTIMESI1(A,B,C)   VACCTIMESI1f(A,B,C)
#define VACCTIMESI2(A,B,C)   VACCTIMESI2f(A,B,C)

#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C)  VACCTIMESMINUSIf(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)

#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)

#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)

#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)

#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define VADDMEM(O,A,B,C)     VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C)     VSUBMEMf(O,A,B,C)

#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C)      VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C)      VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum)  VMADDSUBf(A,B,accum)
#define VSHUF(A,B)           VSHUFf(A,B)

#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP

#define ZEND1(A,B,C)     ZEND1f(A,B,C)
#define ZEND2(A,B,C)     ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D)   ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E)  ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM

#define VRDUP(SRC,DEST)           VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST)           VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum)  VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum)     VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum)      VMULMEMf(O,P,B,accum)

#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum)     VMULIDUPf(O,P,B,accum)
849  lib/simd/Intel512wilson.h  Normal file
@@ -0,0 +1,849 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H

//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for Wilson Kernel are precision indept
//////////////////////////////////////////////////////////////////////////////////////////
#define result_00 %zmm0
#define result_01 %zmm1
#define result_02 %zmm2

#define result_10 %zmm3
#define result_11 %zmm4
#define result_12 %zmm5

#define result_20 %zmm6
#define result_21 %zmm7
#define result_22 %zmm8

#define result_30 %zmm9
#define result_31 %zmm10
#define result_32 %zmm11

#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14

#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17

#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20

#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23

#define Uir %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25

#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31

#define TMP Chi_00

#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12

#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>

//////////////////////////////////////////////////////////////////
// Macros used to build wilson kernel -- can rationalise and simplify
// a little as some duplication developed during trying different
// variants during optimisation. Could cut back to only those used.
//////////////////////////////////////////////////////////////////

// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR)  LOAD_CHIMUi(PTR)
#define LOAD_CHI(PTR)    LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR)   SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR)    SAVE_CHIi(PTR)
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)

#define LOAD_CHIMUi \
  LOAD_CHIMU01i \
  LOAD_CHIMU23i );

#define LOAD_CHIMU01i\
  VLOAD(0,%r8,Chimu_00) \
  VLOAD(1,%r8,Chimu_01) \
  VLOAD(2,%r8,Chimu_02) \
  VLOAD(3,%r8,Chimu_10) \
  VLOAD(4,%r8,Chimu_11) \
  VLOAD(5,%r8,Chimu_12)

#define LOAD_CHIMU23i\
  VLOAD(6,%r8,Chimu_20) \
  VLOAD(7,%r8,Chimu_21) \
  VLOAD(8,%r8,Chimu_22) \
  VLOAD(9,%r8,Chimu_30) \
  VLOAD(10,%r8,Chimu_31) \
  VLOAD(11,%r8,Chimu_32)

#define SHUF_CHIMU23i\
  VSHUFMEM(6,%r8,Chimu_20) \
  VSHUFMEM(7,%r8,Chimu_21) \
  VSHUFMEM(8,%r8,Chimu_22) \
  VSHUFMEM(9,%r8,Chimu_30) \
  VSHUFMEM(10,%r8,Chimu_31) \
  VSHUFMEM(11,%r8,Chimu_32)

// const SiteHalfSpinor *ptr = &buf[offset];

#define LOAD_CHIi \
  VLOAD(0,%r8,Chi_00) \
  VLOAD(1,%r8,Chi_01) \
  VLOAD(2,%r8,Chi_02) \
  VLOAD(3,%r8,Chi_10) \
  VLOAD(4,%r8,Chi_11) \
  VLOAD(5,%r8,Chi_12)

#define SAVE_UCHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,UChi_00) \
    VSTORE(1,%r8,UChi_01) \
    VSTORE(2,%r8,UChi_02) \
    VSTORE(3,%r8,UChi_10) \
    VSTORE(4,%r8,UChi_11) \
    VSTORE(5,%r8,UChi_12) \
  );

#define SAVE_CHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,Chi_00) \
    VSTORE(1,%r8,Chi_01) \
    VSTORE(2,%r8,Chi_02) \
    VSTORE(3,%r8,Chi_10) \
    VSTORE(4,%r8,Chi_11) \
    VSTORE(5,%r8,Chi_12) \
  );

#define SAVE_RESULTi(PTR)\
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,result_00) \
    VSTORE(1,%r8,result_01) \
    VSTORE(2,%r8,result_02) \
    VSTORE(3,%r8,result_10) \
    VSTORE(4,%r8,result_11) \
    VSTORE(5,%r8,result_12) \
    VSTORE(6,%r8,result_20) \
    VSTORE(7,%r8,result_21) \
    VSTORE(8,%r8,result_22) \
    VSTORE(9,%r8,result_30) \
    VSTORE(10,%r8,result_31) \
    VSTORE(11,%r8,result_32) \
  );

#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)

#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)

#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)

//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////

//      hspin(0)=fspin(0)+timesI(fspin(3));
//      hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    LOAD_CHIi \
    SHUF_CHIMU23i \
    VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
    VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
    VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
    VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
    VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
    VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
    VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
    VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
    VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
    VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
    VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
    VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );

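(The projector macros implement the two-spinor compression quoted in the comments above: hspin(0)=fspin(0)+timesI(fspin(3)), hspin(1)=fspin(1)+timesI(fspin(2)), applied per colour component across the SIMD lanes via VACCTIMESI1/2. A scalar sketch of the X+ case, with illustrative types:)

  #include <complex>
  using Cplx = std::complex<double>;
  inline void xp_project(const Cplx fspin[4], Cplx hspin[2]) {
    const Cplx I(0.0, 1.0);
    hspin[0] = fspin[0] + I * fspin[3];
    hspin[1] = fspin[1] + I * fspin[2];
  }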
|
#define YP_PROJMEM(ptr) \
|
||||||
|
LOAD64(%r8,ptr) \
|
||||||
|
__asm__ ( \
|
||||||
|
LOAD_CHIMU01i \
|
||||||
|
VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
|
||||||
|
VSUBMEM(10,%r8,Chimu_01,Chi_01) \
|
||||||
|
VSUBMEM(11,%r8,Chimu_02,Chi_02) \
|
||||||
|
VADDMEM(6,%r8,Chimu_10,Chi_10) \
|
||||||
|
VADDMEM(7,%r8,Chimu_11,Chi_11) \
|
||||||
|
VADDMEM(8,%r8,Chimu_12,Chi_12) );
|
||||||
|
|
||||||
|
#define ZP_PROJMEM(PTR) \
|
||||||
|
LOAD64(%r8,PTR) \
|
||||||
|
__asm__ ( \
|
||||||
|
LOAD_CHIi \
|
||||||
|
SHUF_CHIMU23i \
|
||||||
|
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
|
||||||
|
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
|
||||||
|
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
|
||||||
|
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
|
||||||
|
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
|
||||||
|
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
|
||||||
|
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
|
||||||
|
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
|
||||||
|
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
|
||||||
|
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
|
||||||
|
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
|
||||||
|
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
|
||||||
|
|
||||||
|
|
||||||
|
#define TP_PROJMEM(ptr) \
|
||||||
|
LOAD64(%r8,ptr) \
|
||||||
|
__asm__ ( \
|
||||||
|
LOAD_CHIMU01i \
|
||||||
|
VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
|
||||||
|
VADDMEM(7,%r8,Chimu_01,Chi_01) \
|
||||||
|
VADDMEM(8,%r8,Chimu_02,Chi_02) \
|
||||||
|
VADDMEM(9,%r8,Chimu_10,Chi_10) \
|
||||||
|
VADDMEM(10,%r8,Chimu_11,Chi_11) \
|
||||||
|
VADDMEM(11,%r8,Chimu_12,Chi_12) );
|
||||||
|
|
||||||
|
// hspin(0)=fspin(0)-timesI(fspin(3))
|
||||||
|
// hspin(1)=fspin(1)-timesI(fspin(2))
|
||||||
|
|
||||||
|
#define XM_PROJMEM(PTR) \
|
||||||
|
LOAD64(%r8,PTR)\
|
||||||
|
__asm__ ( \
|
||||||
|
SHUF_CHIMU23i \
|
||||||
|
LOAD_CHIi \
|
||||||
|
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
|
||||||
|
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
|
||||||
|
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
|
||||||
|
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
|
||||||
|
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
|
||||||
|
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
|
||||||
|
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
|
||||||
|
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
|
||||||
|
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
|
||||||
|
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
|
||||||
|
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
|
||||||
|
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
|
||||||
|
|
#define YM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
      LOAD_CHIMU01i \
      VADDMEM(9,%r8,Chimu_00,Chi_00) \
      VADDMEM(10,%r8,Chimu_01,Chi_01) \
      VADDMEM(11,%r8,Chimu_02,Chi_02) \
      VSUBMEM(6,%r8,Chimu_10,Chi_10) \
      VSUBMEM(7,%r8,Chimu_11,Chi_11) \
      VSUBMEM(8,%r8,Chimu_12,Chi_12) );

#define ZM_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
      SHUF_CHIMU23i \
      LOAD_CHIi \
      VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20) \
      VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21) \
      VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22) \
      VACCTIMESI1(Chi_10,Chi_10,Chimu_30) \
      VACCTIMESI1(Chi_11,Chi_11,Chimu_31) \
      VACCTIMESI1(Chi_12,Chi_12,Chimu_32) \
      VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20) \
      VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21) \
      VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22) \
      VACCTIMESI2(Chi_10,Chi_10,Chimu_30) \
      VACCTIMESI2(Chi_11,Chi_11,Chimu_31) \
      VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );

#define TM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
      LOAD_CHIMU01i \
      VSUBMEM(6,%r8,Chimu_00,Chi_00) \
      VSUBMEM(7,%r8,Chimu_01,Chi_01) \
      VSUBMEM(8,%r8,Chimu_02,Chi_02) \
      VSUBMEM(9,%r8,Chimu_10,Chi_10) \
      VSUBMEM(10,%r8,Chimu_11,Chi_11) \
      VSUBMEM(11,%r8,Chimu_12,Chi_12) );

// fspin(0)=hspin(0)
// fspin(1)=hspin(1)
// fspin(2)=timesMinusI(hspin(1))
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
  VZERO(TMP) \
  VTIMESMINUSI0(UChi_00,result_30,TMP) \
  VTIMESMINUSI0(UChi_10,result_20,TMP) \
  VTIMESMINUSI0(UChi_01,result_31,TMP) \
  VTIMESMINUSI0(UChi_11,result_21,TMP) \
  VTIMESMINUSI0(UChi_02,result_32,TMP) \
  VTIMESMINUSI0(UChi_12,result_22,TMP) \
  VMOV(UChi_00,result_00) \
  VMOV(UChi_10,result_10) \
  VMOV(UChi_01,result_01) \
  VMOV(UChi_11,result_11) \
  VMOV(UChi_02,result_02) \
  VMOV(UChi_12,result_12) \
  VTIMESMINUSI1(UChi_10,result_20,TMP) \
  VTIMESMINUSI1(UChi_11,result_21,TMP) \
  VTIMESMINUSI1(UChi_12,result_22,TMP) \
  VTIMESMINUSI1(UChi_00,result_30,TMP) \
  VTIMESMINUSI1(UChi_01,result_31,TMP) \
  VTIMESMINUSI1(UChi_02,result_32,TMP) \
  VTIMESMINUSI2(UChi_10,result_20,TMP) \
  VTIMESMINUSI2(UChi_11,result_21,TMP) \
  VTIMESMINUSI2(UChi_12,result_22,TMP) \
  VTIMESMINUSI2(UChi_00,result_30,TMP) \
  VTIMESMINUSI2(UChi_01,result_31,TMP) \
  VTIMESMINUSI2(UChi_02,result_32,TMP) \
  );
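The four comment lines above describe the X+ reconstruction that XP_RECON performs lane by lane. A scalar sketch per colour component, with illustrative types only (not the Grid API):

#include <array>
#include <complex>

// fspin(0)=hspin(0); fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1)); fspin(3)=timesMinusI(hspin(0))
inline void spReconXpSketch(std::array<std::complex<double>,4> &f,
                            const std::array<std::complex<double>,2> &h) {
  auto timesMinusI = [](const std::complex<double> &z) {
    return std::complex<double>(z.imag(), -z.real());
  };
  f[0] = h[0];
  f[1] = h[1];
  f[2] = timesMinusI(h[1]);
  f[3] = timesMinusI(h[0]);
}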
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
  VZERO(TMP) \
  VACCTIMESMINUSI0(UChi_00,result_30,Z3) \
  VACCTIMESMINUSI0(UChi_10,result_20,Z0) \
  VACCTIMESMINUSI0(UChi_01,result_31,Z4) \
  VACCTIMESMINUSI0(UChi_11,result_21,Z1) \
  VACCTIMESMINUSI0(UChi_02,result_32,Z5) \
  VACCTIMESMINUSI0(UChi_12,result_22,Z2) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VACCTIMESMINUSI1(UChi_00,result_30,Z3) \
  VACCTIMESMINUSI1(UChi_10,result_20,Z0) \
  VACCTIMESMINUSI1(UChi_01,result_31,Z4) \
  VACCTIMESMINUSI1(UChi_11,result_21,Z1) \
  VACCTIMESMINUSI1(UChi_02,result_32,Z5) \
  VACCTIMESMINUSI1(UChi_12,result_22,Z2) \
  VACCTIMESMINUSI2(UChi_10,result_20,Z0) \
  VACCTIMESMINUSI2(UChi_11,result_21,Z1) \
  VACCTIMESMINUSI2(UChi_12,result_22,Z2) \
  VACCTIMESMINUSI2(UChi_00,result_30,Z3) \
  VACCTIMESMINUSI2(UChi_01,result_31,Z4) \
  VACCTIMESMINUSI2(UChi_02,result_32,Z5) \
  );

#define XM_RECON __asm__ ( \
  VZERO(TMP) \
  VTIMESI0(UChi_00,result_30,TMP) \
  VTIMESI0(UChi_10,result_20,TMP) \
  VTIMESI0(UChi_01,result_31,TMP) \
  VTIMESI0(UChi_11,result_21,TMP) \
  VTIMESI0(UChi_02,result_32,TMP) \
  VTIMESI0(UChi_12,result_22,TMP) \
  VMOV(UChi_00,result_00) \
  VMOV(UChi_10,result_10) \
  VMOV(UChi_01,result_01) \
  VMOV(UChi_11,result_11) \
  VMOV(UChi_02,result_02) \
  VMOV(UChi_12,result_12) \
  VTIMESI1(UChi_00,result_30,TMP) \
  VTIMESI1(UChi_10,result_20,TMP) \
  VTIMESI1(UChi_01,result_31,TMP) \
  VTIMESI1(UChi_11,result_21,TMP) \
  VTIMESI1(UChi_02,result_32,TMP) \
  VTIMESI1(UChi_12,result_22,TMP) \
  VTIMESI2(UChi_10,result_20,TMP) \
  VTIMESI2(UChi_11,result_21,TMP) \
  VTIMESI2(UChi_12,result_22,TMP) \
  VTIMESI2(UChi_00,result_30,TMP) \
  VTIMESI2(UChi_01,result_31,TMP) \
  VTIMESI2(UChi_02,result_32,TMP) \
  );

#define XM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_10,result_20,Z0) \
  VACCTIMESI0(UChi_00,result_30,Z3) \
  VACCTIMESI0(UChi_11,result_21,Z1) \
  VACCTIMESI0(UChi_01,result_31,Z4) \
  VACCTIMESI0(UChi_12,result_22,Z2) \
  VACCTIMESI0(UChi_02,result_32,Z5) \
  \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_12,result_12,result_12) \
  VADD(UChi_02,result_02,result_02) \
  \
  VACCTIMESI1(UChi_10,result_20,Z0) \
  VACCTIMESI1(UChi_00,result_30,Z3) \
  VACCTIMESI1(UChi_11,result_21,Z1) \
  VACCTIMESI1(UChi_01,result_31,Z4) \
  VACCTIMESI1(UChi_12,result_22,Z2) \
  VACCTIMESI1(UChi_02,result_32,Z5) \
  VACCTIMESI2(UChi_10,result_20,Z0) \
  VACCTIMESI2(UChi_11,result_21,Z1) \
  VACCTIMESI2(UChi_12,result_22,Z2) \
  VACCTIMESI2(UChi_00,result_30,Z3) \
  VACCTIMESI2(UChi_01,result_31,Z4) \
  VACCTIMESI2(UChi_02,result_32,Z5) \
  );
#define YP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VADD(UChi_10,result_20,result_20) \
  VADD(UChi_11,result_21,result_21) \
  VADD(UChi_12,result_22,result_22) \
  VSUB(UChi_00,result_30,result_30) \
  VSUB(UChi_01,result_31,result_31) \
  VSUB(UChi_02,result_32,result_32) );

#define YM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VSUB(UChi_10,result_20,result_20) \
  VSUB(UChi_11,result_21,result_21) \
  VSUB(UChi_12,result_22,result_22) \
  VADD(UChi_00,result_30,result_30) \
  VADD(UChi_01,result_31,result_31) \
  VADD(UChi_02,result_32,result_32) );

#define ZP_RECON_ACCUM __asm__ ( \
  VACCTIMESMINUSI0(UChi_00,result_20,Z0) \
  VACCTIMESI0(UChi_10,result_30,Z3) \
  VACCTIMESMINUSI0(UChi_01,result_21,Z1) \
  VACCTIMESI0(UChi_11,result_31,Z4) \
  VACCTIMESMINUSI0(UChi_02,result_22,Z2) \
  VACCTIMESI0(UChi_12,result_32,Z5) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VACCTIMESMINUSI1(UChi_00,result_20,Z0) \
  VACCTIMESI1(UChi_10,result_30,Z3) \
  VACCTIMESMINUSI1(UChi_01,result_21,Z1) \
  VACCTIMESI1(UChi_11,result_31,Z4) \
  VACCTIMESMINUSI1(UChi_02,result_22,Z2) \
  VACCTIMESI1(UChi_12,result_32,Z5) \
  VACCTIMESMINUSI2(UChi_00,result_20,Z0) \
  VACCTIMESMINUSI2(UChi_01,result_21,Z1) \
  VACCTIMESMINUSI2(UChi_02,result_22,Z2) \
  VACCTIMESI2(UChi_10,result_30,Z3) \
  VACCTIMESI2(UChi_11,result_31,Z4) \
  VACCTIMESI2(UChi_12,result_32,Z5) \
  );

#define ZM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_00,result_20,Z0) \
  VACCTIMESMINUSI0(UChi_10,result_30,Z3) \
  VACCTIMESI0(UChi_01,result_21,Z1) \
  VACCTIMESMINUSI0(UChi_11,result_31,Z4) \
  VACCTIMESI0(UChi_02,result_22,Z2) \
  VACCTIMESMINUSI0(UChi_12,result_32,Z5) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VACCTIMESI1(UChi_00,result_20,Z0) \
  VACCTIMESMINUSI1(UChi_10,result_30,Z3) \
  VACCTIMESI1(UChi_01,result_21,Z1) \
  VACCTIMESMINUSI1(UChi_11,result_31,Z4) \
  VACCTIMESI1(UChi_02,result_22,Z2) \
  VACCTIMESMINUSI1(UChi_12,result_32,Z5) \
  VACCTIMESI2(UChi_00,result_20,Z0) \
  VACCTIMESI2(UChi_01,result_21,Z1) \
  VACCTIMESI2(UChi_02,result_22,Z2) \
  VACCTIMESMINUSI2(UChi_10,result_30,Z3) \
  VACCTIMESMINUSI2(UChi_11,result_31,Z4) \
  VACCTIMESMINUSI2(UChi_12,result_32,Z5) \
  );

#define TP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VADD(UChi_00,result_20,result_20) \
  VADD(UChi_10,result_30,result_30) \
  VADD(UChi_01,result_21,result_21) \
  VADD(UChi_11,result_31,result_31) \
  VADD(UChi_02,result_22,result_22) \
  VADD(UChi_12,result_32,result_32) );

#define TM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VSUB(UChi_00,result_20,result_20) \
  VSUB(UChi_10,result_30,result_30) \
  VSUB(UChi_01,result_21,result_21) \
  VSUB(UChi_11,result_31,result_31) \
  VSUB(UChi_02,result_22,result_22) \
  VSUB(UChi_12,result_32,result_32) );

#define PREFETCH_CHIMU(A) \
  LOAD64(%r9,A) \
  __asm__ ( \
  VPREFETCHG(12,%r9) \
  VPREFETCHG(13,%r9) \
  VPREFETCHG(14,%r9) \
  VPREFETCHG(15,%r9) \
  VPREFETCHG(16,%r9) \
  VPREFETCHG(17,%r9) \
  VPREFETCHG(18,%r9) \
  VPREFETCHG(19,%r9) \
  VPREFETCHG(20,%r9) \
  VPREFETCHG(21,%r9) \
  VPREFETCHG(22,%r9) \
  VPREFETCHG(23,%r9) );

#define PERMUTE_DIR0 __asm__ ( \
  VPERM0(Chi_00,Chi_00) \
  VPERM0(Chi_01,Chi_01) \
  VPERM0(Chi_02,Chi_02) \
  VPERM0(Chi_10,Chi_10) \
  VPERM0(Chi_11,Chi_11) \
  VPERM0(Chi_12,Chi_12) );

#define PERMUTE_DIR1 __asm__ ( \
  VPERM1(Chi_00,Chi_00) \
  VPERM1(Chi_01,Chi_01) \
  VPERM1(Chi_02,Chi_02) \
  VPERM1(Chi_10,Chi_10) \
  VPERM1(Chi_11,Chi_11) \
  VPERM1(Chi_12,Chi_12) );

#define PERMUTE_DIR2 __asm__ ( \
  VPERM2(Chi_00,Chi_00) \
  VPERM2(Chi_01,Chi_01) \
  VPERM2(Chi_02,Chi_02) \
  VPERM2(Chi_10,Chi_10) \
  VPERM2(Chi_11,Chi_11) \
  VPERM2(Chi_12,Chi_12) );

#define PERMUTE_DIR3 __asm__ ( \
  VPERM3(Chi_00,Chi_00) \
  VPERM3(Chi_01,Chi_01) \
  VPERM3(Chi_02,Chi_02) \
  VPERM3(Chi_10,Chi_10) \
  VPERM3(Chi_11,Chi_11) \
  VPERM3(Chi_12,Chi_12) );
#define MULT_ADDSUB_2SPIN(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VPREFETCH2(9,%r8) \
  VPREFETCH2(10,%r8) \
  VPREFETCH2(11,%r8) \
  VPREFETCH2(12,%r8) \
  VPREFETCH2(13,%r8) \
  VPREFETCH2(14,%r8) \
  VPREFETCH2(15,%r8) \
  VPREFETCH2(16,%r8) \
  VPREFETCH2(17,%r8) \
  VSHUF(Chi_00,T1) \
  VMOVIDUP(0,%r8,Z0) \
  VMOVIDUP(3,%r8,Z1) \
  VMOVIDUP(6,%r8,Z2) VSHUF(Chi_10,T2) \
  /*6*/ \
  VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3) \
  VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4) \
  VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5) \
  VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0) \
  VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1) \
  VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2) \
  VPREFETCHG(0,%r9) \
  VPREFETCHG(1,%r9) \
  VPREFETCHG(2,%r9) \
  VPREFETCHG(3,%r9) \
  /*18*/ \
  VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
  VMADDSUB(Z3,Chi_10,UChi_10) \
  VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3) \
  VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
  VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4) \
  VMADDSUB(Z5,Chi_10,UChi_12) \
  VPREFETCHG(4,%r9) \
  VPREFETCHG(5,%r9) \
  VPREFETCHG(6,%r9) \
  VPREFETCHG(7,%r9) \
  /*28*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1) \
  VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2) \
  VPREFETCH2(12,%r9) \
  VPREFETCH2(13,%r9) \
  VPREFETCH2(14,%r9) \
  VPREFETCH2(15,%r9) \
  VPREFETCH2(16,%r9) \
  VPREFETCH2(17,%r9) \
  VPREFETCH2(18,%r9) \
  VPREFETCH2(19,%r9) \
  VPREFETCH2(20,%r9) \
  VPREFETCH2(21,%r9) \
  VPREFETCH2(22,%r9) \
  VPREFETCH2(23,%r9) \
  /*38*/ \
  VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
  VMADDSUB(Z3,Chi_11,UChi_10) \
  VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3) \
  VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
  VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4) \
  VMADDSUB(Z5,Chi_11,UChi_12) \
  VPREFETCHG(9,%r8) \
  VPREFETCHG(10,%r8) \
  VPREFETCHG(11,%r8) \
  VPREFETCHG(12,%r8) \
  VPREFETCHG(13,%r8) \
  VPREFETCHG(14,%r8) \
  VPREFETCHG(15,%r8) \
  VPREFETCHG(16,%r8) \
  VPREFETCHG(17,%r8) \
  /*48*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) \
  VMADDSUB(Z2,T2,UChi_12) \
  VPREFETCHG(8,%r9) \
  VPREFETCHG(9,%r9) \
  VPREFETCHG(10,%r9) \
  VPREFETCHG(11,%r9) \
  /*55*/ \
  VMADDSUB(Z3,Chi_02,UChi_00) \
  VMADDSUB(Z3,Chi_12,UChi_10) \
  VMADDSUB(Z4,Chi_02,UChi_01) \
  VMADDSUB(Z4,Chi_12,UChi_11) \
  VMADDSUB(Z5,Chi_02,UChi_02) \
  VMADDSUB(Z5,Chi_12,UChi_12) \
  /*61 insns*/ );

#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  VPREFETCHG(0,%r9) \
  VPREFETCHG(1,%r9) \
  VPREFETCHG(2,%r9) \
  VPREFETCHG(3,%r9) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  VPREFETCHG(4,%r9) \
  VPREFETCHG(5,%r9) \
  VPREFETCHG(6,%r9) \
  VPREFETCHG(7,%r9) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  VPREFETCHG(8,%r9) \
  VPREFETCHG(9,%r9) \
  VPREFETCHG(10,%r9) \
  VPREFETCHG(11,%r9) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  VPREFETCH2(12,%r9) \
  VPREFETCH2(13,%r9) \
  VPREFETCH2(14,%r9) \
  VPREFETCH2(15,%r9) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VPREFETCH2(16,%r9) \
  VPREFETCH2(17,%r9) \
  VPREFETCH2(18,%r9) \
  VPREFETCH2(19,%r9) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  VPREFETCH2(20,%r9) \
  VPREFETCH2(21,%r9) \
  VPREFETCH2(22,%r9) \
  VPREFETCH2(23,%r9) \
  VPREFETCHG(2,%r8) \
  VPREFETCHG(3,%r8) \
  VPREFETCH2(4,%r8) \
  VPREFETCH2(5,%r8) \
  /*42 insns*/ );

#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  /* VPREFETCHG(2,%r8)*/ \
  /* VPREFETCHG(3,%r8)*/ \
  /*42 insns*/ );

#define Z6 Chi_00
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  VSHUFMEM(0,%r8,Z0) \
  VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
  VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
  VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
  VSHUFMEM(3,%r8,Z0) \
  VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
  VSHUFMEM(6,%r8,Z0) \
  VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
  VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
  VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
  VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
  /*11 cycles*/ \
  VSHUFMEM(1,%r8,Z0) \
  VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
  VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
  VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
  VSHUFMEM(4,%r8,Z0) \
  VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
  VSHUFMEM(7,%r8,Z0) \
  VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
  VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
  VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
  VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
  /*22 cycles*/ \
  VSHUFMEM(2,%r8,Z0) \
  VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
  VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
  VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
  VSHUFMEM(5,%r8,Z0) \
  VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
  VSHUFMEM(8,%r8,Z0) \
  VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
  /*33 cycles*/ \
  VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
  VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
  VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
  /*stall*/ \
  /*stall*/ \
  /*stall*/ \
  VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
  VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
  VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )

#endif
@@ -103,9 +103,11 @@ void LebesgueOrder::IterateI(int ND,
     } else {
       for(int d=0;d<ND;d++){
         x[d]=xi[d]+xo[d];
+        // std::cout << x[d]<<" ";
       }
+      // std::cout << "\n";
       IndexInteger index;
-      grid->IndexFromCoor(x,index,grid->_rdimensions);
+      Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
       _LebesgueReorder.push_back(index);
     }
   }
@@ -188,6 +190,7 @@ void LebesgueOrder::ZGraph(void)
   }
   assert( _LebesgueReorder.size() == vol );

+  /*
   std::vector<int> coor(4);
   for(IndexInteger asite=0;asite<vol;asite++){
     grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@@ -198,5 +201,6 @@ void LebesgueOrder::ZGraph(void)
            << coor[3]<<"]"
            <<std::endl;
   }
+  */
 }
 }
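The hunks above swap the grid member IndexFromCoor for the static Lexicographic helper. For orientation, a minimal sketch of what a lexicographic index/coordinate conversion of this kind does; this is illustrative only, not Grid's actual Lexicographic class.

#include <vector>

struct LexicographicSketch {
  // index = x[0] + dims[0]*(x[1] + dims[1]*(x[2] + ...)); x[0] runs fastest
  static void IndexFromCoor(const std::vector<int> &coor, int &index,
                            const std::vector<int> &dims) {
    index = 0;
    for (int d = (int)dims.size()-1; d >= 0; d--) index = index*dims[d] + coor[d];
  }
  static void CoorFromIndex(std::vector<int> &coor, int index,
                            const std::vector<int> &dims) {
    coor.resize(dims.size());
    for (int d = 0; d < (int)dims.size(); d++) { coor[d] = index % dims[d]; index /= dims[d]; }
  }
};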
@@ -44,8 +44,8 @@ template<class vsimd,class scalar>
 inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y,
                     std::vector<scalar *> &extracted,int offset){
   // FIXME: bounce off memory is painful
+  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;

   scalar*buf = (scalar *)y;
@@ -59,8 +59,10 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y,
                   std::vector<scalar *> &extracted,int offset){

+  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
+
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
   // replicate n-fold. Use to allow Integer masks to
   // predicate floating point of various width assignments and maintain conformable.
@@ -85,6 +87,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
   scalar *buf = (scalar *)&y;
   for(int i=0;i<Nextr;i++){
     extracted[i]=buf[i*s];
+#ifdef PARANOID
     for(int ii=1;ii<s;ii++){
       if ( buf[i*s]!=buf[i*s+ii] ){
         std::cout<<GridLogMessage << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
@@ -96,6 +99,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
       }
       assert(buf[i*s]==buf[i*s+ii]);
     }
+#endif
   }

 };
@@ -106,7 +110,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type &y,std::vector<scalar> &extracted){
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
+  static const int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;
   scalar *buf = (scalar *)&y;

@@ -125,9 +129,9 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
+  static const int words=sizeof(vobj)/sizeof(vector_type);
   int Nextr=extracted.size();
-  const int words=sizeof(vobj)/sizeof(vector_type);
   int s=Nsimd/Nextr;

   std::vector<scalar_type *> pointers(Nextr);
@@ -148,8 +152,8 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int words=sizeof(vobj)/sizeof(vector_type);
-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();

   int Nextr=extracted.size();
   int s = Nsimd/Nextr;
@@ -172,8 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
+  static const int words=sizeof(vobj)/sizeof(vector_type);

   int Nextr = extracted.size();
   int splat=Nsimd/Nextr;
@@ -197,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
+  const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
   const int words=sizeof(vobj)/sizeof(vector_type);

   int Nextr=extracted.size();
@@ -224,20 +228,17 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);

-  scalar_type *pointer;
   scalar_type *vp = (scalar_type *)&vec;

   // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);

-  for(int i=0;i<Nsimd;i++){
-    pointer=(scalar_type *)&extracted[i][offset];
-    for(int w=0;w<words;w++){
-      vp[w*Nsimd+i] = pointer[w];
-    }
-  }
+  for(int w=0;w<words;w++){
+  for(int i=0;i<Nsimd;i++){
+    vp[w*Nsimd+i] = ((scalar_type *)&extracted[i][offset])[w];
+  }}
 }

 template<class vobj> inline
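The extract/merge changes above trade the runtime vsimd::Nsimd() call for a compile-time constant. The assumption this relies on, shown in isolation with stand-in types rather than Grid's vsimd classes:

#include <cstddef>

template <class vsimd, class scalar>
constexpr std::size_t lanes() { return sizeof(vsimd) / sizeof(scalar); }

struct FakeVec { float lane[8]; };  // stand-in for a 256-bit single-precision vector
static_assert(lanes<FakeVec,float>() == 8, "lane count == sizeof(vector)/sizeof(scalar)");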
@@ -18,7 +18,7 @@ TESTS=`ls T*.cc`
 TESTLIST=`echo ${TESTS} | sed s/.cc//g `

 echo > Make.inc
-echo bin_PROGRAMS = ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
+echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
 echo >> Make.inc

 for f in $TESTS
@@ -1,5 +1,5 @@

-bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd
+bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd


 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -50,6 +50,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 Test_cshift_red_black_LDADD=-lGrid


+Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
+Test_cshift_red_black_rotate_LDADD=-lGrid
+
+
+Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
+Test_cshift_rotate_LDADD=-lGrid
+
+
 Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
 Test_dwf_cg_prec_LDADD=-lGrid

@@ -90,6 +98,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 Test_dwf_lanczos_LDADD=-lGrid


+Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
+Test_dwf_rb5d_LDADD=-lGrid
+
+
 Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid

@@ -8,8 +8,20 @@ endif
 AM_CXXFLAGS = -I$(top_srcdir)/lib
 AM_LDFLAGS = -L$(top_builddir)/lib

+if USE_LAPACK
+AM_CXXFLAGS += -DUSE_LAPACK
+if USE_LAPACK_LIB
+#if test "X${ac_LAPACK}X" != XyesX
+AM_CXXFLAGS += -I$(ac_LAPACK)/include
+AM_LDFLAGS += -L$(ac_LAPACK)/lib
+#fi
+endif
+endif
+
 if BUILD_ZMM
 bin_PROGRAMS=Test_zmm
+else
+bin_PROGRAMS=
 endif

 include Make.inc
@@ -96,13 +96,13 @@ int main (int argc, char ** argv)
   std::vector<int> peer(4);
   Complex tmp =cm;
   Integer index=real(tmp);
-  Fine.CoorFromIndex(peer,index,latt_size);
+  Lexicographic::CoorFromIndex(peer,index,latt_size);

   if (nrm > 0){
     std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
   }
   }}}}
@@ -132,7 +132,7 @@ int main (int argc, char ** argv)
   std::vector<int> peer(4);
   Complex ctmp = cm;
   Integer index=real(ctmp);
-  Fine.CoorFromIndex(peer,index,latt_size);
+  Lexicographic::CoorFromIndex(peer,index,latt_size);

   if (nrm > 0){
     std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
@@ -140,7 +140,7 @@ int main (int argc, char ** argv)
         << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     exit(-1);
   }
@@ -180,7 +180,7 @@ int main (int argc, char ** argv)
   std::vector<int> peer(4);
   Complex ctmp=cmeo;
   Integer index=real(ctmp);
-  Fine.CoorFromIndex(peer,index,latt_size);
+  Lexicographic::CoorFromIndex(peer,index,latt_size);

   double nrm = abs(cmeo()()()-scm);
   if (nrm != 0) {
@@ -189,7 +189,7 @@ int main (int argc, char ** argv)
         << cmeo()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     exx=1;

@@ -205,7 +205,7 @@ int main (int argc, char ** argv)
         << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     exx=1;
   } else if (1) {
tests/Test_cshift_red_black_rotate.cc (new file, 223 lines)
@@ -0,0 +1,223 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_cshift_red_black.cc

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /* END LEGAL */
#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> latt_size = GridDefaultLatt();
  int Nd = latt_size.size();
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
  std::vector<int> mpi_layout = GridDefaultMpi();

  std::vector<int> mask(Nd,1);
  mask[0]=0;

  GridCartesian Fine (latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);

  GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice();

  LatticeComplex U(&Fine);
  LatticeComplex ShiftU(&Fine);
  LatticeComplex rbShiftU(&Fine);
  LatticeComplex Ue(&RBFine);
  LatticeComplex Uo(&RBFine);
  LatticeComplex ShiftUe(&RBFine);
  LatticeComplex ShiftUo(&RBFine);
  LatticeComplex lex(&Fine);
  lex=zero;
  Integer stride =1;
  {
    double nrm;
    LatticeComplex coor(&Fine);

    for(int d=0;d<Nd;d++){
      // Integer i=10000;
      Integer i=0;
      LatticeCoordinate(coor,d);
      lex = lex + coor*stride+i;
      stride=stride*latt_size[d];
    }
    U=lex;
  }

  pickCheckerboard(Even,Ue,U);
  pickCheckerboard(Odd,Uo,U);

  // std::cout<<GridLogMessage << U<<std::endl;
  std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
  std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;

  TComplex cm;
  TComplex cmeo;
  for(int dir=0;dir<Nd;dir++){
    // if ( dir!=1 ) continue;
    for(int shift=0;shift<latt_size[dir];shift++){

      std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;

      std::cout<<GridLogMessage<<"Even grid"<<std::endl;
      ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
      std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;

      std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
      ShiftUo = Cshift(Uo,dir,shift);
      std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;

      std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
      setCheckerboard(rbShiftU,ShiftUe);
      setCheckerboard(rbShiftU,ShiftUo);
      std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;

      std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
      ShiftU = Cshift(U,dir,shift);    // Shift everything
      std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;

      std::vector<int> coor(4);

      std::cout<<GridLogMessage << "Checking the non-checkerboard shift"<<std::endl;
      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,ShiftU,coor);

        /////////  double nrm=norm2(U);

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        double nrm = abs(scm-cm()()());
        std::vector<int> peer(4);
        Complex ctmp = cm;
        Integer index=real(ctmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        if (nrm > 0){
          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exit(-1);
        }
      }}}}

      int exx=0;
      std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,rbShiftU,coor);

        Integer checkerboard = RBFine.CheckerBoard(coor);

        // std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
        // std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
        // std::cout << "Uo " << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
        if ( checkerboard == ShiftUo.checkerboard ) {
          peekSite(cmeo,ShiftUo,coor);
        } else {
          peekSite(cmeo,ShiftUe,coor);
        }

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        std::vector<int> peer(4);
        Complex ctmp=cmeo;
        Integer index=real(ctmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        double nrm = abs(cmeo()()()-scm);
        if (nrm != 0) {
          std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cmeo()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exx=1;
        }

        ctmp=cm;
        index=real(ctmp);
        nrm = abs(scm-cm()()());

        if (nrm > 0){
          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exx=1;
        } else if (1) {
          std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
        }
      }}}}
      if (exx) exit(-1);

    }
  }

  Grid_finalize();
}
tests/Test_cshift_rotate.cc (new file, 125 lines)
@@ -0,0 +1,125 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_cshift.cc

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /* END LEGAL */
#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> latt_size = GridDefaultLatt();
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
  std::vector<int> mpi_layout = GridDefaultMpi();

  GridCartesian Fine(latt_size,simd_layout,mpi_layout);

  GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice();

  LatticeComplex U(&Fine);
  LatticeComplex ShiftU(&Fine);

  LatticeComplex lex(&Fine);
  lex=zero;
  Integer stride =1;
  {
    double nrm;
    LatticeComplex coor(&Fine);

    for(int d=0;d<4;d++){
      LatticeCoordinate(coor,d);
      lex = lex + coor*stride;
      stride=stride*latt_size[d];
    }
    U=lex;
  }

  TComplex cm;

  for(int dir=0;dir<4;dir++){
    for(int shift=0;shift<latt_size[dir];shift++){
      if ( Fine.IsBoss() )
        std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;

      ShiftU = Cshift(U,dir,shift);    // Shift everything

      /*
      std::cout << "U[0]" << U[0]<<std::endl;
      std::cout << "U[1]" << U[1]<<std::endl;
      std::cout << "ShiftU[0]" << ShiftU[0]<<std::endl;
      std::cout << "ShiftU[1]" << ShiftU[1]<<std::endl;
      */
      std::vector<int> coor(4);

      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,ShiftU,coor);

        double nrm=norm2(U);

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        nrm = abs(scm-cm()()());
        std::vector<int> peer(4);
        Complex tmp =cm;
        Integer index=real(tmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        if (nrm > 0){
          std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
        }
        /*
        else {
          std::cerr<<"PASS shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
        }
        */
      }}}}
    }
  }

  Grid_finalize();
}
@ -42,6 +42,8 @@ public:
|
|||||||
int, domaindecompose,
|
int, domaindecompose,
|
||||||
int, domainsize,
|
int, domainsize,
|
||||||
int, order,
|
int, order,
|
||||||
|
int, Ls,
|
||||||
|
double, mq,
|
||||||
double, lo,
|
double, lo,
|
||||||
double, hi,
|
double, hi,
|
||||||
int, steps);
|
int, steps);
|
||||||
@ -263,11 +265,6 @@ public:
|
|||||||
resid = norm2(r) /norm2(src);
|
resid = norm2(r) /norm2(src);
|
||||||
std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
|
std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
// Npoly*outer*2 1/2 vol matmuls.
|
|
||||||
// 71 iters => 20*71 = 1400 matmuls.
|
|
||||||
// 2*71 = 140 comms.
|
|
||||||
|
|
||||||
// Even domain solve
|
// Even domain solve
|
||||||
r= where(subset==(Integer)0,r,zz);
|
r= where(subset==(Integer)0,r,zz);
|
||||||
_SmootherOperator.AdjOp(r,vec1);
|
_SmootherOperator.AdjOp(r,vec1);
|
||||||
@ -332,7 +329,7 @@ public:
|
|||||||
CoarseVector Ctmp(_CoarseOperator.Grid());
|
CoarseVector Ctmp(_CoarseOperator.Grid());
|
||||||
CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
|
CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
|
||||||
|
|
||||||
ConjugateGradient<CoarseVector> CG(1.0e-3,100000);
|
ConjugateGradient<CoarseVector> CG(3.0e-3,100000);
|
||||||
// ConjugateGradient<FineField> fCG(3.0e-2,1000);
|
// ConjugateGradient<FineField> fCG(3.0e-2,1000);
|
||||||
|
|
||||||
HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
|
HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
|
||||||
@@ -345,14 +342,14 @@ public:

     // Chebyshev<FineField> Cheby (0.5,70.0,30,InverseApproximation);
     // Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
-    Chebyshev<FineField> Cheby (2.0,70.0,15,InverseApproximation);
-    Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
+    Chebyshev<FineField> Cheby (params.lo,params.hi,params.order,InverseApproximation);
+    Chebyshev<FineField> ChebyAccu(params.lo,params.hi,params.order,InverseApproximation);
     // Cheby.JacksonSmooth();
     // ChebyAccu.JacksonSmooth();

-    _Aggregates.ProjectToSubspace (Csrc,in);
-    _Aggregates.PromoteFromSubspace(Csrc,out);
-    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
+    // _Aggregates.ProjectToSubspace (Csrc,in);
+    // _Aggregates.PromoteFromSubspace(Csrc,out);
+    // std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;

     // ofstream fout("smoother");
     // Cheby.csv(fout);
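Note: the smoother's Chebyshev window and order now come from the parameter file instead of the hard-coded (2.0, 70.0, 15). As background, here is a standalone sketch of what such an approximation does: build Chebyshev coefficients for f(x)=1/x on [lo,hi] and evaluate them with the Clenshaw recurrence. This is plain C++, not Grid's Chebyshev class; the previously hard-coded values are used as the example window.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Chebyshev coefficients of f on [lo,hi], 'order' terms.
    std::vector<double> cheby_coeffs(double lo, double hi, int order, double (*f)(double)) {
      const double PI = std::acos(-1.0);
      std::vector<double> c(order);
      for (int k = 0; k < order; k++) {
        double s = 0.0;
        for (int j = 0; j < order; j++) {
          double y = std::cos(PI * (j + 0.5) / order);       // Chebyshev node in [-1,1]
          double x = 0.5 * (hi - lo) * y + 0.5 * (hi + lo);  // mapped to [lo,hi]
          s += f(x) * std::cos(PI * k * (j + 0.5) / order);
        }
        c[k] = 2.0 * s / order;
      }
      return c;
    }

    // Clenshaw evaluation of the truncated series at x in [lo,hi].
    double cheby_eval(const std::vector<double>& c, double lo, double hi, double x) {
      double y = (2.0 * x - lo - hi) / (hi - lo);
      double d = 0.0, dd = 0.0;
      for (int k = (int)c.size() - 1; k >= 1; k--) {
        double sv = d;
        d = 2.0 * y * d - dd + c[k];
        dd = sv;
      }
      return y * d - dd + 0.5 * c[0];
    }

    double inv(double x) { return 1.0 / x; }

    int main() {
      double lo = 2.0, hi = 70.0;   // the window previously hard-coded in the smoother
      int order = 15;
      std::vector<double> c = cheby_coeffs(lo, hi, order, inv);
      for (double x : {2.5, 10.0, 50.0})
        std::printf("x=%5.1f  1/x=%.6f  cheby=%.6f\n", x, 1.0 / x, cheby_eval(c, lo, hi, x));
      return 0;
    }

The order sets the polynomial degree, and hence roughly the number of fine-operator applications per smoother call, while [lo,hi] should bracket the part of the spectrum the smoother is meant to capture; that is the trade-off the new parameters expose to the input file.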
@@ -479,7 +476,7 @@ int main (int argc, char ** argv)
   read(RD,"params",params);
   std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;

-  const int Ls=8;
+  const int Ls=params.Ls;

   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -490,10 +487,12 @@ int main (int argc, char ** argv)
   ///////////////////////////////////////////////////
   // Construct a coarsened grid; utility for this?
   ///////////////////////////////////////////////////
-  const int block=2;
+  std::vector<int> block ({2,2,2,2});
+  const int nbasis= 32;

   std::vector<int> clatt = GridDefaultLatt();
   for(int d=0;d<clatt.size();d++){
-    clatt[d] = clatt[d]/block;
+    clatt[d] = clatt[d]/block[d];
   }
   GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
   GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
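Note: the blocking factor becomes a per-dimension vector, so anisotropic coarsenings (for example a different factor in the time direction) are now expressible. A small standalone sketch of the coarse-lattice computation, with the divisibility requirement made explicit; plain C++, and the fine-lattice extents are example values only.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> latt  = {16, 16, 16, 32};  // fine lattice (example values)
      std::vector<int> block = {2, 2, 2, 2};      // per-dimension blocking, as in the diff

      std::vector<int> clatt(latt);
      for (size_t d = 0; d < clatt.size(); d++) {
        assert(latt[d] % block[d] == 0);          // each dimension must block evenly
        clatt[d] = latt[d] / block[d];
      }
      // Each coarse site aggregates block[0]*block[1]*block[2]*block[3] fine sites.
      for (size_t d = 0; d < clatt.size(); d++)
        std::printf("dim %zu: fine %d -> coarse %d\n", d, latt[d], clatt[d]);
      return 0;
    }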
@@ -539,7 +538,7 @@ int main (int argc, char ** argv)
   // SU3::HotConfiguration(RNG4,Umu);
   // Umu=zero;

-  RealD mass=0.01;
+  RealD mass=params.mq;
   RealD M5=1.8;

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@@ -548,9 +547,6 @@ int main (int argc, char ** argv)
   DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
   DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

-  const int nbasis = 32;
-  // const int nbasis = 4;
-
   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
   typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> CoarseOperator;
   typedef CoarseOperator::CoarseVector CoarseVector;
@@ -564,7 +560,8 @@ int main (int argc, char ** argv)
   assert ( (nbasis & 0x1)==0);
   int nb=nbasis/2;
   std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
-  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  // Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
   for(int n=0;n<nb;n++){
     G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
     std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
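(Context for the loop above: the first nb vectors now come from a Lanczos-based subspace generator, and the loop fills the second half of the basis with G5R5 applied to each of them. Writing \Gamma for G5R5, which squares to the identity, the identity behind this doubling is

    P_{\pm} = \tfrac{1}{2}\,(1 \pm \Gamma), \qquad \operatorname{span}\{v,\ \Gamma v\} = \operatorname{span}\{P_{+}v,\ P_{-}v\},

so keeping v and \Gamma v for each of the nb vectors spans both chiral projections of every vector, which is why nbasis is asserted to be even.)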
@@ -600,7 +597,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
   ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
-  CG(PosdefLdop,c_src,c_res);
+  // CG(PosdefLdop,c_src,c_res);

   // std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   // std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
@@ -625,17 +622,17 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  Precon.SmootherTest(src);
+  // Precon.SmootherTest(src);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PreconDD.SmootherTest(src);
+  // PreconDD.SmootherTest(src);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PreconDD.SAP(src,result);
+  // PreconDD.SAP(src,result);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
@@ -663,18 +660,18 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
-  result=zero;
-  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  PGCRDD(HermIndefOp,src,result);
+  // PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
+  // result=zero;
+  // std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  // PGCRDD(HermIndefOp,src,result);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  // PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
-  // std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  // result=zero;
-  // PGCR(HermIndefOp,src,result);
+  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,8);
+  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  result=zero;
+  PGCR(HermIndefOp,src,result);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
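Note: with this commit the two-level solve runs through the multigrid-preconditioned PGCR (the domain-decomposed PGCRDD variant is commented out), and the final constructor argument drops from 128 to 8; the meaning of that argument is not visible in this diff. As background, here is a minimal dense-matrix sketch of preconditioned GCR — not Grid's PrecGeneralisedConjugateResidual, and with a plain Jacobi preconditioner standing in for the two-level preconditioner.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    using Vec = std::vector<double>;
    using Mat = std::vector<Vec>;

    Vec matvec(const Mat& A, const Vec& x) {
      Vec y(x.size(), 0.0);
      for (size_t i = 0; i < A.size(); i++)
        for (size_t j = 0; j < x.size(); j++) y[i] += A[i][j] * x[j];
      return y;
    }
    double dot(const Vec& a, const Vec& b) {
      double s = 0.0;
      for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i];
      return s;
    }
    // Jacobi "preconditioner": z = diag(A)^{-1} r
    Vec precondition(const Mat& A, const Vec& r) {
      Vec z(r.size());
      for (size_t i = 0; i < r.size(); i++) z[i] = r[i] / A[i][i];
      return z;
    }

    // Right-preconditioned GCR with full stored history; x starts at zero.
    void gcr(const Mat& A, const Vec& b, Vec& x, double tol, int maxit) {
      Vec r = b;
      std::vector<Vec> P, AP;                      // stored search directions and their images
      for (int it = 0; it < maxit; it++) {
        double rr = std::sqrt(dot(r, r) / dot(b, b));
        std::printf("GCR iter %d  relative residual %g\n", it, rr);
        if (rr < tol) return;
        Vec p  = precondition(A, r);
        Vec Ap = matvec(A, p);
        for (size_t k = 0; k < P.size(); k++) {    // A-orthogonalise against the history
          double beta = dot(Ap, AP[k]) / dot(AP[k], AP[k]);
          for (size_t i = 0; i < p.size(); i++) { p[i] -= beta * P[k][i]; Ap[i] -= beta * AP[k][i]; }
        }
        double alpha = dot(r, Ap) / dot(Ap, Ap);
        for (size_t i = 0; i < x.size(); i++) { x[i] += alpha * p[i]; r[i] -= alpha * Ap[i]; }
        P.push_back(p); AP.push_back(Ap);
      }
    }

    int main() {
      Mat A = {{4, 1, 0}, {1, 3, 1}, {0, 1, 5}};   // small test system
      Vec b = {1, 2, 3}, x(3, 0.0);
      gcr(A, b, x, 1.0e-10, 50);
      std::printf("solution: %g %g %g\n", x[0], x[1], x[2]);
      return 0;
    }

Each iteration orthogonalises the new preconditioned direction against the stored history; bounding or restarting that history, which is what the smaller final argument appears to do, caps the memory and orthogonalisation cost per iteration.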
Some files were not shown because too many files have changed in this diff.