mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge remote-tracking branch 'origin/develop' into temporary-smearing

Guido Cossu 2016-07-04 17:28:40 +01:00
commit 9cb90f714e
107 changed files with 7839 additions and 4572 deletions

2
.gitignore vendored

@ -94,7 +94,7 @@ Thumbs.db
# build directory #
###################
build/*
build*/*
# IDE related files #
#####################

90
.travis.yml Normal file

@ -0,0 +1,90 @@
language: cpp
cache:
directories:
- clang
matrix:
include:
- os: osx
osx_image: xcode7.2
compiler: clang
- os: osx
osx_image: xcode7.2
compiler: gcc
env: VERSION=-5
- compiler: gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- binutils-dev
env: VERSION=-4.9
- compiler: gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- binutils-dev
env: VERSION=-5
- compiler: clang
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
install:
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
script:
- ./scripts/reconfigure_script
- mkdir build
- cd build
- ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1


@ -1,4 +1,4 @@
# Grid
# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid)
Data parallel C++ mathematical object library
Last update 2015/7/30


@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
using namespace std;
using namespace Grid;
@ -45,6 +46,10 @@ struct scal {
};
bool overlapComms = false;
typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
int main (int argc, char ** argv)
{
@ -64,6 +69,12 @@ int main (int argc, char ** argv)
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
@ -78,7 +89,9 @@ int main (int argc, char ** argv)
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
@ -119,14 +132,21 @@ int main (int argc, char ** argv)
RealD NP = UGrid->_Nprocessors;
for(int doasm=1;doasm<2;doasm++){
QCD::WilsonKernelsStatic::AsmOpt=doasm;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=1000;
{
int ncall =10;
if (1) {
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
@ -140,9 +160,121 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
// Dw.Report();
}
if (1)
{
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
LatticeFermionF ssrc(sFGrid);
LatticeFermionF sref(sFGrid);
LatticeFermionF sresult(sFGrid);
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);
__SSC_STOP;
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
// sDw.Report();
if(0){
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}
RealF sum=0;
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF normal, simd;
peekSite(normal,result,site);
peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd);
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
}}}}}
std::cout<<" difference between normal and simd is "<<sum<<std::endl;
if (1) {
LatticeFermionF sr_eo(sFGrid);
LatticeFermionF serr(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
LatticeFermion sr_o (sFrbGrid);
pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
serr = sr_eo-ssrc;
std::cout<<GridLogMessage << "EO src norm diff "<< norm2(serr)<<std::endl;
sr_e = zero;
sr_o = zero;
double t0=usecond();
for(int i=0;i<ncall;i++){
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
sDw.Dhop (ssrc ,sresult,DaggerNo);
pickCheckerboard(Even,ssrc_e,sresult);
pickCheckerboard(Odd ,ssrc_o,sresult);
ssrc_e = ssrc_e - sr_e;
std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<<std::endl;
ssrc_o = ssrc_o - sr_o;
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<<std::endl;
}
}
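// The even/odd test above exploits that Dhop couples opposite parities only:
//   (Dhop psi)_e = D_eo psi_o ,  (Dhop psi)_o = D_oe psi_e ,
// so the full result projected onto each checkerboard must match DhopEO/DhopOE.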
if (1)
{ // Naive wilson dag implementation
@ -197,7 +329,6 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
@ -217,5 +348,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
}
Grid_finalize();
}
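A note on the figure of merit used throughout these benchmarks: one Dhop application costs 1344 flops per five-dimensional site, and with times in microseconds the ratio comes out directly in Mflop/s. A minimal standalone sketch of the same arithmetic (plain C++, no Grid dependency; the lattice size here is an example, not taken from the run above):

#include <cstdio>

int main(void) {
  const int latt[4] = {16, 16, 16, 16}; // example 4d local lattice
  const int Ls = 16;                    // fifth dimension, as in the benchmark
  double volume = Ls;
  for (int mu = 0; mu < 4; mu++) volume *= latt[mu];
  double flops = 1344.0 * volume;       // one Dhop call, 1344 flops/site
  double t_us  = 1.0e6;                 // pretend the call took one second
  std::printf("mflop/s = %f\n", flops / t_us);
  return 0;
}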


@ -0,0 +1,154 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
bool overlapComms = false;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
overlapComms = true;
}
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5,src);
LatticeFermion result(FGrid); result=zero;
LatticeFermion ref(FGrid); ref=zero;
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
if (1)
{
ref = zero;
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
typename DomainWallFermionR::ImplParams params;
params.overlapCommsCompute = overlapComms;
RealD NP = UGrid->_Nprocessors;
QCD::WilsonKernelsStatic::AsmOpt=1;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall =50;
if (1) {
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
// Dw.Report();
}
Grid_finalize();
}

172
benchmarks/Benchmark_zmm.cc Normal file

@ -0,0 +1,172 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_zmm.cc
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
using namespace Grid;
using namespace Grid::QCD;
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
int main(int argc,char **argv)
{
Grid_init(&argc,&argv);
std::ofstream os("zmm.dat");
os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
for(int L=4;L<=32;L+=4){
for(int m=1;m<=2;m++){
for(int Ls=8;Ls<=16;Ls+=8){
std::vector<int> grid({L,L,m*L,m*L});
for(int i=0;i<4;i++) {
std::cout << grid[i]<<"x";
}
std::cout << Ls<<std::endl;
bench(os,grid,Ls);
}
}
}
}
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
LatticeFermion src (FGrid);
LatticeFermion tmp (FGrid);
LatticeFermion srce(FrbGrid);
LatticeFermion resulto(FrbGrid); resulto=zero;
LatticeFermion resulta(FrbGrid); resulta=zero;
LatticeFermion junk(FrbGrid); junk=zero;
LatticeFermion diff(FrbGrid);
LatticeGaugeField Umu(UGrid);
double mfc, mfa, mfo, mfl1;
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
random(RNG5,src);
#if 1
random(RNG4,Umu);
#else
int mmu=2;
std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
if ( mu!=mmu ) U[mu] = zero;
if ( mu==mmu ) U[mu] = 1.0;
PokeIndex<LorentzIndex>(Umu,U[mu],mu);
}
#endif
pickCheckerboard(Even,srce,src);
RealD mass=0.1;
RealD M5 =1.8;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=50;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulto,0);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume/2;
mfc = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s = "<< mfc<<std::endl;
QCD::WilsonKernelsStatic::AsmOpt=1;
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulta,0);
}
t1=usecond();
mfa = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s = "<< mfa<<std::endl;
/*
int dag=DaggerNo;
t0=usecond();
for(int i=0;i<1;i++){
Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
}
t1=usecond();
mfo = flops*100/(t1-t0);
std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s = "<< mfo<<std::endl;
t0=usecond();
for(int i=0;i<1;i++){
Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
}
t1=usecond();
mfl1= flops*100/(t1-t0);
std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s = "<< mfl1<<std::endl;
os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
<< mfc<<" "
<< mfa<<" "
<< mfo<<" "
<< mfl1<<std::endl;
*/
#if 0
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
Dw.DhopOE(srce,resulta,0);
PerformanceCounter Counter(i);
Counter.Start();
Dw.DhopOE(srce,resulta,0);
Counter.Stop();
Counter.Report();
}
#endif
//resulta = (-0.5) * resulta;
diff = resulto-resulta;
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
std::cout<<std::endl;
return 0;
}
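DhopOE acts on a single checkerboard, i.e. on half of the five-dimensional sites, hence the halved count used in bench() above: flops per call = 1344 x Ls x Lx.Ly.Lz.Lt / 2.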


@ -1,5 +1,5 @@
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
Benchmark_comms_SOURCES=Benchmark_comms.cc
@ -10,6 +10,10 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
Benchmark_dwf_LDADD=-lGrid
Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
Benchmark_dwf_ntpf_LDADD=-lGrid
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
Benchmark_memory_asynch_LDADD=-lGrid
@ -25,3 +29,7 @@ Benchmark_su3_LDADD=-lGrid
Benchmark_wilson_SOURCES=Benchmark_wilson.cc
Benchmark_wilson_LDADD=-lGrid
Benchmark_zmm_SOURCES=Benchmark_zmm.cc
Benchmark_zmm_LDADD=-lGrid

109
configure vendored

@ -626,12 +626,18 @@ ac_subst_vars='am__EXEEXT_FALSE
am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
USE_LAPACK_LIB_FALSE
USE_LAPACK_LIB_TRUE
USE_LAPACK_FALSE
USE_LAPACK_TRUE
BUILD_CHROMA_REGRESSION_FALSE
BUILD_CHROMA_REGRESSION_TRUE
BUILD_COMMS_NONE_FALSE
BUILD_COMMS_NONE_TRUE
BUILD_COMMS_MPI_FALSE
BUILD_COMMS_MPI_TRUE
BUILD_COMMS_SHMEM_FALSE
BUILD_COMMS_SHMEM_TRUE
BUILD_ZMM_FALSE
BUILD_ZMM_TRUE
EGREP
@ -751,7 +757,9 @@ enable_simd
enable_precision
enable_comms
enable_rng
enable_timers
enable_chroma
enable_lapack
'
ac_precious_vars='build_alias
host_alias
@ -1410,7 +1418,9 @@ Optional Features:
--enable-comms=none|mpi Select communications
--enable-rng=ranlux48|mt19937
Select Random Number Generator to be used
--enable-timers=yes|no Enable system dependent high res timers
--enable-chroma Expect chroma compiled under c++11
--enable-lapack Enable lapack yes/no
Some influential environment variables:
CXX C++ compiler command
@ -6410,7 +6420,7 @@ if test "${enable_simd+set}" = set; then :
enableval=$enable_simd; \
ac_SIMD=${enable_simd}
else
ac_SIMD=AVX2
ac_SIMD=DEBUG
fi
@ -6477,7 +6487,7 @@ $as_echo "#define AVX512 1" >>confdefs.h
$as_echo "#define IMCI 1" >>confdefs.h
supported="cross compilation"
ac_ZMM=yes;
ac_ZMM=no;
;;
NEONv8)
echo Configuring for experimental ARMv8a support
@ -6561,12 +6571,26 @@ $as_echo "#define GRID_COMMS_NONE 1" >>confdefs.h
$as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
;;
shmem)
echo Configuring for SHMEM communications
$as_echo "#define GRID_COMMS_SHMEM 1" >>confdefs.h
;;
*)
as_fn_error $? "${ac_COMMS} unsupported --enable-comms option" "$LINENO" 5;
;;
esac
if test "X${ac_COMMS}X" == "XshmemX" ; then
BUILD_COMMS_SHMEM_TRUE=
BUILD_COMMS_SHMEM_FALSE='#'
else
BUILD_COMMS_SHMEM_TRUE='#'
BUILD_COMMS_SHMEM_FALSE=
fi
if test "X${ac_COMMS}X" == "XmpiX" ; then
BUILD_COMMS_MPI_TRUE=
BUILD_COMMS_MPI_FALSE='#'
@ -6610,6 +6634,34 @@ $as_echo "#define RNG_MT19937 1" >>confdefs.h
as_fn_error $? "${ac_RNG} unsupported --enable-rng option" "$LINENO" 5;
;;
esac
#
# SDE timing mode
#
# Check whether --enable-timers was given.
if test "${enable_timers+set}" = set; then :
enableval=$enable_timers; \
ac_TIMERS=${enable_timers}
else
ac_TIMERS=yes
fi
case ${ac_TIMERS} in
yes)
$as_echo "#define TIMERS_ON 1" >>confdefs.h
;;
no)
$as_echo "#define TIMERS_OFF 1" >>confdefs.h
;;
*)
as_fn_error $? "${ac_TIMERS} unsupported --enable-timers option" "$LINENO" 5;
;;
esac
#
# Chroma regression tests
#
@ -6642,6 +6694,46 @@ else
fi
#
# Lapack
#
# Check whether --enable-lapack was given.
if test "${enable_lapack+set}" = set; then :
enableval=$enable_lapack; ac_LAPACK=${enable_lapack}
else
ac_LAPACK=no
fi
case ${ac_LAPACK} in
yes)
echo Enabling lapack
;;
no)
echo Disabling lapack
;;
*)
echo Enabling lapack at ${ac_LAPACK}
;;
esac
if test "X${ac_LAPACK}X" != "XnoX" ; then
USE_LAPACK_TRUE=
USE_LAPACK_FALSE='#'
else
USE_LAPACK_TRUE='#'
USE_LAPACK_FALSE=
fi
if test "X${ac_LAPACK}X" != "XyesX" ; then
USE_LAPACK_LIB_TRUE=
USE_LAPACK_LIB_FALSE='#'
else
USE_LAPACK_LIB_TRUE='#'
USE_LAPACK_LIB_FALSE=
fi
###################################################################
# Checks for doxygen support
# if present enables the "make doxyfile" command
@ -6809,6 +6901,10 @@ if test -z "${BUILD_ZMM_TRUE}" && test -z "${BUILD_ZMM_FALSE}"; then
as_fn_error $? "conditional \"BUILD_ZMM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_SHMEM_TRUE}" && test -z "${BUILD_COMMS_SHMEM_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_SHMEM\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@ -6821,6 +6917,14 @@ if test -z "${BUILD_CHROMA_REGRESSION_TRUE}" && test -z "${BUILD_CHROMA_REGRESSI
as_fn_error $? "conditional \"BUILD_CHROMA_REGRESSION\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${USE_LAPACK_TRUE}" && test -z "${USE_LAPACK_FALSE}"; then
as_fn_error $? "conditional \"USE_LAPACK\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${USE_LAPACK_LIB_TRUE}" && test -z "${USE_LAPACK_LIB_FALSE}"; then
as_fn_error $? "conditional \"USE_LAPACK_LIB\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@ -8167,6 +8271,7 @@ The following features are enabled:
- communications type : ${ac_COMMS}
- default precision : ${ac_PRECISION}
- RNG choice : ${ac_RNG}
- LAPACK : ${ac_LAPACK}
"


@ -71,7 +71,7 @@ AC_CHECK_FUNCS([gettimeofday])
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
supported=no
@ -124,7 +124,7 @@ case ${ac_SIMD} in
echo Configuring for IMCI
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
supported="cross compilation"
ac_ZMM=yes;
ac_ZMM=no;
;;
NEONv8)
echo Configuring for experimental ARMv8a support
@ -178,11 +178,16 @@ case ${ac_COMMS} in
echo Configuring for MPI communications
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
;;
shmem)
echo Configuring for SHMEM communications
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
;;
*)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
;;
esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
@ -203,6 +208,25 @@ case ${ac_RNG} in
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;;
esac
#
# SDE timing mode
#
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
[Enable system dependent high res timers])],\
[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
case ${ac_TIMERS} in
yes)
AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
;;
no)
AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
#
# Chroma regression tests
#
@ -222,6 +246,26 @@ esac
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
#
# Lapack
#
AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
case ${ac_LAPACK} in
yes)
echo Enabling lapack
;;
no)
echo Disabling lapack
;;
*)
echo Enabling lapack at ${ac_LAPACK}
;;
esac
AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
###################################################################
# Checks for doxygen support
# if present enables the "make doxyfile" command
@ -265,6 +309,7 @@ The following features are enabled:
- communications type : ${ac_COMMS}
- default precision : ${ac_PRECISION}
- RNG choice : ${ac_RNG}
- LAPACK : ${ac_LAPACK}
"


@ -36,11 +36,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <malloc.h>
#endif
#include <immintrin.h>
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#endif
namespace Grid {
////////////////////////////////////////////////////////////////////
@ -72,21 +79,59 @@ public:
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* = 0)
pointer allocate(size_type __n, const void* _p= 0)
{
#ifdef GRID_COMMS_SHMEM
_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
#define PARANOID_SYMMETRIC_HEAP
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
#else
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif
#endif
_Tp tmp;
#undef FIRST_TOUCH_OPTIMISE
#ifdef FIRST_TOUCH_OPTIMISE
#pragma omp parallel for
for(int i=0;i<__n;i++){
ptr[i]=tmp;
}
#endif
return ptr;
}
void deallocate(pointer __p, size_type) {
#ifdef GRID_COMMS_SHMEM
shmem_free((void *)__p);
#else
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
#endif
}
void construct(pointer __p, const _Tp& __val) { };
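  // Typical use, sketched (assumption: this is Grid's alignedAllocator class
  // template; the name is not visible in this hunk). Containers built with it
  // draw storage from the SHMEM symmetric heap, or 128-byte aligned memory:
  //   std::vector<double, Grid::alignedAllocator<double>> data(1024);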


@ -1,180 +0,0 @@
/* lib/Config.h.in. Generated from configure.ac by autoheader. */
/* AVX Intrinsics */
#undef AVX1
/* AVX2 Intrinsics */
#undef AVX2
/* AVX512 Intrinsics for Knights Landing */
#undef AVX512
/* AVX Intrinsics with FMA4 */
#undef AVXFMA4
/* EMPTY_SIMD only for DEBUGGING */
#undef EMPTY_SIMD
/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
/* GRID_DEFAULT_PRECISION is DOUBLE */
#undef GRID_DEFAULT_PRECISION_DOUBLE
/* GRID_DEFAULT_PRECISION is SINGLE */
#undef GRID_DEFAULT_PRECISION_SINGLE
/* Support Altivec instructions */
#undef HAVE_ALTIVEC
/* Support AVX (Advanced Vector Extensions) instructions */
#undef HAVE_AVX
/* Support AVX2 (Advanced Vector Extensions 2) instructions */
#undef HAVE_AVX2
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
don't. */
#undef HAVE_DECL_BE64TOH
/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
*/
#undef HAVE_DECL_NTOHLL
/* Define to 1 if you have the <endian.h> header file. */
#undef HAVE_ENDIAN_H
/* Define to 1 if you have the <execinfo.h> header file. */
#undef HAVE_EXECINFO_H
/* Support FMA3 (Fused Multiply-Add) instructions */
#undef HAVE_FMA
/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY
/* Define to 1 if you have the <gmp.h> header file. */
#undef HAVE_GMP_H
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the <malloc.h> header file. */
#undef HAVE_MALLOC_H
/* Define to 1 if you have the <malloc/malloc.h> header file. */
#undef HAVE_MALLOC_MALLOC_H
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Support mmx instructions */
#undef HAVE_MMX
/* Define to 1 if you have the <mm_malloc.h> header file. */
#undef HAVE_MM_MALLOC_H
/* Support SSE (Streaming SIMD Extensions) instructions */
#undef HAVE_SSE
/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
#undef HAVE_SSE2
/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
#undef HAVE_SSE3
/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
#undef HAVE_SSE4_1
/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
#undef HAVE_SSE4_2
/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
#undef HAVE_SSSE3
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* IMCI Intrinsics for Knights Corner */
#undef IMCI
/* NEON ARMv8 Experimental support */
#undef NEONv8
/* Name of package */
#undef PACKAGE
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* RNG_MT19937 */
#undef RNG_MT19937
/* RNG_RANLUX */
#undef RNG_RANLUX
/* SSE4 Intrinsics */
#undef SSE4
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS
/* Version number of package */
#undef VERSION
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
#undef _UINT32_T
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
#undef _UINT64_T
/* Define to `unsigned int' if <sys/types.h> does not define. */
#undef size_t
/* Define to the type of an unsigned integer type of width exactly 32 bits if
such a type exists and the standard includes do not define it. */
#undef uint32_t
/* Define to the type of an unsigned integer type of width exactly 64 bits if
such a type exists and the standard includes do not define it. */
#undef uint64_t


@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI
#include <cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif
#endif


@ -62,10 +62,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <serialisation/Serialisation.h>
#include <Config.h>
#include <Timer.h>
#include <PerfCount.h>
#include <Log.h>
#include <AlignedAllocator.h>
#include <Simd.h>
#include <Threads.h>
#include <Lexicographic.h>
#include <Communicator.h>
#include <Cartesian.h>
#include <Tensors.h>


@ -45,12 +45,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <algorithm>
#include <iterator>
#define __X86_64
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
namespace Grid {
//////////////////////////////////////////////////////
@ -150,6 +144,10 @@ void GridParseLayout(char **argv,int argc,
}
if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
std::vector<int> ompthreads(0);
#ifndef GRID_OMP
std::cout << GridLogWarning << "'--threads' option used but Grid was"
<< " not compiled with thread support" << std::endl;
#endif
arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
GridCmdOptionIntVector(arg,ompthreads);
assert(ompthreads.size()==1);
@ -174,9 +172,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
/////////////////////////////////////////////////////////
void Grid_init(int *argc,char ***argv)
{
#ifdef GRID_COMMS_MPI
MPI_Init(argc,argv);
#endif
CartesianCommunicator::Init(argc,argv);
// Parse command line args.
GridLogger::StopWatch.Start();
@ -194,9 +191,10 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
std::cout<<GridLogMessage<<"--omp n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
exit(EXIT_SUCCESS);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
@ -213,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
Grid_quiesce_nodes();
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
QCD::WilsonFermionStatic::HandOptDslash=1;
QCD::WilsonFermion5DStatic::HandOptDslash=1;
QCD::WilsonKernelsStatic::HandOpt=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
@ -287,13 +284,7 @@ void Grid_finalize(void)
Grid_unquiesce_nodes();
#endif
}
double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
#define _NBACKTRACE (256)
void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
@ -303,13 +294,13 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
printf(" code %d\n",si->si_code);
// Linux/Posix
#ifdef __linux__
#ifdef __linux__
// And x86 64bit
ucontext_t * uc= (ucontext_t *)ptr;
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
@ -330,17 +321,15 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
REG(r14);
REG(r15);
#endif
#ifdef HAVE_EXECINFO_H
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
for (int i = 0; i < symbols; i++){
printf ("%s\n", strings[i]);
}
#endif
BACKTRACE();
exit(0);
return;
};
#ifdef GRID_FPE
#define _GNU_SOURCE
#include <fenv.h>
#endif
void Grid_debug_handler_init(void)
{
struct sigaction sa,osa;
@ -349,5 +338,9 @@ void Grid_debug_handler_init(void)
sa.sa_flags = SA_SIGINFO;
sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
#ifdef GRID_FPE
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
#endif
}
}

32
lib/Lexicographic.h Normal file

@ -0,0 +1,32 @@
#ifndef GRID_LEXICOGRAPHIC_H
#define GRID_LEXICOGRAPHIC_H
namespace Grid{
class Lexicographic {
public:
static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
int nd= dims.size();
coor.resize(nd);
for(int d=0;d<nd;d++){
coor[d] = index % dims[d];
index = index / dims[d];
}
}
static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
int nd=dims.size();
int stride=1;
index=0;
for(int d=0;d<nd;d++){
index = index+stride*coor[d];
stride=stride*dims[d];
}
}
};
}
#endif
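A standalone round trip through the new helper (a sketch; it assumes Grid's include path so that <Lexicographic.h> resolves; note IndexFromCoor takes the coordinate vector by non-const reference although it does not modify it):

#include <cassert>
#include <vector>
#include <Lexicographic.h>

int main(void) {
  std::vector<int> dims({4, 4, 4, 8});
  std::vector<int> coor({1, 2, 3, 5});
  int index = 0;
  Grid::Lexicographic::IndexFromCoor(coor, index, dims); // dims[0] runs fastest
  std::vector<int> back;
  Grid::Lexicographic::CoorFromIndex(back, index, dims);
  assert(back == coor); // lexicographic round trip
  return 0;
}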


@ -73,13 +73,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void)
{
int me=0;
#ifdef GRID_COMMS_MPI
int me;
MPI_Comm_rank(MPI_COMM_WORLD,&me);
#endif
#ifdef GRID_COMMS_SHMEM
me = shmem_my_pe();
#endif
if ( me ) {
std::cout.setstate(std::ios::badbit);
}
#endif
}
void Grid_unquiesce_nodes(void)

209
lib/Log.h

@ -32,113 +32,150 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LOG_H
#define GRID_LOG_H
namespace Grid {
// Dress the output; use std::chrono for time stamping via the StopWatch class
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
namespace Grid {
// Dress the output; use std::chrono for time stamping via the StopWatch class
int Rank(void); // used for early stage debug before library init
class Colours{
protected:
bool is_active;
public:
std::map<std::string, std::string> colour;
Colours(bool activate=false){
Active(activate);
};
class Colours{
protected:
bool is_active;
public:
std::map<std::string, std::string> colour;
void Active(bool activate){
is_active=activate;
if (is_active){
colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m";
colour["GREEN"] ="\033[32m";
colour["YELLOW"] ="\033[33m";
colour["BLUE"] ="\033[34m";
colour["PURPLE"] ="\033[35m";
colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m";
} else {
colour["BLACK"] ="";
colour["RED"] ="";
colour["GREEN"] ="";
colour["YELLOW"]="";
colour["BLUE"] ="";
colour["PURPLE"]="";
colour["CYAN"] ="";
colour["WHITE"] ="";
colour["NORMAL"]="";
}
};
Colours(bool activate=false){
Active(activate);
};
class Logger {
protected:
Colours &Painter;
int active;
std::string name, topName;
std::string COLOUR;
public:
static GridStopWatch StopWatch;
static std::ostream devnull;
std::string background() {return Painter.colour["NORMAL"];}
std::string evidence() {return Painter.colour["YELLOW"];}
std::string colour() {return Painter.colour[COLOUR];}
Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
: active(on),
name(nm),
topName(topNm),
Painter(col_class),
COLOUR(col){} ;
void Active(bool activate){
is_active=activate;
if (is_active){
colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m";
colour["GREEN"] ="\033[32m";
colour["YELLOW"] ="\033[33m";
colour["BLUE"] ="\033[34m";
colour["PURPLE"] ="\033[35m";
colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m";
} else {
colour["BLACK"] ="";
colour["RED"] ="";
colour["GREEN"] ="";
colour["YELLOW"]="";
colour["BLUE"] ="";
colour["PURPLE"]="";
colour["CYAN"] ="";
colour["WHITE"] ="";
colour["NORMAL"]="";
}
};
};
class Logger {
protected:
Colours &Painter;
int active;
std::string name, topName;
std::string COLOUR;
public:
static GridStopWatch StopWatch;
static std::ostream devnull;
std::string background() {return Painter.colour["NORMAL"];}
std::string evidence() {return Painter.colour["YELLOW"];}
std::string colour() {return Painter.colour[COLOUR];}
Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
: active(on),
name(nm),
topName(topNm),
Painter(col_class),
COLOUR(col){} ;
void Active(int on) {active = on;};
int isActive(void) {return active;};
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << log.background()<< log.topName << log.background()<< " : ";
stream << log.colour() <<std::setw(10) << std::left << log.name << log.background() << " : ";
stream << log.evidence()<< now << log.background() << " : " << log.colour();
return stream;
} else {
return devnull;
}
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << log.background()<< log.topName << log.background()<< " : ";
stream << log.colour() <<std::setw(10) << std::left << log.name << log.background() << " : ";
stream << log.evidence()<< now << log.background() << " : " << log.colour();
return stream;
} else {
return devnull;
}
}
};
class GridLogger: public Logger {
public:
GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
Logger("Grid", on, nm, col_class, col_key){};
Logger("Grid", on, nm, col_class, col_key){};
};
void GridLogConfigure(std::vector<std::string> &logstreams);
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern Colours GridLogColours;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern Colours GridLogColours;
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() {\
char string[20]; \
std::sprintf(string,"backtrace.%d",Rank()); \
std::FILE * fp = std::fopen(string,"w"); \
BACKTRACEFP(fp)\
std::fclose(fp); \
}
#ifdef HAVE_EXECINFO_H
#define BACKTRACEFP(fp) { \
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
for (int i = 0; i < symbols; i++){\
std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
}\
}
#else
#define BACKTRACEFP(fp) { \
std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
}
#endif
#define BACKTRACE() BACKTRACEFP(stdout)
}
#endif
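The two BACKTRACEFP variants give a graceful fallback: with execinfo.h a symbolised stack is written, otherwise only a few raw return addresses. A usage sketch, assuming the macros above are in scope and Rank() resolves at link time:

  BACKTRACEFILE(); // dumps the stack to "backtrace.<rank>", one file per rank
  BACKTRACE();     // or stream the same information to stdout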

File diff suppressed because one or more lines are too long


@ -6,6 +6,10 @@ if BUILD_COMMS_MPI
extra_sources+=communicator/Communicator_mpi.cc
endif
if BUILD_COMMS_SHMEM
extra_sources+=communicator/Communicator_shmem.cc
endif
if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc
endif

BIN
lib/Old/Endeavour.tgz Normal file

Binary file not shown.


@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
#ifdef __linux__
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS.."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS......."},
// { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS...."}
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
// 11
#else
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
// 11
#endif
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
//15
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
//19
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif
};
}


@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <ctime>
#include <chrono>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#ifdef __linux__
@ -43,8 +43,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#else
#include <sys/syscall.h>
#endif
namespace Grid {
namespace Grid {
#ifdef __linux__
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
@ -58,6 +58,49 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
}
#endif
#ifdef TIMERS_OFF
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/
#ifdef __bgq__
inline uint64_t cyclecount(void){
uint64_t tmp;
asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) );
return tmp;
}
#elif defined __x86_64__
#include <x86intrin.h>
inline uint64_t cyclecount(void){
return __rdtsc();
// unsigned int dummy;
// return __rdtscp(&dummy);
}
#else
inline uint64_t cyclecount(void){
return 0;
}
#endif
#endif
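// cyclecount() is a cheap architecture-dependent cycle stamp: mfspr on BG/Q,
// rdtsc on x86-64, and 0 elsewhere (or when TIMERS_OFF selects SDE-mark mode).
// A sketch of typical use:
//   uint64_t c0 = cyclecount();
//   /* ... region to be timed ... */
//   uint64_t c1 = cyclecount(); // c1 - c0 = elapsed cycles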
class PerformanceCounter {
private:
@ -67,6 +110,7 @@ private:
uint32_t type;
uint64_t config;
const char *name;
int normalisation;
} PerformanceCounterConfig;
static const PerformanceCounterConfig PerformanceCounterConfigs [];
@ -74,26 +118,12 @@ private:
public:
enum PerformanceCounterType {
CPUCYCLES=0,
INSTRUCTIONS,
// STALL_CYCLES,
CACHE_REFERENCES,
CACHE_MISSES,
L1D_READ_MISS,
L1D_READ_ACCESS,
L1D_WRITE_MISS,
L1D_WRITE_ACCESS,
L1D_PREFETCH_MISS,
L1D_PREFETCH_ACCESS,
LL_READ_MISS,
// LL_READ_ACCESS,
LL_WRITE_MISS,
LL_WRITE_ACCESS,
LL_PREFETCH_MISS,
LL_PREFETCH_ACCESS,
L1I_READ_MISS,
L1I_READ_ACCESS,
PERFORMANCE_COUNTER_NUM_TYPES
CACHE_REFERENCES=0,
CACHE_MISSES=1,
CPUCYCLES=2,
INSTRUCTIONS=3,
L1D_READ_ACCESS=4,
PERFORMANCE_COUNTER_NUM_TYPES=19
};
public:
@ -101,7 +131,9 @@ public:
int PCT;
long long count;
long long cycles;
int fd;
int cyclefd;
unsigned long long elapsed;
uint64_t begin;
@ -114,7 +146,9 @@ public:
assert(_pct>=0);
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
fd=-1;
cyclefd=-1;
count=0;
cycles=0;
PCT =_pct;
Open();
#endif
@ -139,6 +173,15 @@ public:
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
int norm = PerformanceCounterConfigs[PCT].normalisation;
pe.type = PerformanceCounterConfigs[norm].type;
pe.config= PerformanceCounterConfigs[norm].config;
name = PerformanceCounterConfigs[norm].name;
cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
if (cyclefd == -1) {
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
#endif
}
@ -146,10 +189,12 @@ public:
{
#ifdef __linux__
if ( fd!= -1) {
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
}
begin =__rdtsc();
begin =cyclecount();
#else
begin = 0;
#endif
@ -157,12 +202,15 @@ public:
void Stop(void) {
count=0;
cycles=0;
#ifdef __linux__
if ( fd!= -1) {
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
::read(fd, &count, sizeof(long long));
::read(cyclefd, &cycles, sizeof(long long));
}
elapsed = __rdtsc() - begin;
elapsed = cyclecount() - begin;
#else
elapsed = 0;
#endif
@ -170,16 +218,20 @@ public:
}
void Report(void) {
#ifdef __linux__
printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
int N = PerformanceCounterConfigs[PCT].normalisation;
const char * sn = PerformanceCounterConfigs[N].name ;
const char * sc = PerformanceCounterConfigs[PCT].name;
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles);
#else
printf("%llu cycles \n", elapsed );
std::printf("%llu cycles \n", elapsed );
#endif
}
~PerformanceCounter()
{
#ifdef __linux__
close(fd);
::close(fd); ::close(cyclefd);
#endif
}


@ -42,10 +42,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<1)|(H))
#define _MM_SELECT_FOUR_TWO(A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO(A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
#define RotateBit (0x100)
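// Worked example of the selector packing (assuming the usual shuffle-immediate
// layout): _MM_SELECT_FOUR_FOUR(3,2,1,0) = (3<<6)|(2<<4)|(1<<2)|0 = 0xE4,
// the identity permutation immediate for a four-element shuffle.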
namespace Grid {
typedef uint32_t Integer;

File diff suppressed because it is too large


@ -39,11 +39,18 @@ namespace Grid {
// Dress the output; use std::chrono
// C++11 time facilities better?
double usecond(void);
inline double usecond(void) {
struct timeval tv = {0,0}; // zero-init: with TIMERS_OFF the gettimeofday call is skipped
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::milliseconds GridTime;
typedef std::chrono::microseconds GridUsecs;
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
{
@ -55,29 +62,39 @@ class GridStopWatch {
private:
bool running;
GridTimePoint start;
GridTime accumulator;
GridUsecs accumulator;
public:
GridStopWatch () {
Reset();
}
void Start(void) {
assert(running == false);
#ifdef TIMERS_ON
start = GridClock::now();
#endif
running = true;
}
void Stop(void) {
assert(running == true);
accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start);
#ifdef TIMERS_ON
accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
#endif
running = false;
};
void Reset(void){
running = false;
#ifdef TIMERS_ON
start = GridClock::now();
accumulator = std::chrono::duration_cast<GridTime>(start-start);
#endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
}
GridTime Elapsed(void) {
assert(running == false);
return accumulator;
return std::chrono::duration_cast<GridTime>( accumulator );
}
uint64_t useconds(void){
assert(running == false);
return (uint64_t) accumulator.count();
}
};
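// Usage sketch: the watch keeps its interface when TIMERS_OFF is set, so
// timing code needs no #ifdef guards; Elapsed() simply reports zero there.
//   GridStopWatch w;
//   w.Start(); /* ... work ... */ w.Stop();
//   std::cout << w.Elapsed() << " (" << w.useconds() << " us)" << std::endl;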


@ -147,6 +147,56 @@ namespace Grid {
}
Orthogonalise();
}
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
// Run a Lanczos with sloppy convergence
const int Nstop = nn;
const int Nk = nn+20;
const int Np = nn+20;
const int Nm = Nk+Np;
const int MaxIt= 10000;
RealD resid = 1.0e-3;
Chebyshev<FineField> Cheb(0.5,64.0,21);
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
// IRL.lock = 1;
FineField noise(FineGrid); gaussian(RNG,noise);
FineField tmp(FineGrid);
std::vector<RealD> eval(Nm);
std::vector<FineField> evec(Nm,FineGrid);
int Nconv;
IRL.calc(eval,evec,
noise,
Nconv);
// pull back nn vectors
for(int b=0;b<nn;b++){
subspace[b] = evec[b];
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
noise = tmp - sqrt(eval[b])*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
noise = tmp + eval[b]*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
}
Orthogonalise();
for(int b=0;b<nn;b++){
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
}
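  // Diagnostic intent above: for a converged eigenpair (lambda_b, v_b) the
  // residual || (M - lambda_b) v_b || should be small (the printout applies a
  // sqrt(lambda) convention); large values flag a sloppy subspace vector.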
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
@ -200,7 +250,7 @@ namespace Grid {
////////////////////
Geometry geom;
GridBase * _grid;
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
CartesianStencil<siteVector,siteVector> Stencil;
std::vector<CoarseMatrix> A;


@ -222,6 +222,7 @@ namespace Grid {
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
// std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
@ -251,10 +252,10 @@ namespace Grid {
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp);
_Mat.Meooe(tmp,out);
_Mat.MooeeInv(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
@ -270,6 +271,35 @@ namespace Grid {
}
};
template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_Mat.MooeeInv(in,out);
_Mat.Meooe(out,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
_Mat.MeooeDag(in,out);
_Mat.MooeeInvDag(out,tmp);
_Mat.MeooeDag(tmp,out);
_Mat.MooeeInvDag(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
};
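For reference, the Schur operators above all derive from the even/odd block decomposition of the Dirac matrix; assuming the usual conventions,

M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix},
\qquad
\hat{M} = M_{oo} - M_{oe}\,M_{ee}^{-1}\,M_{eo},

and the Mpc bodies apply, up to the factored-out diagonal block,

M_{pc}^{\mathrm{DiagMooee}} = 1 - M_{oo}^{-1}\,M_{oe}\,M_{ee}^{-1}\,M_{eo},
\qquad
M_{pc}^{\mathrm{DiagTwo}} = 1 - M_{oe}\,M_{ee}^{-1}\,M_{eo}\,M_{oo}^{-1}.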
/////////////////////////////////////////////////////////////
// Base classes for functions of operators

View File

@ -58,13 +58,14 @@ namespace Grid {
Field Mtmp(in._grid);
AtoN = in;
out = AtoN*Coeffs[0];
// std::cout <<"Poly in " <<norm2(in)<<std::endl;
// std::cout <<"0 " <<norm2(out)<<std::endl;
// std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
// std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
// std::cout << n<<" " <<norm2(out)<<std::endl;
// std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
// std::cout << n<<" " <<norm2(out)<<std::endl;
}
};
};
@ -82,7 +83,8 @@ namespace Grid {
public:
void csv(std::ostream &out){
for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
RealD diff = hi-lo;
for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
@ -99,10 +101,24 @@ namespace Grid {
Chebyshev(){};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
void Init(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
Coeffs.assign(order,0.);
Coeffs[order-1] = 1.;
};
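A scalar sketch (not Grid's implementation; names ours) of what this Init selects: with Coeffs = {0,...,0,1} the Chebyshev sum collapses to the single top-order polynomial T_{order-1} of the operator mapped onto [-1,1], which is what the Lanczos path wants as a spectral filter. Normalisation details (e.g. a 1/2 factor on c0) are elided since c0 = 0 here.

#include <cstddef>
#include <iostream>
#include <vector>

double chebEval(double x, double lo, double hi, const std::vector<double>& c) {
  double y = (2.0 * x - (hi + lo)) / (hi - lo);  // map [lo,hi] -> [-1,1]
  double Tprev = 1.0, T = y;
  double sum = c[0] + (c.size() > 1 ? c[1] * y : 0.0);
  for (std::size_t n = 2; n < c.size(); ++n) {
    double Tnext = 2.0 * y * T - Tprev;          // three-term recurrence
    sum += c[n] * Tnext;
    Tprev = T;
    T = Tnext;
  }
  return sum;
}

int main() {
  std::vector<double> c(21, 0.0);
  c.back() = 1.0;                                // Coeffs[order-1] = 1
  std::cout << chebEval(32.25, 0.5, 64.0, c) << std::endl;  // T_20 at y=0, i.e. 1
}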
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
@ -182,6 +198,8 @@ namespace Grid {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
//std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();

View File

@ -16,9 +16,13 @@
#define INCLUDED_ALG_REMEZ_H
#include <stddef.h>
#include <Config.h>
//#include <algorithms/approx/bigfloat.h>
#ifdef HAVE_GMP_H
#include <algorithms/approx/bigfloat.h>
#else
#include <algorithms/approx/bigfloat_double.h>
#endif
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
#define SUM_MAX 10 // Maximum number of terms in exponential

View File

@ -84,7 +84,7 @@ public:
return;
}
std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
@ -101,8 +101,8 @@ public:
MatrixTimer.Stop();
LinalgTimer.Start();
RealD qqck = norm2(mmp);
ComplexD dck = innerProduct(p,mmp);
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);
a = c/d;
b_pred = a*(a*qq-d)/c;
@ -115,7 +115,7 @@ public:
p = p*b+r;
LinalgTimer.Stop();
std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
// Stopping condition
if ( cp <= rsq ) {
@ -132,9 +132,9 @@ public:
std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
<<" computed residual "<<sqrt(cp/ssq)
<<" true residual "<<true_residual
<<" target "<<Tolerance;
std::cout<<" Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
<<" true residual " <<true_residual
<<" target "<<Tolerance<<std::endl;
std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
std::cout<<std::endl;
assert(true_residual/Tolerance < 1000.0);

View File

@ -274,7 +274,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
// assert(0);
}
};

View File

@ -38,32 +38,34 @@ template<class Field>
class SortEigen {
private:
//hacking for testing for now
private:
static bool less_lmd(RealD left,RealD right){
return fabs(left) < fabs(right);
return left > right;
}
static bool less_pair(std::pair<RealD,Field>& left,
std::pair<RealD,Field>& right){
return fabs(left.first) < fabs(right.first);
static bool less_pair(std::pair<RealD,Field const*>& left,
std::pair<RealD,Field const*>& right){
return left.first > (right.first);
}
public:
void push(DenseVector<RealD>& lmd,
DenseVector<Field>& evec,int N) {
DenseVector<std::pair<RealD, Field> > emod;
typename DenseVector<std::pair<RealD, Field> >::iterator it;
DenseVector<Field>& evec,int N) {
DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
for(int i=0;i<lmd.size();++i){
emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
}
DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());
for(int i=0;i<lmd.size();++i)
emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
it=emod.begin();
typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
for(int i=0;i<N;++i){
lmd[i]=it->first;
evec[i]=it->second;
evec[i]=*(it->second);
++it;
}
}
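A sketch of the pointer-pair idiom this hunk introduces (BigVec and pushSorted are stand-ins for a lattice Field and the push method): partial_sort then shuffles small (value, pointer) pairs instead of deep-copying large fields, and the comparator's > gives the new descending eigenvalue order.

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

struct BigVec { std::vector<double> data; };

void pushSorted(std::vector<double>& lmd, std::vector<BigVec>& evec, int N) {
  std::vector<BigVec> cpy = evec;  // mirrors the diff's one up-front cpy buffer
  std::vector<std::pair<double, const BigVec*>> emod(lmd.size());
  for (std::size_t i = 0; i < lmd.size(); ++i)
    emod[i] = std::make_pair(lmd[i], &cpy[i]);
  std::partial_sort(emod.begin(), emod.begin() + N, emod.end(),
                    [](const auto& l, const auto& r) { return l.first > r.first; });
  for (int i = 0; i < N; ++i) {
    lmd[i]  = emod[i].first;       // descending eigenvalues
    evec[i] = *emod[i].second;     // write back through the pointers
  }
}

int main() {
  std::vector<double> lmd{0.1, 3.0, 1.5};
  std::vector<BigVec> evec(3);
  pushSorted(lmd, evec, 2);        // lmd -> {3.0, 1.5, ...}
}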

View File

@ -29,6 +29,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_IRL_H
#define GRID_IRL_H
#include <string.h> //memset
#ifdef USE_LAPACK
#include <lapacke.h>
#endif
#include <algorithms/iterative/DenseMatrix.h>
#include <algorithms/iterative/EigenSort.h>
@ -49,6 +53,7 @@ public:
int Niter;
int converged;
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged evecs sought
int Np; // Np -- Number of spare vecs in Krylov space
int Nm; // Nm -- total number of vectors
@ -57,6 +62,8 @@ public:
SortEigen<Field> _sort;
// GridCartesian &_fgrid;
LinearOperatorBase<Field> &_Linop;
OperatorFunction<Field> &_poly;
@ -67,7 +74,27 @@ public:
void init(void){};
void Abort(int ff, DenseVector<RealD> &evals, DenseVector<DenseVector<RealD> > &evecs);
ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
ImplicitlyRestartedLanczos(
LinearOperatorBase<Field> &Linop, // op
OperatorFunction<Field> & poly, // polynomial
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // residual in eigenvalue deficit
int _Niter) : // Max iterations
_Linop(Linop),
_poly(poly),
Nstop(_Nstop),
Nk(_Nk),
Nm(_Nm),
eresid(_eresid),
Niter(_Niter)
{
Np = Nm-Nk; assert(Np>0);
};
ImplicitlyRestartedLanczos(
LinearOperatorBase<Field> &Linop, // op
OperatorFunction<Field> & poly, // polynomial
int _Nk, // sought vecs
int _Nm, // spare vecs
@ -75,6 +102,7 @@ public:
int _Niter) : // Max iterations
_Linop(Linop),
_poly(poly),
Nstop(_Nk),
Nk(_Nk),
Nm(_Nm),
eresid(_eresid),
@ -142,10 +170,11 @@ public:
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
// std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
const RealD tiny = 1.0e-20;
if ( beta < tiny ) {
std::cout << " beta is tiny "<<beta<<std::endl;
}
}
lmd[k] = alph;
lme[k] = beta;
@ -219,15 +248,122 @@ public:
}
}
#ifdef USE_LAPACK
void diagonalize_lapack(DenseVector<RealD>& lmd,
DenseVector<RealD>& lme,
int N1,
int N2,
DenseVector<RealD>& Qt,
GridBase *grid){
const int size = Nm;
// tevals.resize(size);
// tevecs.resize(size);
int NN = N1;
double evals_tmp[NN];
double evec_tmp[NN][NN];
memset(evec_tmp[0],0,sizeof(double)*NN*NN);
// double AA[NN][NN];
double DD[NN];
double EE[NN];
for (int i = 0; i< NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if ( j < NN && j >= 0 ) {
if (i==j) DD[i] = lmd[i];
if (i==j) evals_tmp[i] = lmd[i];
if (j==(i-1)) EE[j] = lme[j];
}
int evals_found;
int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
int liwork = 3+NN*10 ;
int iwork[liwork];
double work[lwork];
int isuppz[2*NN];
char jobz = 'V'; // calculate evals & evecs
char range = 'I'; // calculate evals in the index interval il..iu
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
int info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN/total)+1;
double vl = 0.0, vu = 0.0;
int il = interval*node+1 , iu = interval*(node+1);
if (iu > NN) iu=NN;
double tol = 0.0;
if (1) {
memset(evals_tmp,0,sizeof(double)*NN);
if ( il <= NN){
printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
LAPACK_dstegr(&jobz, &range, &NN,
(double*)DD, (double*)EE,
&vl, &vu, &il, &iu, // these four are ignored if the second parameter is 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double*)evec_tmp, &NN,
isuppz,
work, &lwork, iwork, &liwork,
&info);
for (int i = iu-1; i>= il-1; i--){
printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
evals_tmp[i] = evals_tmp[i - (il-1)];
if (il>1) evals_tmp[i-(il-1)]=0.;
for (int j = 0; j< NN; j++){
evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
if (il>1) evec_tmp[i-(il-1)][j]=0.;
}
}
}
{
// QMP_sum_double_array(evals_tmp,NN);
// QMP_sum_double_array((double *)evec_tmp,NN*NN);
grid->GlobalSumVector(evals_tmp,NN);
grid->GlobalSumVector((double*)evec_tmp,NN*NN);
}
}
// Cheating a bit. It would be better to sort instead of just reversing, but the routine's documentation says the evals are returned in increasing order, while qr gives evals in decreasing order.
for(int i=0;i<NN;i++){
for(int j=0;j<NN;j++)
Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
lmd [NN-1-i]=evals_tmp[i];
}
}
#endif
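A serial sketch of the work splitting used in diagonalize_lapack above (the loop over node stands in for the ranks, and the elementwise sum plays the role of grid->GlobalSumVector): each rank computes eigenvalue indices il..iu, writes them zero-padded at their global offsets, and the reduction reassembles the full spectrum.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int NN = 10, total = 3;
  std::vector<double> global(NN, 0.0);
  int interval = NN / total + 1;               // same chunking as the code above
  for (int node = 0; node < total; ++node) {
    int il = interval * node + 1;              // 1-based, LAPACK convention
    int iu = std::min(interval * (node + 1), NN);
    std::vector<double> local(NN, 0.0);
    for (int i = il; i <= iu; ++i) local[i - 1] = 1.0 * i;  // fake eigenvalue
    for (int i = 0; i < NN; ++i) global[i] += local[i];     // the GlobalSum step
  }
  for (double e : global) std::printf("%g ", e);
  std::printf("\n");
}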
void diagonalize(DenseVector<RealD>& lmd,
DenseVector<RealD>& lme,
int Nm2,
int Nm,
DenseVector<RealD>& Qt)
int N2,
int N1,
DenseVector<RealD>& Qt,
GridBase *grid)
{
int Niter = 100*Nm;
#ifdef USE_LAPACK
const int check_lapack=0; // just use lapack if 0, check against lapack if 1
if(!check_lapack)
return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
DenseVector <RealD> lmd2(N1);
DenseVector <RealD> lme2(N1);
DenseVector<RealD> Qt2(N1*N1);
for(int k=0; k<N1; ++k){
lmd2[k] = lmd[k];
lme2[k] = lme[k];
}
for(int k=0; k<N1*N1; ++k)
Qt2[k] = Qt[k];
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
int Niter = 100*N1;
int kmin = 1;
int kmax = Nk;
int kmax = N2;
// (this should be more sophisticated)
for(int iter=0; iter<Niter; ++iter){
@ -239,7 +375,7 @@ public:
// (Dsh: shift)
// transformation
qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
// Convergence criterion (redef of kmin and kmax)
for(int j=kmax-1; j>= kmin; --j){
@ -250,6 +386,23 @@ public:
}
}
Niter = iter;
#ifdef USE_LAPACK
if(check_lapack){
const double SMALL=1e-8;
diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
DenseVector <RealD> lmd3(N2);
for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
_sort.push(lmd3,N2);
_sort.push(lmd2,N2);
for(int k=0; k<N2; ++k){
if (fabs(lmd2[k] - lmd3[k]) >SMALL) std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
// if (fabs(lme2[k] - lme[k]) >SMALL) std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
}
for(int k=0; k<N1*N1; ++k){
// if (fabs(Qt2[k] - Qt[k]) >SMALL) std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
}
}
#endif
return;
continued:
@ -265,6 +418,7 @@ public:
abort();
}
#if 1
static RealD normalise(Field& v)
{
RealD nn = norm2(v);
@ -326,6 +480,7 @@ until convergence
{
GridBase *grid = evec[0]._grid;
assert(grid == src._grid);
std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
std::cout << " -- Nm = " << Nm << std::endl;
@ -356,11 +511,21 @@ until convergence
// (uniform vector) Why not src??
// evec[0] = 1.0;
evec[0] = src;
std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
// << src._grid << std::endl;
normalise(evec[0]);
std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
// << evec[0]._grid << std::endl;
// Initial Nk steps
for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
// std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
// std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
RitzMatrix(evec,Nk);
for(int k=0; k<Nk; ++k){
// std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
// std:: cout <<"lme " << k << " " << lme[k] << std::endl;
}
// Restarting loop begins
for(int iter = 0; iter<Niter; ++iter){
@ -382,20 +547,24 @@ until convergence
lme2[k] = lme[k+k1-1];
}
setUnit_Qt(Nm,Qt);
diagonalize(eval2,lme2,Nm,Nm,Qt);
diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
// sorting
_sort.push(eval2,Nm);
// Implicitly shifted QR transformations
setUnit_Qt(Nm,Qt);
for(int ip=k2; ip<Nm; ++ip)
for(int ip=k2; ip<Nm; ++ip){
std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
for(int j=k1-1; j<k2+1; ++j){
for(int k=0; k<Nm; ++k){
B[j].checkerboard = evec[k].checkerboard;
B[j] += Qt[k+Nm*j] * evec[k];
}
}
@ -418,21 +587,25 @@ until convergence
lme2[k] = lme[k];
}
setUnit_Qt(Nm,Qt);
diagonalize(eval2,lme2,Nk,Nm,Qt);
diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
for(int k = 0; k<Nk; ++k) B[k]=0.0;
for(int j = 0; j<Nk; ++j){
for(int k = 0; k<Nk; ++k){
B[j].checkerboard = evec[k].checkerboard;
B[j] += Qt[k+j*Nm] * evec[k];
}
// std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
}
// _sort.push(eval2,B,Nk);
Nconv = 0;
// std::cout << std::setiosflags(std::ios_base::scientific);
for(int i=0; i<Nk; ++i){
_poly(_Linop,B[i],v);
// _poly(_Linop,B[i],v);
_Linop.HermOp(B[i],v);
RealD vnum = real(innerProduct(B[i],v)); // HermOp.
RealD vden = norm2(B[i]);
@ -440,11 +613,13 @@ until convergence
v -= eval2[i]*B[i];
RealD vv = norm2(v);
std::cout.precision(13);
std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
if(vv<eresid*eresid){
// change the criterion: as the evals are sorted, all evals smaller (larger) than the Nstop-th should already have converged
if((vv<eresid*eresid) && (i == Nconv) ){
Iconv[Nconv] = i;
++Nconv;
}
@ -455,7 +630,7 @@ until convergence
std::cout<<" #modes converged: "<<Nconv<<std::endl;
if( Nconv>=Nk ){
if( Nconv>=Nstop ){
goto converged;
}
} // end of iter loop
@ -464,21 +639,20 @@ until convergence
abort();
converged:
// Sorting
eval.clear();
evec.clear();
for(int i=0; i<Nconv; ++i){
eval.push_back(eval2[Iconv[i]]);
evec.push_back(B[Iconv[i]]);
}
_sort.push(eval,evec,Nconv);
std::cout << "\n Converged\n Summary :\n";
std::cout << " -- Iterations = "<< Nconv << "\n";
std::cout << " -- beta(k) = "<< beta_k << "\n";
std::cout << " -- Nconv = "<< Nconv << "\n";
}
// Sorting
eval.resize(Nconv);
evec.resize(Nconv,grid);
for(int i=0; i<Nconv; ++i){
eval[i] = eval2[Iconv[i]];
evec[i] = B[Iconv[i]];
}
_sort.push(eval,evec,Nconv);
std::cout << "\n Converged\n Summary :\n";
std::cout << " -- Iterations = "<< Nconv << "\n";
std::cout << " -- beta(k) = "<< beta_k << "\n";
std::cout << " -- Nconv = "<< Nconv << "\n";
}
/////////////////////////////////////////////////
// Adapted from Rudy's lanczos factor routine
@ -1025,6 +1199,7 @@ static void Lock(DenseMatrix<T> &H, ///Hess mtx
}
}
#endif
};

View File

@ -47,6 +47,10 @@ namespace Grid {
int mmax;
int nstep;
int steps;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
@ -68,14 +72,24 @@ namespace Grid {
Field r(src._grid);
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(Linop,src,psi,rsq);
if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
@ -83,6 +97,11 @@ namespace Grid {
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
return;
}
@ -90,6 +109,7 @@ namespace Grid {
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
}
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD cp;
@ -116,24 +136,25 @@ namespace Grid {
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
r=src-Az;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl;
std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl;
MatTimer.Start();
Linop.HermOp(z,tmp);
MatTimer.Stop();
std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl;
ttmp=tmp;
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl;
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
@ -141,7 +162,9 @@ namespace Grid {
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
//p[0],q[0],qq[0]
p[0]= z;
@ -165,18 +188,22 @@ namespace Grid {
cp = axpy_norm(r,-a,q[peri_k],r);
std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop();
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid" <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az;
p[peri_kp]=z;

View File

@ -102,7 +102,9 @@ namespace Grid {
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
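In equations, and assuming CGNE on the odd checkerboard: eliminating the even sites from M\psi = \eta gives

\hat{M}\,\psi_o = \eta_o - M_{oe}\,M_{ee}^{-1}\,\eta_e,
\qquad
\psi_e = M_{ee}^{-1}\left(\eta_e - M_{eo}\,\psi_o\right),

so the Mdag in the comment above supplies the normal-equations source \hat{M}^\dagger(\eta_o - M_{oe}M_{ee}^{-1}\eta_e).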

View File

@ -115,27 +115,11 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
int nd= dims.size();
coor.resize(nd);
for(int d=0;d<nd;d++){
coor[d] = index % dims[d];
index = index / dims[d];
}
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
CoorFromIndex(coor,Oindex,_rdimensions);
}
static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
int nd=dims.size();
int stride=1;
index=0;
for(int d=0;d<nd;d++){
index = index+stride*coor[d];
stride=stride*dims[d];
}
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
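A round-trip sketch of the lexicographic mapping that these hunks centralise into Lexicographic:: (free-standing functions here, names ours):

#include <cassert>
#include <cstddef>
#include <vector>

void coorFromIndex(std::vector<int>& coor, int index, const std::vector<int>& dims) {
  coor.resize(dims.size());
  for (std::size_t d = 0; d < dims.size(); ++d) {
    coor[d] = index % dims[d];   // dimension 0 runs fastest
    index  /= dims[d];
  }
}

int indexFromCoor(const std::vector<int>& coor, const std::vector<int>& dims) {
  int index = 0, stride = 1;
  for (std::size_t d = 0; d < dims.size(); ++d) {
    index  += stride * coor[d];
    stride *= dims[d];
  }
  return index;
}

int main() {
  std::vector<int> dims{4, 4, 8}, coor;
  for (int i = 0; i < 4 * 4 * 8; ++i) {
    coorFromIndex(coor, i, dims);
    assert(indexFromCoor(coor, dims) == i);  // the maps are mutual inverses
  }
}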
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
@ -147,13 +131,32 @@ public:
}
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{
CoorFromIndex(coor,lane,_simd_layout);
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// FIXME:
//
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
@ -163,12 +166,12 @@ public:
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) { return _isites; };
inline int Nsimd(void) { return _isites; };// Synonymous with iSites
inline int oSites(void) { return _osites; };
inline int lSites(void) { return _isites*_osites; };
inline int gSites(void) { return _isites*_osites*_Nprocessors; };
inline int Nd (void) { return _ndimension;};
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
@ -179,7 +182,10 @@ public:
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
CoorFromIndex(gcoor,gidx,_gdimensions);
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;

View File

@ -170,9 +170,15 @@ public:
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
assert(_rdimensions[d]>0);
// all elements of a simd vector must have same checkerboard.
if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0);
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if ( _simd_layout[d]>1 ) {
if ( d != _checker_dim ) {
assert( (_rdimensions[d]&0x1) == 0 );
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];

View File

@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif
namespace Grid {
class CartesianCommunicator {
public:
@ -53,6 +56,8 @@ class CartesianCommunicator {
typedef int CommsRequest_t;
#endif
static void Init(int *argc, char ***argv);
// Constructor
CartesianCommunicator(const std::vector<int> &pdimensions_in);
@ -81,6 +86,7 @@ class CartesianCommunicator {
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c)
{
@ -115,12 +121,11 @@ class CartesianCommunicator {
int recv_from_rank,
int bytes);
void RecvFrom(void *recv,
int recv_from_rank,
int bytes);
void SendTo(void *xmit,
int xmit_to_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,

View File

@ -31,6 +31,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
}
int Rank(void) {
int pe;
MPI_Comm_rank(MPI_COMM_WORLD,&pe);
return pe;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
@ -59,6 +72,10 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
@ -108,21 +125,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::RecvFrom(void *recv,
int from,
int bytes)
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
assert(ierr==0);
}
void CartesianCommunicator::SendTo(void *xmit,
int dest,
int bytes)
{
int rank = _processor; // used for tag; must know who it comes from
int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
assert(ierr==0);
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive

View File

@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h"
namespace Grid {
void CartesianCommunicator::Init(int *argc, char *** arv)
{
}
int Rank(void) { return 0; }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
@ -47,17 +53,14 @@ void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSum(uint64_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::RecvFrom(void *recv,
int recv_from_rank,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendTo(void *xmit,
int xmit_to_rank,
int bytes)
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}

View File

@ -0,0 +1,334 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_shmem.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
#include <mpp/shmem.h>
namespace Grid {
// Should error check all SHMEM calls.
#define SHMEM_VET(addr)
#define SHMEM_VET_DEBUG(addr) { \
if ( ! shmem_addr_accessible(addr,_processor) ) {\
std::fprintf(stderr,"%d Inaccessible shmem address %p %s %s\n",_processor,(void *)(addr),__FUNCTION__,#addr); \
BACKTRACEFILE(); \
}\
}
int Rank(void) {
return shmem_my_pe();
}
typedef struct HandShake_t {
uint64_t seq_local;
uint64_t seq_remote;
} HandShake;
static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init();
XConnections.resize(shmem_n_pes());
RConnections.resize(shmem_n_pes());
for(int pe =0 ; pe<shmem_n_pes();pe++){
XConnections[pe].seq_local = 0;
XConnections[pe].seq_remote= 0;
RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0;
}
shmem_barrier_all();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
_processor = shmem_my_pe();
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size = shmem_n_pes();
assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
// int nreduce=1;
// int pestart=0;
// int logStride=0;
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_barrier_all(); // necessary?
u = dest;
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
static long long source ;
static long long dest ;
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
// int nreduce=1;
// int pestart=0;
// int logStride=0;
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_barrier_all(); // necessary?
u = dest;
}
void CartesianCommunicator::GlobalSum(float &f){
static float source ;
static float dest ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
source = f;
dest =0.0;
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
f = dest;
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
static float source ;
static float dest = 0 ;
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
if ( shmem_addr_accessible(f,_processor) ){
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
return;
}
for(int i=0;i<N;i++){
dest =0.0;
source = f[i];
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
f[i] = dest;
}
}
void CartesianCommunicator::GlobalSum(double &d)
{
static double source;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
source = d;
dest = 0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
d = dest;
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
static double source ;
static double dest ;
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
if ( shmem_addr_accessible(d,_processor) ){
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
return;
}
for(int i=0;i<N;i++){
source = d[i];
dest =0.0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
d[i] = dest;
}
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
std::vector<int> coor = _processor_coor;
assert(std::abs(shift) <_processors[dim]);
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,source,_processors);
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
Lexicographic::CoorFromIndex(coor,rank,_processors);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
SHMEM_VET(xmit);
SHMEM_VET(recv);
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
static uint64_t seq;
assert(recv!=xmit);
volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
if ( _processor == sender ) {
printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
// Check he has posted a receive
while(SendSeq->seq_remote == SendSeq->seq_local);
printf("Sender receive %d posted\n",sender,receiver);
// Advance our send count
seq = ++(SendSeq->seq_local);
// Send this packet
SHMEM_VET(recv);
shmem_putmem(recv,xmit,bytes,receiver);
shmem_fence();
printf("Sender sent payload %d\n",seq);
//Notify him we're done
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
shmem_fence();
printf("Sender ringing door bell %d\n",seq);
}
if ( _processor == receiver ) {
printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
// Post a receive
seq = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
printf("Receiver Opening letter box %d\n",seq);
// Now wait until he has advanced our reception counter
while(RecvSeq->seq_remote != RecvSeq->seq_local);
printf("Receiver Got the mail %d\n",seq);
}
}
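A two-thread sketch of the seq_local/seq_remote doorbell protocol above, with std::atomic counters standing in for shmem_putmem on symmetric memory:

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

struct HandShake { std::atomic<uint64_t> seq_local{0}, seq_remote{0}; };

int main() {
  HandShake Send, Recv;  // XConnections[receiver] / RConnections[sender]
  int mailbox = 0;       // stands in for the shmem payload buffer

  std::thread receiver([&] {
    uint64_t seq = ++Recv.seq_local;   // post a receive
    Send.seq_remote.store(seq);        // tell the sender the box is open
    while (Recv.seq_remote.load() != Recv.seq_local.load()) {}  // wait for mail
    std::printf("received payload %d\n", mailbox);
  });

  std::thread sender([&] {
    while (Send.seq_remote.load() == Send.seq_local.load()) {}  // wait for post
    uint64_t seq = ++Send.seq_local;
    mailbox = 42;                      // shmem_putmem of the payload
    Recv.seq_remote.store(seq);        // ring the door bell
  });

  sender.join();
  receiver.join();
}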
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
SHMEM_VET(xmit);
SHMEM_VET(recv);
// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
shmem_putmem(recv,xmit,bytes,dest);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
// shmem_quiet(); // I'm done
shmem_barrier_all();// He's done too
}
void CartesianCommunicator::Barrier(void)
{
shmem_barrier_all();
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
int words = bytes/4;
if ( shmem_addr_accessible(data,_processor) ){
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync);
return;
}
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
if ( shmem_my_pe() != root ) {
array[w] = word;
}
shmem_barrier_all();
}
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
static uint32_t word;
uint32_t *array = (uint32_t *) data;
assert( (bytes % 4)==0);
int words = bytes/4;
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
if ( shmem_my_pe() != root ) {
array[w]= word;
}
shmem_barrier_all();
}
}
}

View File

@ -35,7 +35,7 @@ class SimpleCompressor {
public:
void Point(int) {};
vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
vobj operator() (const vobj &arg) {
return arg;
}
};
@ -56,24 +56,24 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
int o = n*stride;
int bo = n*e2;
buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
buffer[off+bo++]=compress(rhs._odata[so+o+b]);
}
}
}
@ -97,16 +97,16 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension];
int n2=rhs._grid->_slice_block[dimension];
if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
int o = n*n1;
int offset = b+n*n2;
cobj temp =compress(rhs._odata[so+o+b]);
extract<cobj>(temp,pointers,offset);
}
@ -121,7 +121,7 @@ PARALLEL_NESTED_LOOP2
int offset = b+n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
cobj temp =compress(rhs._odata[so+o+b]);
extract<cobj>(temp,pointers,offset);
}
}
@ -243,13 +243,13 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension];
if(cbmask == 0x3 ){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int o =n*stride+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
@ -259,7 +259,7 @@ PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
@ -285,11 +285,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension];
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
@ -323,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int gd = grid->_gdimensions[dimension];
int ly = grid->_simd_layout[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
@ -331,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
// the permute type
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
int permute_type_dist;
for(int x=0;x<rd;x++){
@ -342,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
// FIXME : This must change where we have a
// Rotate slice.
// Document how this works ; why didn't I do this when I first wrote it...
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
else permute_slice = (wrap+1)%ly;
if ( (ly>2) && (permute_slice) ) {
assert(permute_type & RotateBit);
permute_type_dist = permute_type|permute_slice;
} else {
permute_type_dist = permute_type;
}
}
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
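A worked example (values illustrative) of the wrap/num bookkeeping documented in the comment above:

#include <cstdio>

int main() {
  const int rd = 4, ly = 2;  // reduced slices per node, SIMD lanes in this dim
  for (int sshift = 0; sshift < 2 * rd; ++sshift) {
    int wrap = sshift / rd;  // did the shift cross the local boundary?
    int num  = sshift % rd;
    for (int x = 0; x < rd; ++x) {
      int permute_slice = (x < rd - num) ? wrap : (wrap + 1) % ly;
      std::printf("sshift=%d x=%d permute_slice=%d\n", sshift, x, permute_slice);
    }
  }
}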

View File

@ -191,8 +191,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //

View File

@ -55,7 +55,13 @@ extern int GridCshiftPermuteMap[4][16];
// Basic expressions used in Expression Template
////////////////////////////////////////////////
class LatticeBase {};
class LatticeBase
{
public:
virtual ~LatticeBase(void) = default;
GridBase *_grid;
};
class LatticeExpressionBase {};
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Aligned allocator??
@ -88,8 +94,6 @@ template<class vobj>
class Lattice : public LatticeBase
{
public:
GridBase *_grid;
int checkerboard;
Vector<vobj> _odata;
@ -177,8 +181,8 @@ PARALLEL_FOR_LOOP
}
//GridFromExpression is tricky to do
template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr): _grid(nullptr){
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
@ -199,7 +203,8 @@ PARALLEL_FOR_LOOP
}
};
template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr): _grid(nullptr){
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
@ -220,7 +225,8 @@ PARALLEL_FOR_LOOP
}
};
template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr): _grid(nullptr){
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
@ -240,7 +246,8 @@ PARALLEL_FOR_LOOP
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
Lattice(GridBase *grid) : _odata(grid->oSites()) {
_grid = grid;
// _odata.reserve(_grid->oSites());
// _odata.resize(_grid->oSites());
// std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
@ -248,6 +255,8 @@ PARALLEL_FOR_LOOP
checkerboard=0;
}
virtual ~Lattice(void) = default;
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){

View File

@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid;

View File

@ -152,7 +152,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
assert(grid!=NULL);
// FIXME
std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
// std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
@ -178,7 +178,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir
for(int ss=0;ss<grid->oSites();ss++){
GridBase::CoorFromIndex(coor,ss,grid->_rdimensions);
Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
int r = coor[orthogdim];
lvSum[r]=lvSum[r]+Data._odata[ss];
}

View File

@ -75,7 +75,7 @@ namespace Grid {
std::seed_seq src;
fixedSeed(std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
result_type operator () (void){
@ -119,9 +119,10 @@ namespace Grid {
typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size;
#endif
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD>> _uniform;
std::vector<std::normal_distribution<RealD>> _gaussian;
std::vector<std::discrete_distribution<int32_t>> _bernoulli;
void GetState(std::vector<RngStateType> & saved,int gen) {
saved.resize(RngStateCount);
@ -161,6 +162,7 @@ namespace Grid {
_generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_seeded=0;
}
@ -242,7 +244,7 @@ namespace Grid {
std::random_device rd;
Seed(rd);
}
void SeedFixedIntegers(std::vector<int> &seeds){
void SeedFixedIntegers(const std::vector<int> &seeds){
fixedSeed src(seeds);
Seed(src);
}
@ -266,6 +268,7 @@ namespace Grid {
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_seeded=0;
}
@ -354,7 +357,7 @@ PARALLEL_FOR_LOOP
std::random_device rd;
Seed(rd);
}
void SeedFixedIntegers(std::vector<int> &seeds){
void SeedFixedIntegers(const std::vector<int> &seeds){
fixedSeed src(seeds);
Seed(src);
}
@ -368,14 +371,22 @@ PARALLEL_FOR_LOOP
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l){
rng.fill(l,rng._gaussian);
}
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
rng.fill(l,rng._bernoulli);
}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
rng.fill(l,rng._uniform);
}
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
rng.fill(l,rng._gaussian);
}
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
rng.fill(l,rng._bernoulli);
}
}
#endif
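A sketch of the distribution behind the new bernoulli() fill: with weights {1,1}, std::discrete_distribution returns 0 or 1 with equal probability, i.e. a fair coin flip per site.

#include <cstdint>
#include <iostream>
#include <random>

int main() {
  std::mt19937 eng(5);
  std::discrete_distribution<int32_t> bern{1, 1};  // P(0) = P(1) = 1/2
  const int N = 100000;
  int ones = 0;
  for (int i = 0; i < N; ++i) ones += bern(eng);
  std::cout << "fraction of ones: " << double(ones) / N << std::endl;  // ~0.5
}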

View File

@ -44,7 +44,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard
////////////////////////////////////////////////////////////////////////////////////////////
@ -115,9 +115,9 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
int sc;
std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension);
GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) {
@ -160,9 +160,9 @@ PARALLEL_FOR_LOOP
std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension);
GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y
fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
@ -225,9 +225,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension);
GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
@ -311,9 +311,9 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension);
GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) {
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
@ -325,6 +325,126 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>
void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
{
typedef typename vobj::scalar_object sobj;
typedef typename vvobj::scalar_object ssobj;
sobj s;
ssobj ss;
GridBase *ig = in._grid;
GridBase *og = out._grid;
int ni = ig->_ndimension;
int no = og->_ndimension;
assert(ni == no);
for(int d=0;d<no;d++){
assert(ig->_processors[d] == og->_processors[d]);
assert(ig->_ldimensions[d] == og->_ldimensions[d]);
}
PARALLEL_FOR_LOOP
for(int idx=0;idx<ig->lSites();idx++){
std::vector<int> lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor);
ss=s;
pokeLocalSite(ss,out,lcoor);
}
}
template<class vobj>
void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl+1 == nh);
assert(orthog<nh);
assert(orthog>=0);
assert(hg->_processors[orthog]==1);
int dl; dl = 0;
for(int d=0;d<nh;d++){
if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++;
}
}
// the above should guarantee that the operations are local
PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
dl=0;
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[dl++];
}
}
peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor);
}
}
template<class vobj>
void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl+1 == nh);
assert(orthog<nh);
assert(orthog>=0);
assert(hg->_processors[orthog]==1);
int dl; dl = 0;
for(int d=0;d<nh;d++){
if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++;
}
}
// the above should guarantee that the operations are local
PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
dl=0;
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[dl++];
}
}
peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor);
}
}
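A coordinate-bookkeeping sketch for InsertSlice/ExtractSlice (plain vectors; the helper name is ours): the lower-dimensional coordinate is embedded into the higher-dimensional grid by pinning the orthogonal direction to the chosen slice.

#include <cassert>
#include <vector>

std::vector<int> embed(const std::vector<int>& lcoor, int slice, int orthog) {
  int nh = static_cast<int>(lcoor.size()) + 1;
  assert(orthog >= 0 && orthog < nh);
  std::vector<int> hcoor(nh);
  int dl = 0;
  for (int d = 0; d < nh; ++d)
    hcoor[d] = (d == orthog) ? slice : lcoor[dl++];  // pin orthog to the slice
  return hcoor;
}

int main() {
  std::vector<int> l{1, 2, 3};          // local coordinate on the 3d grid
  std::vector<int> h = embed(l, 7, 0);  // pin dim 0 to slice 7 -> {7,1,2,3}
  assert(h[0] == 7 && h[1] == 1 && h[3] == 3);
}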
template<class vobj>
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)

View File

@ -146,7 +146,7 @@ class BinaryIO {
csum = 0;
std::vector<int> lcoor;
for(int l=0;l<grid->lSites();l++){
grid->CoorFromIndex(lcoor,l,grid->_ldimensions);
Lexicographic::CoorFromIndex(lcoor,l,grid->_ldimensions);
peekLocalSite(siteObj,lat,lcoor);
munge(siteObj,fileObj,csum);
}
@ -168,6 +168,7 @@ class BinaryIO {
GridBase *grid = Umu._grid;
std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
GridStopWatch timer; timer.Start();
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
@ -182,6 +183,7 @@ class BinaryIO {
Umu = zero;
uint32_t csum=0;
uint64_t bytes=0;
fobj file_object;
sobj munged;
@ -194,7 +196,7 @@ class BinaryIO {
if ( grid->IsBoss() ) {
fin.read((char *)&file_object,sizeof(file_object));
bytes += sizeof(file_object);
if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
@ -205,6 +207,10 @@ class BinaryIO {
// The boss who read the file has their value poked
pokeSite(munged,Umu,site);
}}}}
timer.Stop();
std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
@ -224,13 +230,14 @@ class BinaryIO {
// Serialise through node zero
//////////////////////////////////////////////////
std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
GridStopWatch timer; timer.Start();
std::ofstream fout;
if ( grid->IsBoss() ) {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
fout.seekp(offset);
}
uint64_t bytes=0;
uint32_t csum=0;
fobj file_object;
sobj unmunged;
@ -252,10 +259,15 @@ class BinaryIO {
if(ieee32) htole32_v((void *)&file_object,sizeof(file_object));
if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));
// NB could gather an xstrip as an optimisation.
fout.write((char *)&file_object,sizeof(file_object));
bytes+=sizeof(file_object);
}
}}}}
timer.Stop();
std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
@ -265,6 +277,7 @@ class BinaryIO {
typedef typename GridSerialRNG::RngStateType RngStateType;
const int RngStateCount = GridSerialRNG::RngStateCount;
GridBase *grid = parallel._grid;
int gsites = grid->_gsites;
@ -310,7 +323,7 @@ class BinaryIO {
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
fout.write((char *)&saved[0],bytes);
}
grid->Broadcast(0,(void *)&csum,sizeof(csum));
return csum;
}
static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
@ -360,6 +373,8 @@ class BinaryIO {
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
}
grid->Broadcast(0,(void *)&csum,sizeof(csum));
return csum;
}
@ -398,7 +413,7 @@ class BinaryIO {
int IOnode = 1;
for(int d=0;d<grid->_ndimension;d++) {
if ( d==0 ) parallel[d] = 0;
if ( d == 0 ) parallel[d] = 0;
if (parallel[d]) {
range[d] = grid->_ldimensions[d];
start[d] = grid->_processor_coor[d]*range[d];
@ -426,6 +441,9 @@ class BinaryIO {
std::cout << std::endl;
}
GridStopWatch timer; timer.Start();
uint64_t bytes=0;
int myrank = grid->ThisRank();
int iorank = grid->RankFromProcessorCoor(ioproc);
@ -439,9 +457,9 @@ class BinaryIO {
// available (how short sighted is that?)
//////////////////////////////////////////////////////////
Umu = zero;
uint32_t csum=0;
static uint32_t csum=0;
fobj fileObj;
sobj siteObj;
static sobj siteObj; // Static to place in symmetric region for SHMEM
// need to implement these loops in Nd independent way with a lexico conversion
for(int tlex=0;tlex<slice_vol;tlex++){
@ -451,7 +469,7 @@ class BinaryIO {
std::vector<int> lsite(nd);
std::vector<int> iosite(nd);
grid->CoorFromIndex(tsite,tlex,range);
Lexicographic::CoorFromIndex(tsite,tlex,range);
for(int d=0;d<nd;d++){
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
@ -461,7 +479,7 @@ class BinaryIO {
/////////////////////////
// Get the rank of owner of data
/////////////////////////
int rank, o_idx,i_idx, g_idx;
int rank, o_idx,i_idx, g_idx;
grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gsite);
grid->GlobalCoorToGlobalIndex(gsite,g_idx);
@ -472,6 +490,7 @@ class BinaryIO {
fin.seekg(offset+g_idx*sizeof(fileObj));
fin.read((char *)&fileObj,sizeof(fileObj));
bytes+=sizeof(fileObj);
if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
@ -479,23 +498,29 @@ class BinaryIO {
if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj));
munge(fileObj,siteObj,csum);
if ( rank != myrank ) {
grid->SendTo((void *)&siteObj,rank,sizeof(siteObj));
} else {
pokeLocalSite(siteObj,Umu,lsite);
}
// Possibly do transport through pt2pt
if ( rank != iorank ) {
if ( (myrank == rank) || (myrank==iorank) ) {
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
}
} else {
if ( myrank == rank ) {
grid->RecvFrom((void *)&siteObj,iorank,sizeof(siteObj));
}
// Poke at destination
if ( myrank == rank ) {
pokeLocalSite(siteObj,Umu,lsite);
}
}
grid->Barrier(); // necessary?
}
grid->GlobalSum(csum);
grid->GlobalSum(bytes);
grid->Barrier();
timer.Stop();
std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
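// [Editor's sketch] SendRecvPacket above pairs the data-owning rank with the
// I/O rank on each iteration. The same pattern in raw MPI (illustrative only,
// not Grid's communicator wrapper):
#include <mpi.h>
static void SendRecvPacketSketch(void *xmit, void *recv, int dest, int from, int bytes) {
  MPI_Status stat;
  MPI_Sendrecv(xmit, bytes, MPI_BYTE, dest, 0,    // send my site to 'dest'
               recv, bytes, MPI_BYTE, from, 0,    // receive the site from 'from'
               MPI_COMM_WORLD, &stat);
}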
@ -530,7 +555,7 @@ class BinaryIO {
for(int d=0;d<grid->_ndimension;d++) {
if ( d==0 ) parallel[d] = 0;
if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
if (parallel[d]) {
range[d] = grid->_ldimensions[d];
@ -559,6 +584,9 @@ class BinaryIO {
std::cout << std::endl;
}
GridStopWatch timer; timer.Start();
uint64_t bytes=0;
int myrank = grid->ThisRank();
int iorank = grid->RankFromProcessorCoor(ioproc);
@ -577,10 +605,10 @@ class BinaryIO {
uint32_t csum=0;
fobj fileObj;
sobj siteObj;
static sobj siteObj; // static for SHMEM target; otherwise dynamic allocate with AlignedAllocator
// need to implement these loops in an Nd-independent way with a lexico conversion
// should aggregate a whole chunk and then write.
for(int tlex=0;tlex<slice_vol;tlex++){
std::vector<int> tsite(nd); // temporary mixed up site
@ -588,7 +616,7 @@ class BinaryIO {
std::vector<int> lsite(nd);
std::vector<int> iosite(nd);
grid->CoorFromIndex(tsite,tlex,range);
Lexicographic::CoorFromIndex(tsite,tlex,range);
for(int d=0;d<nd;d++){
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
@ -606,13 +634,21 @@ class BinaryIO {
////////////////////////////////
// iorank writes from the seek
////////////////////////////////
if (myrank == iorank) {
// Owner of data peeks it
peekLocalSite(siteObj,Umu,lsite);
if ( rank != myrank ) {
grid->RecvFrom((void *)&siteObj,rank,sizeof(siteObj));
} else {
peekLocalSite(siteObj,Umu,lsite);
// Pair of nodes may need to do pt2pt send
if ( rank != iorank ) { // comms is necessary
if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
// Send to IOrank
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
}
}
grid->Barrier(); // necessary?
if (myrank == iorank) {
munge(siteObj,fileObj,csum);
@ -623,17 +659,16 @@ class BinaryIO {
fout.seekp(offset+g_idx*sizeof(fileObj));
fout.write((char *)&fileObj,sizeof(fileObj));
} else {
if ( myrank == rank ) {
peekLocalSite(siteObj,Umu,lsite);
grid->SendTo((void *)&siteObj,iorank,sizeof(siteObj));
}
bytes+=sizeof(fileObj);
}
grid->Barrier(); // necessary? or rate-throttle, e.g. every 16 packets?
}
grid->GlobalSum(csum);
grid->GlobalSum(bytes);
timer.Stop();
std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
return csum;
}
@ -213,37 +213,38 @@ class NerscIO : public BinaryIO {
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
#define dump_nersc_header(field, s)\
s << "BEGIN_HEADER" << std::endl;\
s << "HDR_VERSION = " << field.hdr_version << std::endl;\
s << "DATATYPE = " << field.data_type << std::endl;\
s << "STORAGE_FORMAT = " << field.storage_format << std::endl;\
for(int i=0;i<4;i++){\
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;\
}\
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;\
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl;\
for(int i=0;i<4;i++){\
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;\
}\
\
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;\
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;\
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;\
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;\
s << "CREATOR = " << field.creator << std::endl;\
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;\
s << "CREATION_DATE = " << field.creation_date << std::endl;\
s << "ARCHIVE_DATE = " << field.archive_date << std::endl;\
s << "FLOATING_POINT = " << field.floating_point << std::endl;\
s << "END_HEADER" << std::endl;
static inline unsigned int writeHeader(NerscField &field,std::string file)
{
std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg);
fout << "BEGIN_HEADER" << std::endl;
fout << "HDR_VERSION = " << field.hdr_version << std::endl;
fout << "DATATYPE = " << field.data_type << std::endl;
fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;
for(int i=0;i<4;i++){
fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
}
// written with placeholder values for now, just to reserve the space; rewritten later
fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
fout << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl;
for(int i=0;i<4;i++){
fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
}
fout << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;
fout << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;
fout << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;
fout << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;
fout << "CREATOR = " << field.creator << std::endl;
fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
fout << "CREATION_DATE = " << field.creation_date << std::endl;
fout << "ARCHIVE_DATE = " << field.archive_date << std::endl;
fout << "FLOATING_POINT = " << field.floating_point << std::endl;
fout << "END_HEADER" << std::endl;
dump_nersc_header(field, fout);
field.data_start = fout.tellp();
return field.data_start;
}
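// [Editor's note] The "just to reserve the space" comment in the removed body
// suggests the header is first written with provisional values and later
// rewritten in place once checksum and plaquette are known; that only works if
// the dump_nersc_header fields occupy the same bytes both times. A minimal
// sketch of the reserve-then-overwrite idiom (illustrative names):
#include <cstdint>
#include <fstream>
#include <iomanip>
static std::streampos writeFixedHeaderSketch(std::ofstream &fout, uint32_t csum) {
  fout.seekp(0, std::ios::beg);  // rewind and overwrite the reserved region in place
  fout << "CHECKSUM = " << std::hex << std::setw(10) << csum << std::dec << std::endl;
  return fout.tellp();           // binary data begins at this offset
}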
@ -345,17 +346,17 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
// csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format);
}
if ( ieee64 || ieee64big ) {
// csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
//csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
// csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
//csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format);
}
@ -372,6 +373,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
@ -419,6 +421,7 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
std::string file1 = file+"para";
int offset1 = writeHeader(header,file1);
int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
//int csum1=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
@ -429,11 +432,12 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
} else {
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3X3");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
NerscSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
offset = writeHeader(header,file);
csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
// csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
@ -507,6 +511,8 @@ static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel
// munger is a function of <floating point, Real, data_type>
uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);
std::cerr<<" Csum "<< csum << " "<< header.checksum <<std::endl;
assert(csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
@ -90,7 +90,7 @@ namespace QCD {
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
// Spin matrix
@ -383,7 +383,6 @@ namespace QCD {
//////////////////////////////////////////////
// Poke scalars
//////////////////////////////////////////////
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<SpinIndex>(lhs,rhs,i);
@ -407,7 +406,41 @@ namespace QCD {
pokeIndex<LorentzIndex>(lhs,rhs,i);
}
//////////////////////////////////////////////
// Fermion <-> propagator assignments
//////////////////////////////////////////////
template <class Prop, class Ferm>
void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Nc; ++i)
{
pokeColour(pjs, peekColour(fj, i), i, c);
}
pokeSpin(p, pjs, j, s);
}
}
template <class Prop, class Ferm>
void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Nc; ++i)
{
pokeColour(fj, peekColour(pjs, i, c), i);
}
pokeSpin(f, fj, j);
}
}
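// [Editor's sketch] FermToProp/PropToFerm are typically driven by a loop over
// the Ns*Nc source spin-colour components. A hedged usage sketch; the Solver
// functor and the workspace fields are placeholders, not Grid API:
template <class Prop, class Ferm, class Solver>
void FillPropagatorSketch(Prop &prop, const Prop &source, Ferm &src, Ferm &sol, Solver &solve)
{
  for(int s = 0; s < Ns; ++s)
  for(int c = 0; c < Nc; ++c)
  {
    PropToFerm(src, source, s, c);  // extract one spin-colour source column
    solve(sol, src);                // hypothetical solve of D sol = src
    FermToProp(prop, sol, s, c);    // deposit the solution into the propagator
  }
}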
//////////////////////////////////////////////
// transpose array and scalar
//////////////////////////////////////////////
@ -109,10 +109,12 @@ typedef SymanzikGaugeAction<ConjugateGimplD> ConjugateSymanzikGaugeAction
#define FermOpTemplateInstantiate(A) \
template class A<WilsonImplF>; \
template class A<WilsonImplD>; \
template class A<GparityWilsonImplF>; \
template class A<GparityWilsonImplD>;
#define GparityFermOpTemplateInstantiate(A)
////////////////////////////////////////////
// Fermion operators / actions
////////////////////////////////////////////
@ -208,6 +210,14 @@ typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
}}
///////////////////////////////////////////////////////////////////////////////
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
@ -527,6 +527,7 @@ namespace QCD {
}
FermOpTemplateInstantiate(CayleyFermion5D);
GparityFermOpTemplateInstantiate(CayleyFermion5D);
}}
@ -130,7 +130,7 @@ namespace Grid {
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
ImplParams Params;
@ -142,6 +142,10 @@ namespace Grid {
mult(&phi(),&U(mu),&chi());
}
template<class ref>
inline void loadLinkElement(Simd & reg,ref &memory){
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds._grid,GaugeGrid);
@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP
};
///////
// Single flavour four spinors with colour index, 5d redblack
///////
template<class S,int Nrepresentation=Nc>
class DomainWallRedBlack5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
public:
typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template<typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template<typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
template<typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
template<typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
typedef iImplSpinor <Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef Lattice<SiteSpinor> FermionField;
// Make the doubled gauge field a *scalar*
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
typedef iImplGaugeLink <typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
ImplParams Params;
DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {};
bool overlapCommsCompute(void) { return false; };
template<class ref>
inline void loadLinkElement(Simd & reg,ref &memory){
vsplat(reg,memory);
}
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
{
SiteGaugeLink UU;
for(int i=0;i<Nrepresentation;i++){
for(int j=0;j<Nrepresentation;j++){
vsplat(UU()()(i,j),U(mu)()(i,j));
}
}
mult(&phi(),&UU(),&chi());
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
SiteScalarGaugeField ScalarUmu;
SiteDoubledGaugeField ScalarUds;
GaugeLinkField U (Umu._grid);
GaugeField Uadj(Umu._grid);
for(int mu=0;mu<Nd;mu++){
U = PeekIndex<LorentzIndex>(Umu,mu);
U = adj(Cshift(U,mu,-1));
PokeIndex<LorentzIndex>(Uadj,U,mu);
}
for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
peekLocalSite(ScalarUmu,Umu,lcoor);
for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu,Uadj,lcoor);
for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds,Uds,lcoor);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
assert(0);
}
};
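// [Editor's note] In this Impl the doubled gauge field is stored with *scalar*
// SIMD type because the SIMD lanes run along the fifth dimension, and every
// lane at a 4d site shares the same link; loadLinkElement therefore vsplats one
// scalar across all lanes. A sketch of the broadcast with plain arrays
// (illustrative, not Grid's vsplat):
#include <array>
#include <cstddef>
template <class S, std::size_t N>
static inline void VsplatSketch(std::array<S, N> &lanes, S value) {
  for (std::size_t l = 0; l < N; l++) lanes[l] = value; // every lane sees the same element
}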
////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*?
////////////////////////////////////////////////////////////////////////////////////////
@ -205,7 +303,7 @@ PARALLEL_FOR_LOOP
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
typedef GparityWilsonImplParams ImplParams;
@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
conformable(Uds._grid,GaugeGrid);
conformable(Umu._grid,GaugeGrid);
GaugeLinkField Utmp(GaugeGrid);
GaugeLinkField U(GaugeGrid);
GaugeLinkField Utmp (GaugeGrid);
GaugeLinkField U (GaugeGrid);
GaugeLinkField Uconj(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double
typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
@ -48,14 +48,16 @@ namespace Grid {
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,
RealD scale) :
// RealD scale):
RealD scale,const ImplParams &p= ImplParams()) :
// b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
MobiusFermion<Impl>(_Umu,
FiveDimGrid,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
// FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
{
}
@ -48,12 +48,7 @@ namespace QCD {
mu=p;
};
virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
return spinproject(in);
}
SiteHalfSpinor spinproject(const SiteSpinor &in)
{
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
int mudag=mu;
if (!dag) {
@ -92,6 +87,173 @@ namespace QCD {
}
};
/////////////////////////
// optimised versions
/////////////////////////
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonXpCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjXp(ret,in);
return ret;
}
};
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonYpCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjYp(ret,in);
return ret;
}
};
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonZpCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjZp(ret,in);
return ret;
}
};
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonTpCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjTp(ret,in);
return ret;
}
};
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonXmCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjXm(ret,in);
return ret;
}
};
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonYmCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjYm(ret,in);
return ret;
}
};
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonZmCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjZm(ret,in);
return ret;
}
};
template<class SiteHalfSpinor,class SiteSpinor>
class WilsonTmCompressor {
public:
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
SiteHalfSpinor ret;
spProjTm(ret,in);
return ret;
}
};
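// [Editor's note] One tiny functor per direction lets the spin projection
// inline into the gather loop instead of branching on the direction per site.
// A sketch of why this helps (illustrative types):
template <class Out, class In, class Proj>
static inline void GatherDirSketch(Out *out, const In *in, int n, Proj proj) {
  for (int i = 0; i < n; i++) out[i] = proj(in[i]); // proj resolved at compile time, so it inlines
}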
// Fast comms buffer manipulation which should inline right through (avoids
// direction-dependent logic that would prevent inlining)
template<class vobj,class cobj>
class WilsonStencil : public CartesianStencil<vobj,cobj> {
public:
WilsonStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances) : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
{ };
template < class compressor>
std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
this->Mergers.resize(0);
this->Packets.resize(0);
this->HaloGatherOpt(source,compress);
return std::thread([&] { this->Communicate(); });
}
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{
auto thr = this->HaloExchangeOptBegin(source,compress);
this->HaloExchangeOptComplete(thr);
}
void HaloExchangeOptComplete(std::thread &thr)
{
this->CommsMerge(); // spins
this->jointime-=usecond();
thr.join();
this->jointime+=usecond();
}
template < class compressor>
void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
{
// conformable(source._grid,_grid);
assert(source._grid==this->_grid);
this->halogtime-=usecond();
assert (this->comm_buf.size() == this->_unified_buffer_size );
this->u_comm_offset=0;
int dag = compress.dag;
static std::vector<int> dirs(Nd*2);
for(int mu=0;mu<Nd;mu++){
if ( dag ) {
dirs[mu] =mu;
dirs[mu+4]=mu+Nd;
} else {
dirs[mu] =mu+Nd;
dirs[mu+Nd]=mu;
}
}
WilsonXpCompressor<cobj,vobj> XpCompress;
this->HaloGatherDir(source,XpCompress,dirs[0]);
WilsonYpCompressor<cobj,vobj> YpCompress;
this->HaloGatherDir(source,YpCompress,dirs[1]);
WilsonZpCompressor<cobj,vobj> ZpCompress;
this->HaloGatherDir(source,ZpCompress,dirs[2]);
WilsonTpCompressor<cobj,vobj> TpCompress;
this->HaloGatherDir(source,TpCompress,dirs[3]);
WilsonXmCompressor<cobj,vobj> XmCompress;
this->HaloGatherDir(source,XmCompress,dirs[4]);
WilsonYmCompressor<cobj,vobj> YmCompress;
this->HaloGatherDir(source,YmCompress,dirs[5]);
WilsonZmCompressor<cobj,vobj> ZmCompress;
this->HaloGatherDir(source,ZmCompress,dirs[6]);
WilsonTmCompressor<cobj,vobj> TmCompress;
this->HaloGatherDir(source,TmCompress,dirs[7]);
assert(this->u_comm_offset==this->_unified_buffer_size);
this->halogtime+=usecond();
}
};
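// [Editor's sketch] The Begin/Complete pair above overlaps communication with
// compute by running Communicate() on a helper std::thread. The bare idiom,
// with placeholder functors:
#include <thread>
template <class F, class G>
static void OverlapSketch(F communicate, G computeInterior) {
  std::thread thr(communicate);  // comms proceed on a helper thread
  computeInterior();             // overlap local work with the transfer
  thr.join();                    // join before touching halo data
}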
}} // namespace close
#endif
@ -64,7 +64,9 @@ namespace QCD {
template<class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
GaugeField HUmu(_Umu._grid);
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);
}
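// [Editor's note] Scaling the links by -0.5 here folds the Wilson hopping-term
// prefactor (-1/2) into the doubled gauge field once at import time; this is
// why the kernels in this commit now stream 'result' directly rather than
// 'result*(-0.5)'.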
@ -286,121 +288,27 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
if ( Impl::overlapCommsCompute () ) {
DhopInternalCommsOverlapCompute(st,U,in,out,dag);
} else {
DhopInternalCommsThenCompute(st,U,in,out,dag);
}
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
}
}
};
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
auto handle = st.HaloExchangeBegin(in,compressor);
bool local = true;
bool nonlocal = false;
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}
st.HaloExchangeComplete(handle);
local = false;
nonlocal = true;
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}
};
FermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion);
}}
@ -114,12 +114,6 @@ namespace Grid {
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
// Constructor
WilsonFermion(GaugeField &_Umu,
GridCartesian &Fgrid,
@ -1,4 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -38,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;
// 5d lattice for DWF.
template<class Impl>
@ -67,10 +65,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._checker_dim==1);
// Dimension zero of the five-d is the Ls direction
@ -99,16 +95,74 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
// Allocate the required comms buffer
ImportGauge(_Umu);
alltime=0;
commtime=0;
jointime=0;
dslashtime=0;
dslash1time=0;
}
template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
RealD _M5,const ImplParams &p) :
Kernels(p),
_FiveDimGrid (&FiveDimGrid),
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
_FourDimGrid (&FourDimGrid),
Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
M5(_M5),
Umu(_FourDimGrid),
UmuEven(_FourDimGrid),
UmuOdd (_FourDimGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimGrid)
{
int nsimd = Simd::Nsimd();
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
assert(FourDimGrid._ndimension==4);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimGrid._simd_layout[d]==1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
{
GaugeField HUmu(_Umu._grid);
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
UmuEven=Umu;// Really want a reference.
UmuOdd =Umu;
}
}
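// [Editor's note] This constructor targets the variant that runs SIMD lanes
// along the fifth (s) direction -- hence the simd_layout[0]==nsimd and scalar
// 4d-layout asserts above -- so a single gauge link must be broadcast (vsplat)
// to all lanes, matching DomainWallRedBlack5dImpl.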
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
GaugeField HUmu(_Umu._grid);
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);
}
@ -232,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
}
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
@ -277,280 +307,32 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
if ( Impl::overlapCommsCompute () ) {
DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
} else {
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
}
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int threads = GridThread::GetThreads();
int HT = GridThread::GetHyperThreads();
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
int LLs = in._grid->_rdimensions[0];
commtime -=usecond();
auto handle = st.HaloExchangeBegin(in,compressor);
st.HaloExchangeComplete(handle);
commtime +=usecond();
jointime -=usecond();
jointime +=usecond();
st.HaloExchange(in,compressor);
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Note loop ordering and data layout.
// Designed to create
// - per-thread reuse in L1 cache for U
// - 8 linear-access, unit-stride streams per thread for the Fermion, which are hw prefetchable.
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
#pragma omp parallel for schedule(static)
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
int sF=LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
}
} else {
if( this->AsmOptDslash ) {
// for(int i=0;i<1;i++){
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
// PerformanceCounter Counter(i);
// Counter.Start();
#pragma omp parallel for
for(int t=0;t<threads;t++){
int hyperthread = t%HT;
int core = t/HT;
int sswork, swork,soff,ssoff, sU,sF;
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
for(int ss=0;ss<sswork;ss++){
for(int s=soff;s<soff+swork;s++){
sU=ss+ ssoff;
if ( LebesgueOrder::UseLebesgueOrder ) {
sU = lo.Reorder(sU);
}
sF = s+Ls*sU;
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
}
}
}
// Counter.Stop();
// Counter.Report();
// }
} else if( this->HandOptDslash ) {
/*
#pragma omp parallel for schedule(static)
for(int t=0;t<threads;t++){
int hyperthread = t%HT;
int core = t/HT;
int sswork, swork,soff,ssoff, sU,sF;
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
for(int ss=0;ss<sswork;ss++){
sU=ss+ ssoff;
for(int s=soff;s<soff+swork;s++){
sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
*/
#pragma omp parallel for schedule(static)
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
int sF=LLs*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
}
}
dslashtime +=usecond();
alltime+=usecond();
}
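// [Editor's sketch] GridThread::GetWork above splits 'nwork' items across
// workers. An even block partition with the remainder spread over the first
// workers would look like this (not necessarily Grid's exact routine):
static inline void GetWorkSketch(int nwork, int me, int &mywork, int &myoff, int units) {
  int base = nwork / units;
  int rem  = nwork % units;                   // first 'rem' workers take one extra item
  mywork = base + (me < rem ? 1 : 0);
  myoff  = me * base + (me < rem ? me : rem);
}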
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
int calls;
int updates;
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int threads = GridThread::GetThreads();
int HT = GridThread::GetHyperThreads();
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
commtime -=usecond();
auto handle = st.HaloExchangeBegin(in,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Note loop ordering and data layout.
// Designed to create
// - per-thread reuse in L1 cache for U
// - 8 linear-access, unit-stride streams per thread for the Fermion, which are hw prefetchable.
bool local = true;
bool nonlocal = false;
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
} else {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslashtime +=usecond();
jointime -=usecond();
st.HaloExchangeComplete(handle);
jointime +=usecond();
local = false;
nonlocal = true;
dslash1time -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
} else {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslash1time +=usecond();
alltime+=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
@ -593,7 +375,10 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
}
FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
}}
@ -1,3 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -48,8 +49,6 @@ namespace Grid {
class WilsonFermion5DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOptDslash; // these are a temporary hack
static int HandOptDslash; // these are a temporary hack
static const std::vector<int> directions;
static const std::vector<int> displacements;
const int npoint = 8;
@ -61,11 +60,7 @@ namespace Grid {
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
double alltime;
double jointime;
double commtime;
double dslashtime;
double dslash1time;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
@ -86,6 +81,7 @@ namespace Grid {
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
// These can be overridden by fancy 5d chiral action
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@ -120,19 +116,6 @@ namespace Grid {
FermionField &out,
int dag);
void DhopInternalCommsThenCompute(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalCommsOverlapCompute(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
@ -141,14 +124,21 @@ namespace Grid {
GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams());
// Constructors
WilsonFermion5D(int simd,
GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
double _M5,const ImplParams &p= ImplParams());
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
void Report(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
protected:
public:
// Add these to the support from Wilson
GridBase *_FourDimGrid;
@ -31,440 +31,410 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {
int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;
template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
if ( AsmOpt ) {
WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
} else {
for(int site=0;site<Ns;site++) {
for(int s=0;s<Ls;s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
}
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
// No asm implementation yet.
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
// else
for(int site=0;site<Ns;site++) {
for(int s=0;s<Ls;s++) {
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
}
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
int num = 0;
result=zero;
///////////////////////////
// Xp
///////////////////////////
SE=st.GetEntry(ptype,Xp,sF);
if (local && SE->_is_local ) {
if (SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXp(chi,in._odata[SE->_offset]);
}
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
} else {
chi_p=&buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
accumReconXp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
spReconXp(result,Uchi);
///////////////////////////
// Yp
///////////////////////////
SE=st.GetEntry(ptype,Yp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
accumReconYp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
accumReconYp(result,Uchi);
///////////////////////////
// Zp
///////////////////////////
SE=st.GetEntry(ptype,Zp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
accumReconZp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
accumReconZp(result,Uchi);
///////////////////////////
// Tp
///////////////////////////
SE=st.GetEntry(ptype,Tp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
accumReconTp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
accumReconTp(result,Uchi);
///////////////////////////
// Xm
///////////////////////////
SE=st.GetEntry(ptype,Xm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
accumReconXm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
accumReconXm(result,Uchi);
///////////////////////////
// Ym
///////////////////////////
SE=st.GetEntry(ptype,Ym,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
accumReconYm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
accumReconYm(result,Uchi);
///////////////////////////
// Zm
///////////////////////////
SE=st.GetEntry(ptype,Zm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
accumReconZm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
accumReconZm(result,Uchi);
///////////////////////////
// Tm
///////////////////////////
SE=st.GetEntry(ptype,Tm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
accumReconTm(result,Uchi);
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
accumReconTm(result,Uchi);
num++;
}
if ( local ) {
vstream(out._odata[sF],result*(-0.5));
} else if ( num ) {
vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
}
vstream(out._odata[sF],result);
};
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
int num = 0;
result=zero;
///////////////////////////
// Xp
///////////////////////////
SE=st.GetEntry(ptype,Xm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXp(chi,in._odata[SE->_offset]);
}
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
} else {
chi_p=&buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
accumReconXp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
spReconXp(result,Uchi);
///////////////////////////
// Yp
///////////////////////////
SE=st.GetEntry(ptype,Ym,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
accumReconYp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
accumReconYp(result,Uchi);
///////////////////////////
// Zp
///////////////////////////
SE=st.GetEntry(ptype,Zm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
accumReconZp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
accumReconZp(result,Uchi);
///////////////////////////
// Tp
///////////////////////////
SE=st.GetEntry(ptype,Tm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
accumReconTp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
accumReconTp(result,Uchi);
///////////////////////////
// Xm
///////////////////////////
SE=st.GetEntry(ptype,Xp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
accumReconXm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
accumReconXm(result,Uchi);
///////////////////////////
// Ym
///////////////////////////
SE=st.GetEntry(ptype,Yp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
accumReconYm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
accumReconYm(result,Uchi);
///////////////////////////
// Zm
///////////////////////////
SE=st.GetEntry(ptype,Zp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
accumReconZm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
accumReconZm(result,Uchi);
///////////////////////////
// Tm
///////////////////////////
SE=st.GetEntry(ptype,Tp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
accumReconTm(result,Uchi);
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
accumReconTm(result,Uchi);
num++;
}
if ( local ) {
vstream(out._odata[sF],result*(-0.5));
} else if ( num ) {
vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
}
vstream(out._odata[sF],result);
};
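// [Editor's note] Each spProj* above compresses a 4-spinor to a 2-spinor before
// the link multiply, and accumRecon* rebuilds the 4-spinor afterwards. Rough
// arithmetic: per direction the data handled drops from Ns*Nc = 4*3 = 12
// complex numbers to Nhs*Nc = 2*3 = 6, i.e. half the communication volume and
// link-multiply flops.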
template<class Impl>
@ -593,19 +563,13 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
spReconTm(result,Uchi);
}
vstream(out._odata[sF],result*(-0.5));
vstream(out._odata[sF],result);
}
#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
template<class Impl>
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif
FermOpTemplateInstantiate(WilsonKernels);
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
}}
@ -38,37 +38,56 @@ namespace Grid {
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOpt; // these are a temporary hack
static int HandOpt; // these are a temporary hack
};
template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU, const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,const FermionField &in, FermionField &out);
public:
WilsonKernels(const ImplParams &p= ImplParams());
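The AsmOpt/HandOpt statics inherited from WilsonKernelsStatic act as run-time switches between the three bodies declared above; a sketch of the intended dispatch (the literal call sites live in the Dhop drivers, not here):

  if      (WilsonKernelsStatic::AsmOpt)  { /* DiracOptAsmDhopSite(...)     */ }
  else if (WilsonKernelsStatic::HandOpt) { /* DiracOptHandDhopSite(...)    */ }
  else                                   { /* DiracOptGenericDhopSite(...) */ }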


@ -2,6 +2,8 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
Copyright (C) 2015
@ -26,320 +28,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#if defined(AVX512) || defined (IMCI)
#include <simd/Avx512Asm.h>
#undef VLOAD
#undef VSTORE
#undef VMUL
#undef VMADD
#undef ZEND
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef VZERO
#undef VTIMESI
#undef VTIMESMINUSI
#define VZERO(A) VZEROf(A)
#define VMOV(A,B) VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
#define VADD(A,B,C) VADDf(A,B,C)
#define VSUB(A,B,C) VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define ZEND1(A,B,C) ZEND1f(A,B,C)
#define ZEND2(A,B,C) ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
namespace Grid {
namespace QCD {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
uint64_t now;
uint64_t first ;
int offset,local,perm, ptype;
const SiteHalfSpinor *pbuf = & buf[0];
const SiteSpinor *plocal = & in._odata[0];
void *pf;
int osites = in._grid->oSites();
StencilEntry *SE;
//#define STAMP(i) timers[i] = __rdtsc() ;
#define STAMP(i) //timers[i] = __rdtsc() ;
MASK_REGS;
first = __rdtsc();
SE=st.GetEntry(ptype,Xm,ss);
#if 0
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
LOAD64(%r9,pf);
__asm__(
VPREFETCH(0,%r9)
VPREFETCH(1,%r9)
VPREFETCH(2,%r9)
VPREFETCH(3,%r9)
VPREFETCH(4,%r9)
VPREFETCH(5,%r9)
VPREFETCH(6,%r9)
VPREFETCH(7,%r9)
VPREFETCH(8,%r9)
VPREFETCH(9,%r9)
VPREFETCH(10,%r9)
VPREFETCH(11,%r9) );
#endif
// Xm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Ym,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXM(Xm,pf);
}
XM_RECON;
// Ym
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYM(Ym,pf);
}
YM_RECON_ACCUM;
// Zm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Tm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZM(Zm,pf);
}
ZM_RECON_ACCUM;
// Tm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
SE=st.GetEntry(ptype,Tp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTM(Tm,pf);
}
TM_RECON_ACCUM;
// Tp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTP(Tp,pf);
}
TP_RECON_ACCUM;
// Zp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Yp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZP(Zp,pf);
}
ZP_RECON_ACCUM;
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Xp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYP(Yp,pf);
}
YP_RECON_ACCUM;
// Xp
perm = SE->_permute;
offset = SE->_offset;
local = SE->_is_local;
// PREFETCH_R(A);
// Prefetch
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXP(Xp,pf);
}
XP_RECON_ACCUM;
debug:
SAVE_RESULT(&out._odata[ss]);
assert(0);
}
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
#if defined(AVX512)
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInit = setupSigns();
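setupSigns() runs during static initialisation, so the +/-i sign constants the assembler later loads through %r10 exist before any kernel is called. The run-before-main idiom in isolation (a minimal sketch; "table" stands in for the aligned signs vector):

  #include <vector>
  static std::vector<int> table;                  // stands in for Vector<vComplexF> signs
  static int initTable(void){ table = {1,-1}; return 1; }
  static int tableInit = initTable();             // forces initTable() to run before main()

Note the idiom is only safe because the initialiser and its users sit in the same translation unit; across translation units the initialisation order would be unspecified.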
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
template<>
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
}}
#endif
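Rebinding MAYBEPERM/MULT_2SPIN/VMOVIDUP and re-including WilsonKernelsAsmBody.h stamps the same assembler body out once per specialisation; the body header deliberately carries no include guard so it can be instantiated repeatedly. The idiom in miniature (hypothetical file and names):

  // body.inc -- no include guard; contains only the function body "{ OP(arg); }"
  #define OP(x) op_float(x)
  void f_float(int arg)
  #include "body.inc"
  #undef  OP
  #define OP(x) op_double(x)
  void f_double(int arg)
  #include "body.inc"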
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
}}
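Because the kernel bodies live in this .cc rather than in a header, every Impl the library supports needs an explicit instantiation here; a missing line shows up as a link error rather than a compile error. The pattern in miniature (hypothetical function):

  template<class T> void kernel(T x) { /* ... */ }
  template void kernel<float>(float);    // definition emitted into this object file
  template void kernel<double>(double);  // no other translation unit needs the body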


@ -0,0 +1,164 @@
{
int locala,perma, ptypea;
int localb,permb, ptypeb;
uint64_t basea, baseb;
uint64_t basex;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0];
MASK_REGS;
for(int site=0;site<Ns;site++) {
for(int s=0;s<Ls;s++) {
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
basex = basea;
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZP(Zp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTP(Tp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXM(Xm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
basea = (uint64_t)&out._odata[ss];
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
PREFETCH_CHIMU(basex);
SAVE_RESULT(&out._odata[ss]);
ss++;
}
sU++;
}
}
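The body above alternates two address registers, basea and baseb: while one direction is projected and multiplied, the stencil address of the next direction has already been resolved and is handed to the MULT_2SPIN prefetch slot. Stripped to its bones (illustrative; get_addr and compute are placeholders):

  uint64_t a = get_addr(0);               // address of the first direction
  for(int d=0;d<8;d++){
    uint64_t b = get_addr((d+1)%8);       // resolve the *next* address first...
    compute(a,/*prefetch hint*/ b);       // ...so it streams in during the compute
    a = b;
  }

The s loop runs innermost, so the gauge link at site sU is reused across all Ls fifth-dimension slices before sU advances.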


@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);
// To splat or not to splat depends on the implementation
#define MULT_2SPIN(A)\
auto & ref(U._odata[sU](A)); \
U_00 = ref()(0,0);\
U_10 = ref()(1,0);\
U_20 = ref()(2,0);\
U_01 = ref()(0,1);\
U_11 = ref()(1,1); \
U_21 = ref()(2,1);\
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00;\
@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11;\
U_00 = ref()(0,2);\
U_10 = ref()(1,2);\
U_20 = ref()(2,2);\
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02;\
@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
@ -309,546 +311,10 @@ namespace Grid {
namespace QCD {
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
REGISTER Simd Chi_02;
REGISTER Simd Chi_10;
REGISTER Simd Chi_11;
REGISTER Simd Chi_12; // 14 left
REGISTER Simd UChi_00; // two spinor; 6 regs
REGISTER Simd UChi_01;
REGISTER Simd UChi_02;
REGISTER Simd UChi_10;
REGISTER Simd UChi_11;
REGISTER Simd UChi_12; // 8 left
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset, ptype;
int num = 0;
// Xp
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xp);
XP_RECON_ACCUM;
num++;
}
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Yp);
YP_RECON_ACCUM;
num++;
}
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zp);
ZP_RECON_ACCUM;
num++;
}
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tp);
TP_RECON_ACCUM;
num++;
}
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xm);
XM_RECON_ACCUM;
num++;
}
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Ym);
YM_RECON_ACCUM;
num++;
}
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zm);
ZM_RECON_ACCUM;
num++;
}
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tm);
TM_RECON_ACCUM;
num++;
}
SiteSpinor & ref (out._odata[ss]);
if ( Local ) {
vstream(ref()(0)(0),result_00*(-0.5));
vstream(ref()(0)(1),result_01*(-0.5));
vstream(ref()(0)(2),result_02*(-0.5));
vstream(ref()(1)(0),result_10*(-0.5));
vstream(ref()(1)(1),result_11*(-0.5));
vstream(ref()(1)(2),result_12*(-0.5));
vstream(ref()(2)(0),result_20*(-0.5));
vstream(ref()(2)(1),result_21*(-0.5));
vstream(ref()(2)(2),result_22*(-0.5));
vstream(ref()(3)(0),result_30*(-0.5));
vstream(ref()(3)(1),result_31*(-0.5));
vstream(ref()(3)(2),result_32*(-0.5));
return 1;
} else if ( num ) {
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
return 1;
}
return 0;
}
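The Local/Nonlocal flags turn this one kernel into a two-pass scheme: an interior pass that can run while comms are in flight, and an exterior pass that folds in whatever halo terms arrived (the num counter guards the accumulate). A sketch of the apparent intended use (the real call sites live in the Dhop drivers):

  kernel.DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out,true ,false); // out  = -1/2 * local terms
  // ... complete the halo exchange ...
  kernel.DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out,false,true ); // out += -1/2 * halo terms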
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
REGISTER Simd Chi_02;
REGISTER Simd Chi_10;
REGISTER Simd Chi_11;
REGISTER Simd Chi_12; // 14 left
REGISTER Simd UChi_00; // two spinor; 6 regs
REGISTER Simd UChi_01;
REGISTER Simd UChi_02;
REGISTER Simd UChi_10;
REGISTER Simd UChi_11;
REGISTER Simd UChi_12; // 8 left
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset, ptype;
int num = 0;
// Xp
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xp);
XM_RECON_ACCUM;
num++;
}
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Yp);
YM_RECON_ACCUM;
num++;
}
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zp);
ZM_RECON_ACCUM;
num++;
}
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tp);
TM_RECON_ACCUM;
num++;
}
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xm);
XP_RECON_ACCUM;
num++;
}
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Ym);
YP_RECON_ACCUM;
num++;
}
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zm);
ZP_RECON_ACCUM;
num++;
}
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tm);
TP_RECON_ACCUM;
num++;
}
SiteSpinor & ref (out._odata[ss]);
if ( Local ) {
vstream(ref()(0)(0),result_00*(-0.5));
vstream(ref()(0)(1),result_01*(-0.5));
vstream(ref()(0)(2),result_02*(-0.5));
vstream(ref()(1)(0),result_10*(-0.5));
vstream(ref()(1)(1),result_11*(-0.5));
vstream(ref()(1)(2),result_12*(-0.5));
vstream(ref()(2)(0),result_20*(-0.5));
vstream(ref()(2)(1),result_21*(-0.5));
vstream(ref()(2)(2),result_22*(-0.5));
vstream(ref()(3)(0),result_30*(-0.5));
vstream(ref()(3)(1),result_31*(-0.5));
vstream(ref()(3)(2),result_32*(-0.5));
return 1;
} else if ( num ) {
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
return 1;
}
return 0;
}
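The factor of -0.5 folded into these stores is the normalisation of the Wilson hopping term,

  Dhop psi(x) = -1/2 sum_mu [ (1-gamma_mu) U_mu(x) psi(x+mu) + (1+gamma_mu) U_mu(x-mu)^dag psi(x-mu) ],

which is why the interior pass overwrites with result*(-0.5) while the exterior pass accumulates onto it. In the replacement kernels below the unscaled result is streamed out, so the -1/2 appears to have moved out of the site kernel into the caller.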
/*
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
@ -1073,89 +539,346 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
{
SiteSpinor & ref (out._odata[ss]);
vstream(ref()(0)(0),result_00*(-0.5));
vstream(ref()(0)(1),result_01*(-0.5));
vstream(ref()(0)(2),result_02*(-0.5));
vstream(ref()(1)(0),result_10*(-0.5));
vstream(ref()(1)(1),result_11*(-0.5));
vstream(ref()(1)(2),result_12*(-0.5));
vstream(ref()(2)(0),result_20*(-0.5));
vstream(ref()(2)(1),result_21*(-0.5));
vstream(ref()(2)(2),result_22*(-0.5));
vstream(ref()(3)(0),result_30*(-0.5));
vstream(ref()(3)(1),result_31*(-0.5));
vstream(ref()(3)(2),result_32*(-0.5));
vstream(ref()(0)(0),result_00);
vstream(ref()(0)(1),result_01);
vstream(ref()(0)(2),result_02);
vstream(ref()(1)(0),result_10);
vstream(ref()(1)(1),result_11);
vstream(ref()(1)(2),result_12);
vstream(ref()(2)(0),result_20);
vstream(ref()(2)(1),result_21);
vstream(ref()(2)(2),result_22);
vstream(ref()(3)(0),result_30);
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
}
*/
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00; // 12 regs on knc
REGISTER Simd result_01;
REGISTER Simd result_02;
REGISTER Simd result_10;
REGISTER Simd result_11;
REGISTER Simd result_12;
REGISTER Simd result_20;
REGISTER Simd result_21;
REGISTER Simd result_22;
REGISTER Simd result_30;
REGISTER Simd result_31;
REGISTER Simd result_32; // 20 left
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
REGISTER Simd Chi_02;
REGISTER Simd Chi_10;
REGISTER Simd Chi_11;
REGISTER Simd Chi_12; // 14 left
REGISTER Simd UChi_00; // two spinor; 6 regs
REGISTER Simd UChi_01;
REGISTER Simd UChi_02;
REGISTER Simd UChi_10;
REGISTER Simd UChi_11;
REGISTER Simd UChi_12; // 8 left
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset,local,perm, ptype;
// Xp
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XP_PROJ;
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Xp);
}
XP_RECON;
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YP_PROJ;
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Yp);
}
YP_RECON_ACCUM;
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Zp);
}
ZP_RECON_ACCUM;
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TP_PROJ;
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Tp);
}
TP_RECON_ACCUM;
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XM_PROJ;
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Xm);
}
XM_RECON_ACCUM;
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YM_PROJ;
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Ym);
}
YM_RECON_ACCUM;
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Zm);
}
ZM_RECON_ACCUM;
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TM_PROJ;
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Tm);
}
TM_RECON_ACCUM;
{
SiteSpinor & ref (out._odata[ss]);
vstream(ref()(0)(0),result_00);
vstream(ref()(0)(1),result_01);
vstream(ref()(0)(2),result_02);
vstream(ref()(1)(0),result_10);
vstream(ref()(1)(1),result_11);
vstream(ref()(1)(2),result_12);
vstream(ref()(2)(0),result_20);
vstream(ref()(2)(1),result_21);
vstream(ref()(2)(2),result_22);
vstream(ref()(3)(0),result_30);
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
}
////////////////////////////////////////////////
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
//check consistency of return types between these functions and the ones in WilsonKernels.cc
return 0;
assert(0);
}
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
assert(0);
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
assert(0);
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
assert(0);
}
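Turning the unsupported hand-optimised route into a hard assert(0) means a mis-routed G-parity call fails loudly instead of producing silently wrong results; G-parity must stay on the generic kernel. The disable-a-specialisation idiom in miniature (hypothetical types):

  #include <cassert>
  template<class Impl> struct Kern { static void hand(){ /* fast Nc=3 path */ } };
  struct GparityTag {};
  template<> void Kern<GparityTag>::hand(){ assert(0 && "G-parity: use the generic path"); }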
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
////////////// Wilson uses this implementation; it requires Nc=3 //////////////
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
}}


@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
#define INHERIT_GIMPL_TYPES(GImpl) \
typedef typename GImpl::Simd Simd;\
typedef typename GImpl::GaugeLinkField GaugeLinkField;\
typedef typename GImpl::GaugeField GaugeField;
typedef typename GImpl::GaugeField GaugeField;\
typedef typename GImpl::SiteGaugeField SiteGaugeField;\
typedef typename GImpl::SiteGaugeLink SiteGaugeLink;
//
template<class S,int Nrepresentation=Nc>
@ -62,9 +64,9 @@ template<class Gimpl> class WilsonLoops;
// Move this elsewhere?
static inline void AddGaugeLink(GaugeField& U, GaugeLinkField& W, int mu){ // U[mu] += W
PARALLEL_FOR_LOOP
for(auto ss=0;ss<U._grid->oSites();ss++){
U._odata[ss]._internal[mu] = U._odata[ss]._internal[mu] + W._odata[ss]._internal;
}
}


@ -92,13 +92,13 @@ public:
// Create integrator, including the smearing policy
// Smearing policy
std::cout << GridLogMessage << " Creating the Stout class\n";
double rho = 0.1; // smearing parameter
std::cout << GridLogDebug << " Creating the Stout class\n";
double rho = 0.1; // smearing parameter, now hardcoded
int Nsmear = 1; // number of smearing levels
Smear_Stout<Gimpl> Stout(rho);
std::cout << GridLogMessage << " Creating the SmearedConfiguration class\n";
std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
std::cout << GridLogMessage << " done\n";
std::cout << GridLogDebug << " done\n";
//////////////
typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> > IntegratorType;// change here to change the algorithm
IntegratorParameters MDpar(20);
@ -116,27 +116,27 @@ public:
if ( StartType == HotStart ) {
// Hot start
HMCpar.NoMetropolisUntil =0;
HMCpar.NoMetropolisUntil =10;
HMCpar.MetropolisTest = true;
sRNG.SeedFixedIntegers(SerSeed);
pRNG.SeedFixedIntegers(ParSeed);
SU3::HotConfiguration(pRNG, U);
} else if ( StartType == ColdStart ) {
// Cold start
HMCpar.NoMetropolisUntil =0;
HMCpar.NoMetropolisUntil =10;
HMCpar.MetropolisTest = true;
sRNG.SeedFixedIntegers(SerSeed);
pRNG.SeedFixedIntegers(ParSeed);
SU3::ColdConfiguration(pRNG, U);
} else if ( StartType == TepidStart ) {
// Tepid start
HMCpar.NoMetropolisUntil =0;
HMCpar.NoMetropolisUntil =10;
HMCpar.MetropolisTest = true;
sRNG.SeedFixedIntegers(SerSeed);
pRNG.SeedFixedIntegers(ParSeed);
SU3::TepidConfiguration(pRNG, U);
} else if ( StartType == CheckpointStart ) {
HMCpar.NoMetropolisUntil =0;
HMCpar.NoMetropolisUntil =10;
HMCpar.MetropolisTest = true;
// CheckpointRestart
Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
@ -144,7 +144,7 @@ public:
// Attach the gauge field to the smearing policy and fill the smeared set
// notice that the unit configuration is singular in this procedure
std::cout << GridLogMessage << "Filling the smeared set\n";
SmearingPolicy.set_GaugeField(U);
HybridMonteCarlo<GaugeField,IntegratorType> HMC(HMCpar, MDynamics,sRNG,pRNG,U);

View File

@ -60,6 +60,31 @@ namespace Grid {
"-Gamma5 ",
" "
};
SpinMatrix makeGammaProd(const unsigned int i)
{
SpinMatrix g;
g = 1.;
if (i & 0x1)
{
g = g*Gamma(Gamma::GammaMatrix::GammaX);
}
if (i & 0x2)
{
g = g*Gamma(Gamma::GammaMatrix::GammaY);
}
if (i & 0x4)
{
g = g*Gamma(Gamma::GammaMatrix::GammaZ);
}
if (i & 0x8)
{
g = g*Gamma(Gamma::GammaMatrix::GammaT);
}
return g;
}
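A usage sketch (in the Grid::QCD namespace of this file): bit k of the index multiplies in the k-th gamma, so the sixteen values of i enumerate the full Chroma-convention product basis, from the identity (i=0) to GammaX*GammaY*GammaZ*GammaT (i=15):

  for(unsigned int i=0;i<16;i++){
    SpinMatrix g = makeGammaProd(i);   // i=0 -> identity, i=15 -> Gx Gy Gz Gt
    // ... feed g into a contraction ...
  }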
// void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){
// vHalfSpinColourVector hspin;


@ -82,7 +82,10 @@ namespace QCD {
GammaMatrix _g;
};
// Make gamma products (Chroma convention)
SpinMatrix makeGammaProd(const unsigned int i);
/* Gx
* 0 0 0 i
* 0 0 i 0


@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
LatticeMatrix Umu(out._grid);
for(int mu=0;mu<Nd;mu++){
LieRandomize(pRNG,Umu,0.01);
pokeLorentz(out,Umu,mu);
PokeIndex<LorentzIndex>(out,Umu,mu);
}
}
static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
LatticeMatrix Umu(out._grid);
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
pokeLorentz(out,Umu,mu);
PokeIndex<LorentzIndex>(out,Umu,mu);
}
}
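The change from pokeLorentz to PokeIndex<LorentzIndex> spells out which tensor index is being addressed. A minimal sketch of the matching peek/poke pair, with names as in the functions above:

  LatticeMatrix Umu(out._grid);
  Umu = PeekIndex<LorentzIndex>(out,mu);   // read the mu-direction link field
  PokeIndex<LorentzIndex>(out,Umu,mu);     // write it back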


@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
{
return new GridRedBlackCartesian(FourDimGrid);
}
GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
{
std::vector<int> simd(4,1);
return makeFourDimGrid(latt,simd,mpi);
}
GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
{
int N4=FourDimGrid->_ndimension;
@ -58,6 +62,7 @@ GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
return new GridCartesian(latt5,simd5,mpi5);
}
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
{
int N4=FourDimGrid->_ndimension;
@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
}
GridCartesian *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
{
int N4=FourDimGrid->_ndimension;
int nsimd = FourDimGrid->Nsimd();
std::vector<int> latt5(1,Ls);
std::vector<int> simd5(1,nsimd);
std::vector<int> mpi5(1,1);
for(int d=0;d<N4;d++){
latt5.push_back(FourDimGrid->_fdimensions[d]);
simd5.push_back(1);
mpi5.push_back(FourDimGrid->_processors[d]);
}
return new GridCartesian(latt5,simd5,mpi5);
}
GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
{
int N4=FourDimGrid->_ndimension;
int nsimd = FourDimGrid->Nsimd();
int cbd=0;
std::vector<int> latt5(1,Ls);
std::vector<int> simd5(1,nsimd);
std::vector<int> mpi5(1,1);
std::vector<int> cb5(1,1);
for(int d=0;d<N4;d++){
latt5.push_back(FourDimGrid->_fdimensions[d]);
simd5.push_back(1);
mpi5.push_back(FourDimGrid->_processors[d]);
cb5.push_back(1);
}
return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
}
}}
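A usage sketch for the DWF grid makers added here (Ls and a four-dimensional UGrid assumed set up as in the tests): the s-direction takes the full SIMD vector width while the four space-time directions stay scalar, matching the Ls-vectorised DomainWallRedBlack5d kernels above.

  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);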


@ -35,9 +35,14 @@ class SpaceTimeGrid {
static GridCartesian *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
static GridRedBlackCartesian *makeFourDimRedBlackGrid (const GridCartesian *FourDimGrid);
static GridCartesian *makeFiveDimGrid (int Ls,const GridCartesian *FourDimGrid);
static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
static GridCartesian *makeFiveDimDWFGrid (int Ls,const GridCartesian *FourDimGrid);
static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
static GridCartesian *makeFourDimDWFGrid (const std::vector<int> & latt,const std::vector<int> &mpi);
};
}}


@ -52,9 +52,9 @@ namespace Grid {
// or this-> ; there is no "this" in a static method. This forces explicit Gimpl scope
// resolution throughout the usage in this file, and rather defeats the purpose of deriving
// from Gimpl.
plaq = Gimpl::CovShiftBackward(U[mu],mu,
       Gimpl::CovShiftBackward(U[nu],nu,
       Gimpl::CovShiftForward (U[mu],mu,U[nu])));
}
//////////////////////////////////////////////////
// trace of directed plaquette oriented in mu,nu plane
@ -100,16 +100,16 @@ namespace Grid {
//////////////////////////////////////////////////
// average over all x,y,z,t and over all planes of plaquette
//////////////////////////////////////////////////
static RealD avgPlaquette(const GaugeLorentz &Umu){
  RealD sumplaq = sumPlaquette(Umu);
  double vol = Umu._grid->gSites();
  double faces = (1.0*Nd*(Nd-1))/2.0;
  return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
}
//////////////////////////////////////////////////
// average over traced single links
//////////////////////////////////////////////////
static RealD linkTrace(const GaugeLorentz &Umu){
std::vector<GaugeMat> U(4,Umu._grid);
@ -126,47 +126,6 @@ namespace Grid {
return p.real()/vol/4.0/3.0;
};
//////////////////////////////////////////////////
// the sum over all staples on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
GridBase *grid = Umu._grid;
std::vector<GaugeMat> U(4,grid);
for(int d=0;d<Nd;d++){
U[d] = PeekIndex<LorentzIndex>(Umu,d);
}
staple = zero;
for(int nu=0;nu<Nd;nu++){
if(nu != mu) {
// mu
// ^
// |__> nu
// __
// |
// __|
//
staple+=Gimpl::ShiftStaple(Gimpl::CovShiftForward (U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
// __
// |
// |__
//
//
staple+=Gimpl::ShiftStaple(Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
}
}
}
//////////////////////////////////////////////////
// the sum over all staples on each site in direction mu,nu
@ -210,6 +169,51 @@ namespace Grid {
}
}
//////////////////////////////////////////////////
// the sum over all staples on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
GridBase *grid = Umu._grid;
std::vector<GaugeMat> U(Nd,grid);
for(int d=0;d<Nd;d++){
U[d] = PeekIndex<LorentzIndex>(Umu,d);
}
staple = zero;
GaugeMat tmp(grid);
for(int nu=0;nu<Nd;nu++){
if(nu != mu) {
// mu
// ^
// |__> nu
// __
// |
// __|
//
staple+=Gimpl::ShiftStaple(
Gimpl::CovShiftForward (U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
// __
// |
// |__
//
//
staple+=Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
}
}
}
//////////////////////////////////////////////////
// the sum over all staples on each site in direction mu,nu, upper part
@ -247,246 +251,246 @@ namespace Grid {
//////////////////////////////////////////////////////
// Similar to the plaquette routines above; rectangle versions are required
//////////////////////////////////////////////////////
static void dirRectangle(GaugeMat &rect,const std::vector<GaugeMat> &U, const int mu, const int nu)
{
  rect = Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[mu],mu,U[nu]))* // ->->|
         adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[mu],mu,U[mu]))) ;
  rect = rect +
         Gimpl::CovShiftForward(U[mu],mu,Gimpl::CovShiftForward(U[nu],nu,U[nu]))* // ->||
         adj(Gimpl::CovShiftForward(U[nu],nu,Gimpl::CovShiftForward(U[nu],nu,U[mu]))) ;
}
static void traceDirRectangle(LatticeComplex &rect, const std::vector<GaugeMat> &U, const int mu, const int nu)
{
  GaugeMat sp(U[0]._grid);
  dirRectangle(sp,U,mu,nu);
  rect=trace(sp);
}
static void siteRectangle(LatticeComplex &Rect,const std::vector<GaugeMat> &U)
{
  LatticeComplex siteRect(U[0]._grid);
  Rect=zero;
  for(int mu=1;mu<Nd;mu++){
    for(int nu=0;nu<mu;nu++){
      traceDirRectangle(siteRect,U,mu,nu);
      Rect = Rect + siteRect;
    }
  }
}
//////////////////////////////////////////////////
// sum over all x,y,z,t and over all planes of rectangle
//////////////////////////////////////////////////
static RealD sumRectangle(const GaugeLorentz &Umu){
  std::vector<GaugeMat> U(Nd,Umu._grid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
  LatticeComplex Rect(Umu._grid);
  siteRectangle(Rect,U);
  TComplex Tp = sum(Rect);
  Complex p = TensorRemove(Tp);
  return p.real();
}
//////////////////////////////////////////////////
// average over all x,y,z,t and over all planes of rectangle
//////////////////////////////////////////////////
static RealD avgRectangle(const GaugeLorentz &Umu){
  RealD sumrect = sumRectangle(Umu);
  double vol = Umu._grid->gSites();
  double faces = (1.0*Nd*(Nd-1)); // 2 distinct orientations summed
  return sumrect/vol/faces/Nc; // Nd , Nc dependent... FIXME
}
//////////////////////////////////////////////////
// the two-link product used by the optimised rectangle staple
//////////////////////////////////////////////////
static void RectStapleDouble(GaugeMat &U2,const GaugeMat & U,int mu){
  U2 = U * Cshift(U,mu,1);
}
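A quick numerical anchor for the rectangle normalisation above: with Nd=4 there are Nd*(Nd-1)=12 oriented 2x1 faces per site, and on a cold (unit) configuration every rectangle traces to Nc, so avgRectangle should come out at 1.0. A sketch of such a check (assuming the periodic gauge implementation typedef PeriodicGimplR):

  LatticeGaugeField Umu(UGrid);
  SU3::ColdConfiguration(pRNG,Umu);
  RealD r = WilsonLoops<PeriodicGimplR>::avgRectangle(Umu);   // expect ~1.0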
////////////////////////////////////////////////////////////////////////////
// Hop by two optimisation strategy does not work nicely with Gparity. (could do,
// but need to track two deep where cross boundary and apply a conjugation).
// Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do so .
////////////////////////////////////////////////////////////////////////////
static void RectStapleOptimised(GaugeMat &Stap,std::vector<GaugeMat> &U2,std::vector<GaugeMat> &U,int mu){
Stap = zero;
GridBase *grid = U[0]._grid;
GaugeMat Staple2x1 (grid);
GaugeMat tmp (grid);
for(int nu=0;nu<Nd;nu++){
if ( nu!=mu) {
// Up staple ___ ___
// | |
tmp = Cshift(adj(U[nu]),nu,-1);
tmp = adj(U2[mu])*tmp;
tmp = Cshift(tmp,mu,-2);
Staple2x1 = Gimpl::CovShiftForward (U[nu],nu,tmp);
// Down staple
// |___ ___|
//
tmp = adj(U2[mu])*U[nu];
Staple2x1+= Gimpl::CovShiftBackward(U[nu],nu,Cshift(tmp,mu,-2));
// ___ ___
// | ___|
// |___ ___|
//
Stap+= Cshift(Gimpl::CovShiftForward (U[mu],mu,Staple2x1),mu,1);
// ___ ___
// |___ |
// |___ ___|
//
// tmp= Staple2x1* Cshift(U[mu],mu,-2);
// Stap+= Cshift(tmp,mu,1) ;
Stap+= Cshift(Staple2x1,mu,1)*Cshift(U[mu],mu,-1);
// --
// | |
//
// | |
tmp = Cshift(adj(U2[nu]),nu,-2);
tmp = Gimpl::CovShiftBackward(U[mu],mu,tmp);
tmp = U2[nu]*Cshift(tmp,nu,2);
Stap+= Cshift(tmp, mu, 1);
// | |
//
// | |
// --
tmp = Gimpl::CovShiftBackward(U[mu],mu,U2[nu]);
tmp = adj(U2[nu])*tmp;
tmp = Cshift(tmp,nu,-2);
Stap+=Cshift(tmp, mu, 1);
}}
}
static void RectStaple(GaugeMat &Stap,const GaugeLorentz & Umu,int mu)
{
RectStapleUnoptimised(Stap,Umu,mu);
}
static void RectStaple(const GaugeLorentz & Umu,GaugeMat &Stap,
std::vector<GaugeMat> &U2,
std::vector<GaugeMat> &U, int mu)
{
if ( Gimpl::isPeriodicGaugeField() ){
RectStapleOptimised(Stap,U2,U,mu);
} else {
RectStapleUnoptimised(Stap,Umu,mu);
}
}
static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
GridBase *grid = Umu._grid;
std::vector<GaugeMat> U(Nd,grid);
for(int d=0;d<Nd;d++){
U[d] = PeekIndex<LorentzIndex>(Umu,d);
}
Stap=zero;
for(int nu=0;nu<Nd;nu++){
if ( nu!=mu) {
// __ ___
// | __ |
//
Stap+= Gimpl::ShiftStaple(
Gimpl::CovShiftForward (U[mu],mu,
Gimpl::CovShiftForward (U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
// __
// |__ __ |
Stap+= Gimpl::ShiftStaple(
Gimpl::CovShiftForward (U[mu],mu,
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftBackward(U[mu],mu, U[nu])))) , mu);
// __
// |__ __ |
Stap+= Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftForward(U[nu],nu,U[mu])))) , mu);
// __ ___
// |__ |
Stap+= Gimpl::ShiftStaple(
Gimpl::CovShiftForward (U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftBackward(U[nu],nu,U[mu])))) , mu);
// --
// | |
//
// | |
Stap+= Gimpl::ShiftStaple(
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftIdentityBackward(U[nu],nu))))) , mu);
// | |
//
// | |
// --
Stap+= Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
Gimpl::CovShiftForward (U[nu],nu,U[nu])))) , mu);
}}
}
};
typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
}}
#endif

View File

@ -32,6 +32,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <type_traits>
namespace Grid {
// helper function to read space-separated values
template <typename T>
std::vector<T> strToVec(const std::string s)
{
std::istringstream sstr(s);
T buf;
std::vector<T> v;
while(!sstr.eof())
{
sstr >> buf;
v.push_back(buf);
}
return v;
}
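A usage sketch of the helper just defined (annotation, not from the diff). One caveat worth knowing: the eof()-driven loop pushes the result of the final extraction even when it fails, so a string with trailing whitespace yields one spurious element.
std::vector<int> latt = strToVec<int>("8 8 8 16");
// latt now holds {8, 8, 8, 16}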
// output to streams for vectors
template < class T >
inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
{
os << "[";
for (auto &x: v)
{
os << x << " ";
}
if (v.size() > 0)
{
os << "\b";
}
os << "]";
return os;
}
class Serializable {};
@ -138,23 +172,6 @@ namespace Grid {
r.read(s, output);
}
template < class T >
inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
{
os << "[";
for (auto &x: v)
{
os << x << " ";
}
if (v.size() > 0)
{
os << "\b";
}
os << "]";
return os;
}
// Writer template implementation ////////////////////////////////////////////
template <typename T>
Writer<T>::Writer(void)

View File

@ -120,7 +120,7 @@ THE SOFTWARE.
\
\
template <typename T>\
static void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
push(WR,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
pop(WR);\
@ -128,14 +128,14 @@ THE SOFTWARE.
\
\
template <typename T>\
static void read(Reader<T> &RD,const std::string &s, cname &obj){ \
static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \
push(RD,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
pop(RD);\
} \
\
\
friend std::ostream & operator << (std::ostream &os, const cname &obj ) { \
friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
os<<"class "<<#cname<<" {"<<std::endl;\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__)) \
os<<"}"; \
@ -165,7 +165,7 @@ namespace Grid {
class EnumIO<name> {\
public:\
template <typename T>\
static void write(Writer<T> &WR,const std::string &s, const name &obj){ \
static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \
switch (obj) {\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
default: Grid::write(WR,s,#undefname); break;\
@ -173,7 +173,7 @@ namespace Grid {
}\
\
template <typename T>\
static void read(Reader<T> &RD,const std::string &s, name &obj){ \
static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \
std::string buf;\
Grid::read(RD, s, buf);\
if (buf == #undefname) {obj = name::undefname;}\
@ -182,7 +182,7 @@ namespace Grid {
}\
};\
\
std::ostream & operator << (std::ostream &os, const name &obj ) { \
inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
switch (obj) {\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
default: os << #undefname; break;\

View File

@ -80,6 +80,20 @@ void XmlReader::pop(void)
node_ = node_.parent();
}
bool XmlReader::nextElement(const std::string &s)
{
if (node_.next_sibling(s.c_str()))
{
node_ = node_.next_sibling(s.c_str());
return true;
}
else
{
return false;
}
}
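A hedged sketch of the new hook in use (annotation: assumes an XmlReader rd whose cursor already sits on the first node named "elem"; how it got there is elided):
do {
// ... read the fields of the current "elem" node ...
} while (rd.nextElement("elem"));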
template <>
void XmlReader::readDefault(const string &s, string &output)
{

View File

@ -68,6 +68,7 @@ namespace Grid
virtual ~XmlReader(void) = default;
void push(const std::string &s);
void pop(void);
bool nextElement(const std::string &s);
template <typename U>
void readDefault(const std::string &s, U &output);
template <typename U>

File diff suppressed because it is too large

View File

@ -410,22 +410,22 @@ namespace Optimization {
struct Permute{
static inline __m256 Permute0(__m256 in){
return _mm256_permute2f128_ps(in,in,0x01);
return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
};
static inline __m256 Permute1(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
};
static inline __m256 Permute2(__m256 in){
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
};
static inline __m256 Permute3(__m256 in){
return in;
};
static inline __m256d Permute0(__m256d in){
return _mm256_permute2f128_pd(in,in,0x01);
return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
};
static inline __m256d Permute1(__m256d in){
static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
return _mm256_shuffle_pd(in,in,0x5);
};
static inline __m256d Permute2(__m256d in){
@ -437,6 +437,111 @@ namespace Optimization {
};
#if defined (AVX2) || defined (AVXFMA4)
#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
#endif
#if defined (AVX1)
#define _mm256_alignr_epi32(ret,a,b,n) { \
__m128 aa, bb; \
\
aa = _mm256_extractf128_ps(a,1); \
bb = _mm256_extractf128_ps(b,1); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,1); \
\
aa = _mm256_extractf128_ps(a,0); \
bb = _mm256_extractf128_ps(b,0); \
aa = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16); \
ret = _mm256_insertf128_ps(ret,aa,0); \
}
#define _mm256_alignr_epi64(ret,a,b,n) { \
__m128d aa, bb; \
\
aa = _mm256_extractf128_pd(a,1); \
bb = _mm256_extractf128_pd(b,1); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,1); \
\
aa = _mm256_extractf128_pd(a,0); \
bb = _mm256_extractf128_pd(b,0); \
aa = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16); \
ret = _mm256_insertf128_pd(ret,aa,0); \
}
#endif
inline std::ostream & operator << (std::ostream& stream, const __m256 a)
{
const float *p=(const float *)&a;
stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
return stream;
};
inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
{
const double *p=(const double *)&a;
stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
return stream;
};
struct Rotate{
static inline __m256 rotate(__m256 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
static inline __m256d rotate(__m256d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}
template<int n>
static inline __m256 tRotate(__m256 in){
__m256 tmp = Permute::Permute0(in);
__m256 ret;
if ( n > 3 ) {
_mm256_alignr_epi32(ret,in,tmp,n);
} else {
_mm256_alignr_epi32(ret,tmp,in,n);
}
// std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
return ret;
};
template<int n>
static inline __m256d tRotate(__m256d in){
__m256d tmp = Permute::Permute0(in);
__m256d ret;
if ( n > 1 ) {
_mm256_alignr_epi64(ret,in,tmp,n);
} else {
_mm256_alignr_epi64(ret,tmp,in,n);
}
// std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
return ret;
};
};
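The intrinsic sequences above implement a plain lane rotation; a scalar model of the intended semantics (annotation, consistent with the generic implementation later in this diff):
// Scalar model of Rotate::rotate for an N-lane vector: out[i] = in[(i+n) % N].
template<int N, typename T>
void rotate_model(T (&out)[N], const T (&in)[N], int n){
for(int i=0;i<N;i++) out[i] = in[(i+n)%N];
}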
//Complex float Reduce
template<>

View File

@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <immintrin.h>
namespace Grid{
namespace Optimization {
struct Vsplat{
@ -246,26 +246,30 @@ namespace Optimization {
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
//__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
//return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_shuffle_pd(tmp,tmp,0x55);
//__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
//return _mm512_shuffle_pd(tmp,tmp,0x55);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
}
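This rewrite also fixes a real bug: the old TimesI shuffled the uninitialised tmp rather than in. As a scalar model (annotation), both kernels act on interleaved (re,im) lanes; TimesI maps (re,im) to (-im,re) and TimesMinusI maps it to (im,-re):
// Scalar model: multiply each interleaved complex pair by +i.
inline void timesI_model(float *p, int npairs){
for(int k=0;k<npairs;k++){
float re = p[2*k], im = p[2*k+1];
p[2*k] = -im; // real lane receives -imag
p[2*k+1] = re; // imag lane receives +real
}
}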
@ -305,6 +309,54 @@ namespace Optimization {
};
struct Rotate{
static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};
template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
};
};
//////////////////////////////////////////////
// Some Template specialization
@ -345,7 +397,7 @@ namespace Optimization {
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type

View File

@ -35,6 +35,7 @@ Author: neo <cossu@post.kek.jp>
// Time-stamp: <2015-06-09 14:28:02 neo>
//----------------------------------------------------------------------
namespace Grid {
namespace Optimization {
template<class vtype>
@ -54,51 +55,67 @@ namespace Optimization {
struct Vsplat{
//Complex float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(float a, float b){
u128f out;
out.f[0] = a;
out.f[1] = b;
out.f[2] = a;
out.f[3] = b;
return out;
}
// Real float
inline float operator()(float a){
return 0;
inline u128f operator()(float a){
u128f out;
out.f[0] = a;
out.f[1] = a;
out.f[2] = a;
out.f[3] = a;
return out;
}
//Complex double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(double a, double b){
u128d out;
out.f[0] = a;
out.f[1] = b;
return out;
}
//Real double
inline double operator()(double a){
return 0;
inline u128d operator()(double a){
u128d out;
out.f[0] = a;
out.f[1] = a;
return out;
}
//Integer
inline int operator()(Integer a){
return 0;
return a;
}
};
struct Vstore{
//Float
inline void operator()(float a, float* F){
inline void operator()(u128f a, float* F){
memcpy(F,a.f,4*sizeof(float));
}
//Double
inline void operator()(double a, double* D){
inline void operator()(u128d a, double* D){
memcpy(D,a.f,2*sizeof(double));
}
//Integer
inline void operator()(int a, Integer* I){
I[0] = a;
}
};
struct Vstream{
//Float
inline void operator()(float * a, float b){
inline void operator()(float * a, u128f b){
memcpy(a,b.f,4*sizeof(float));
}
//Double
inline void operator()(double * a, double b){
inline void operator()(double * a, u128d b){
memcpy(a,b.f,2*sizeof(double));
}
@ -106,24 +123,40 @@ namespace Optimization {
struct Vset{
// Complex float
inline float operator()(Grid::ComplexF *a){
return 0;
inline u128f operator()(Grid::ComplexF *a){
u128f out;
out.f[0] = a[0].real();
out.f[1] = a[0].imag();
out.f[2] = a[1].real();
out.f[3] = a[1].imag();
return out;
}
// Complex double
inline double operator()(Grid::ComplexD *a){
return 0;
inline u128d operator()(Grid::ComplexD *a){
u128d out;
out.f[0] = a[0].real();
out.f[1] = a[0].imag();
return out;
}
// Real float
inline float operator()(float *a){
return 0;
inline u128f operator()(float *a){
u128f out;
out.f[0] = a[0];
out.f[1] = a[1];
out.f[2] = a[2];
out.f[3] = a[3];
return out;
}
// Real double
inline double operator()(double *a){
return 0;
inline u128d operator()(double *a){
u128d out;
out.f[0] = a[0];
out.f[1] = a[1];
return out;
}
// Integer
inline int operator()(Integer *a){
return 0;
return a[0];
}
@ -145,130 +178,279 @@ namespace Optimization {
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0] + b.f[0];
out.f[1] = a.f[1] + b.f[1];
out.f[2] = a.f[2] + b.f[2];
out.f[3] = a.f[3] + b.f[3];
return out;
}
//Complex/Real double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0] + b.f[0];
out.f[1] = a.f[1] + b.f[1];
return out;
}
//Integer
inline int operator()(int a, int b){
return 0;
return a + b;
}
};
struct Sub{
//Complex/Real float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0] - b.f[0];
out.f[1] = a.f[1] - b.f[1];
out.f[2] = a.f[2] - b.f[2];
out.f[3] = a.f[3] - b.f[3];
return out;
}
//Complex/Real double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0] - b.f[0];
out.f[1] = a.f[1] - b.f[1];
return out;
}
//Integer
inline int operator()(int a, int b){
return 0;
return a-b;
}
};
struct MultComplex{
// Complex float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
return out;
}
// Complex double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
return out;
}
};
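The interleaved product above is the textbook complex multiplication; a self-contained sanity check (sketch, not part of the diff):
#include <cassert>
#include <complex>
int main(){
std::complex<double> a(1.0,2.0), b(3.0,4.0), c = a*b;
// MultComplex computes exactly these two lanes per complex number:
assert(c.real() == 1.0*3.0 - 2.0*4.0);
assert(c.imag() == 1.0*4.0 + 2.0*3.0);
return 0;
}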
struct Mult{
inline float mac(float a, float b,double c){
return 0;
}
inline double mac(double a, double b,double c){
return 0;
}
//CK: Appear unneeded
// inline float mac(float a, float b,double c){
// return 0;
// }
// inline double mac(double a, double b,double c){
// return 0;
// }
// Real float
inline float operator()(float a, float b){
return 0;
inline u128f operator()(u128f a, u128f b){
u128f out;
out.f[0] = a.f[0]*b.f[0];
out.f[1] = a.f[1]*b.f[1];
out.f[2] = a.f[2]*b.f[2];
out.f[3] = a.f[3]*b.f[3];
return out;
}
// Real double
inline double operator()(double a, double b){
return 0;
inline u128d operator()(u128d a, u128d b){
u128d out;
out.f[0] = a.f[0]*b.f[0];
out.f[1] = a.f[1]*b.f[1];
return out;
}
// Integer
inline int operator()(int a, int b){
return 0;
return a*b;
}
};
struct Conj{
// Complex single
inline float operator()(float in){
return 0;
inline u128f operator()(u128f in){
u128f out;
out.f[0] = in.f[0];
out.f[1] = -in.f[1];
out.f[2] = in.f[2];
out.f[3] = -in.f[3];
return out;
}
// Complex double
inline double operator()(double in){
return 0;
inline u128d operator()(u128d in){
u128d out;
out.f[0] = in.f[0];
out.f[1] = -in.f[1];
return out;
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float operator()(float in, float ret){
return 0;
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
u128f out;
out.f[0] = in.f[1];
out.f[1] = -in.f[0];
out.f[2] = in.f[3];
out.f[3] = -in.f[2];
return out;
}
//Complex double
inline double operator()(double in, double ret){
return 0;
inline u128d operator()(u128d in, u128d ret){
u128d out;
out.f[0] = in.f[1];
out.f[1] = -in.f[0];
return out;
}
};
struct TimesI{
//Complex single
inline float operator()(float in, float ret){
return 0;
inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
u128f out;
out.f[0] = -in.f[1];
out.f[1] = in.f[0];
out.f[2] = -in.f[3];
out.f[3] = in.f[2];
return out;
}
//Complex double
inline double operator()(double in, double ret){
return 0;
inline u128d operator()(u128d in, u128d ret){
u128d out;
out.f[0] = -in.f[1];
out.f[1] = in.f[0];
return out;
}
};
//////////////////////////////////////////////
// Some Template specialization
struct Permute{
//We just have to mirror the permutes of Grid_sse4.h
static inline u128f Permute0(u128f in){ //AB CD -> CD AB
u128f out;
out.f[0] = in.f[2];
out.f[1] = in.f[3];
out.f[2] = in.f[0];
out.f[3] = in.f[1];
return out;
};
static inline u128f Permute1(u128f in){ //AB CD -> BA DC
u128f out;
out.f[0] = in.f[1];
out.f[1] = in.f[0];
out.f[2] = in.f[3];
out.f[3] = in.f[2];
return out;
};
static inline u128f Permute2(u128f in){
return in;
};
static inline u128f Permute3(u128f in){
return in;
};
static inline u128d Permute0(u128d in){ //AB -> BA
u128d out;
out.f[0] = in.f[1];
out.f[1] = in.f[0];
return out;
};
static inline u128d Permute1(u128d in){
return in;
};
static inline u128d Permute2(u128d in){
return in;
};
static inline u128d Permute3(u128d in){
return in;
};
};
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
};
};
struct Rotate{
static inline u128f rotate(u128f in,int n){
u128f out;
switch(n){
case 0:
out.f[0] = in.f[0];
out.f[1] = in.f[1];
out.f[2] = in.f[2];
out.f[3] = in.f[3];
break;
case 1:
out.f[0] = in.f[1];
out.f[1] = in.f[2];
out.f[2] = in.f[3];
out.f[3] = in.f[0];
break;
case 2:
out.f[0] = in.f[2];
out.f[1] = in.f[3];
out.f[2] = in.f[0];
out.f[3] = in.f[1];
break;
case 3:
out.f[0] = in.f[3];
out.f[1] = in.f[0];
out.f[2] = in.f[1];
out.f[3] = in.f[2];
break;
default: assert(0);
}
return out;
}
static inline u128d rotate(u128d in,int n){
u128d out;
switch(n){
case 0:
out.f[0] = in.f[0];
out.f[1] = in.f[1];
break;
case 1:
out.f[0] = in.f[1];
out.f[1] = in.f[0];
break;
default: assert(0);
}
return out;
}
};
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
return 0;
inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
return 0;
inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
return in.f[0] + in.f[1] + in.f[2] + in.f[3];
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
return 0;
inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
return Grid::ComplexD(in.f[0],in.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
return 0;
inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
return in.f[0] + in.f[1];
}
//Integer Reduce
@ -282,10 +464,9 @@ namespace Optimization {
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef float SIMD_Ftype; // Single precision type
typedef double SIMD_Dtype; // Double precision type
typedef Optimization::u128f SIMD_Ftype; // Single precision type
typedef Optimization::u128d SIMD_Dtype; // Double precision type
typedef int SIMD_Itype; // Integer type
// prefetch utilities

View File

@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//----------------------------------------------------------------------
#include <immintrin.h>
#include <zmmintrin.h>
namespace Grid{
namespace Optimization {
struct Vsplat{
@ -316,6 +318,54 @@ namespace Optimization {
};
struct Rotate{
static inline __m512 rotate(__m512 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
case 8 : return tRotate<8>(in);break;
case 9 : return tRotate<9>(in);break;
case 10: return tRotate<10>(in);break;
case 11: return tRotate<11>(in);break;
case 12: return tRotate<12>(in);break;
case 13: return tRotate<13>(in);break;
case 14: return tRotate<14>(in);break;
case 15: return tRotate<15>(in);break;
default: assert(0);
}
}
static inline __m512d rotate(__m512d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
case 4: return tRotate<4>(in);break;
case 5: return tRotate<5>(in);break;
case 6: return tRotate<6>(in);break;
case 7: return tRotate<7>(in);break;
default: assert(0);
}
}
template<int n> static inline __m512 tRotate(__m512 in){
return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
};
template<int n> static inline __m512d tRotate(__m512d in){
return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
};
};
//////////////////////////////////////////////
@ -358,7 +408,7 @@ namespace Optimization {
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type

View File

@ -267,10 +267,10 @@ namespace Optimization {
struct Permute{
static inline __m128 Permute0(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
};
static inline __m128 Permute1(__m128 in){
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
};
static inline __m128 Permute2(__m128 in){
return in;
@ -279,7 +279,7 @@ namespace Optimization {
return in;
};
static inline __m128d Permute0(__m128d in){
static inline __m128d Permute0(__m128d in){ //AB -> BA
return _mm_shuffle_pd(in,in,0x1);
};
static inline __m128d Permute1(__m128d in){
@ -294,6 +294,32 @@ namespace Optimization {
};
struct Rotate{
static inline __m128 rotate(__m128 in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
case 2: return tRotate<2>(in);break;
case 3: return tRotate<3>(in);break;
default: assert(0);
}
}
static inline __m128d rotate(__m128d in,int n){
switch(n){
case 0: return tRotate<0>(in);break;
case 1: return tRotate<1>(in);break;
default: assert(0);
}
}
#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
template<int n> static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
};
//////////////////////////////////////////////
// Some Template specialization

View File

@ -299,16 +299,44 @@ namespace Grid {
}
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
if (perm==3) permute3(y,b);
else if (perm==2) permute2(y,b);
else if (perm==1) permute1(y,b);
else if (perm==0) permute0(y,b);
if ( perm & RotateBit ) {
int dist = perm&0xF;
y=rotate(b,dist);
return;
}
switch(perm){
case 3: permute3(y,b); break;
case 2: permute2(y,b); break;
case 1: permute1(y,b); break;
case 0: permute0(y,b); break;
default: assert(0);
}
}
};// end of Grid_simd class definition
////////////////////////////////////////////////////////////////////
// General rotate
////////////////////////////////////////////////////////////////////
template <class S, class V, IfNotComplex<S> =0>
inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
Grid_simd<S,V> ret;
// std::cout << "Rotate Real by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v,nrot);
return ret;
}
template <class S, class V, IfComplex<S> =0>
inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
{
nrot = nrot % Grid_simd<S,V>::Nsimd();
Grid_simd<S,V> ret;
// std::cout << "Rotate Complex by "<<nrot<<std::endl;
ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
return ret;
}
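The factor of two in the complex overload is purely the packing: each complex value occupies two scalar lanes, so rotating by nrot complex elements rotates the underlying scalars by 2*nrot. A standalone model (annotation):
#include <complex>
#include <vector>
// Model: out[i] = in[(i+nrot) % N] on N packed complex values.
std::vector<std::complex<double>> rotateComplexLanes(const std::vector<std::complex<double>> &in, int nrot){
int N = in.size();
std::vector<std::complex<double>> out(N);
for(int i=0;i<N;i++) out[i] = in[(i+nrot)%N];
return out;
}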
///////////////////////
// Splat
///////////////////////
@ -339,6 +367,9 @@ namespace Grid {
template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,S(0.0,0.0)); }// use xor?
template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}
template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}
// if not complex overload here
template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }

197
lib/simd/Intel512avx.h Normal file
View File

@ -0,0 +1,197 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H
////////////////////////////////////////////////////////////
// Knights Landing specials
////////////////////////////////////////////////////////////
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \
VMADDf(tmp,B,Briir) \
VMADDf(tmp,C,Criir)
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \
VMADDMEMd(O,P,B,Biirr) \
VMADDMEMd(O,P,C,Ciirr) \
VMADDd(tmp,B,Briir) \
VMADDd(tmp,C,Criir)
// Merges accumulation for complex dot chain; less efficient under avx512
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
/*
* TimesI is used only in the XP recon
* Could zero the regs and use RECON_ACCUM
*/
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#if 0
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#else
// o_p must point to floating 1.0f/d
//
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)
#endif
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"
#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
#endif

141
lib/simd/Intel512common.h Normal file
View File

@ -0,0 +1,141 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_COMMON_512_H
#define GRID_ASM_INTEL_COMMON_512_H
////////////////////////////////////////////////////////////////////////////////////////////////////
// Opcodes common
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\
"kmovw %%eax, %%k6 \n"\
"mov $0x5555, %%eax \n"\
"kmovw %%eax, %%k7 \n" : : : "%eax");
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
#define VTIMESIf(A,DEST, Z) \
VTIMESI0f(A,DEST, Z) \
VTIMESI1f(A,DEST, Z) \
VTIMESI2f(A,DEST, Z)
#define VTIMESId(A,DEST, Z) \
VTIMESI0d(A,DEST, Z) \
VTIMESI1d(A,DEST, Z) \
VTIMESI2d(A,DEST, Z)
#define VTIMESMINUSIf(A,DEST, Z) \
VTIMESMINUSI0f(A,DEST, Z) \
VTIMESMINUSI1f(A,DEST, Z) \
VTIMESMINUSI2f(A,DEST, Z)
#define VTIMESMINUSId(A,DEST, Z) \
VTIMESMINUSI0d(A,DEST, Z) \
VTIMESMINUSI1d(A,DEST, Z) \
VTIMESMINUSI2d(A,DEST, Z)
#define VACCTIMESIf(A,ACC,tmp) \
VACCTIMESI0f(A,ACC,tmp) \
VACCTIMESI1f(A,ACC,tmp) \
VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESId(A,ACC,tmp) \
VACCTIMESI0d(A,ACC,tmp) \
VACCTIMESI1d(A,ACC,tmp) \
VACCTIMESI2d(A,ACC,tmp)
#define VACCTIMESMINUSIf(A,ACC,tmp) \
VACCTIMESMINUSI0f(A,ACC,tmp) \
VACCTIMESMINUSI1f(A,ACC,tmp) \
VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSId(A,ACC,tmp) \
VACCTIMESMINUSI0d(A,ACC,tmp) \
VACCTIMESMINUSI1d(A,ACC,tmp) \
VACCTIMESMINUSI2d(A,ACC,tmp)
#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
#define LOAD64(A,ptr) LOAD64i(A,ptr)
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#define VEVICT(O,A)
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
// "clevict0 "#O"*64("#A");\n"
#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"
#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1
#define TRAP " int3 ;\n"
#endif

154
lib/simd/Intel512double.h Normal file
View File

@ -0,0 +1,154 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No include guard: this header may be included multiple times; the macros are cleared with #undef first
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A) VZEROd(A)
#define VMOV(A,B) VMOVd(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C) VADDd(A,B,C)
#define VSUB(A,B,C) VSUBd(A,B,C)
#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi)
#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C)
#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)
#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C) VACCTIMESId(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C)
#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSId(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)
#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)
#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0d(A,B)
#define VPERM1(A,B) VPERM1d(A,B)
#define VPERM2(A,B) VPERM2d(A,B)
#define VPERM3(A,B) VPERM3d(A,B)
#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C)
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum)
#define VSHUF(A,B) VSHUFd(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP
#define ZEND1(A,B,C) ZEND1d(A,B,C)
#define ZEND2(A,B,C) ZEND2d(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)

127
lib/simd/Intel512imci.h Normal file
View File

@ -0,0 +1,127 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H
////////////////////////////////////////////////////////////
// Knights Corner specials
////////////////////////////////////////////////////////////
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \
VMULMEMf(O,P,C,Ciirr) \
VMULf(tmp,B,Briir) \
VMULf(tmp,C,Criir)
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMd(O,P,tmp) \
VMULMEMd(O,P,B,Biirr) \
VMULMEMd(O,P,C,Ciirr) \
VMULd(tmp,B,Briir) \
VMULd(tmp,C,Criir)
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMADDMEMf(O,P,B,Biirr) \
VMADDMEMf(O,P,C,Ciirr) \
VMADDf(tmp,B,Briir) \
VMADDf(tmp,C,Criir)
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
VSHUFMEMd(O,P,tmp) \
VMADDMEMd(O,P,B,Biirr) \
VMADDMEMd(O,P,C,Ciirr) \
VMADDd(tmp,B,Briir) \
VMADDd(tmp,C,Criir)
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
#define VTIMESI0f(A,DEST, Z)
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI0d(A,DEST, Z)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI0f(A,DEST,Z)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI0d(A,DEST,Z)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI0f(A,ACC,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
// Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"
#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
#endif

155
lib/simd/Intel512single.h Normal file
View File

@ -0,0 +1,155 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// No include guard: this header may be included multiple times, since it undefines and then redefines the macros
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A) VZEROf(A)
#define VMOV(A,B) VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C) VADDf(A,B,C)
#define VSUB(A,B,C) VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)
#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B) VSHUFf(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP
#define ZEND1(A,B,C) ZEND1f(A,B,C)
#define ZEND2(A,B,C) ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)
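// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical macro names, not part of this header) of
// the guard-free rebinding idiom used above: with no include guard, the same
// generic names can be undefined and repointed at another precision later in
// the translation unit.
#define SCALEf(x) ((x) * 1.0f)   /* single-precision implementation */
#define SCALEd(x) ((x) * 1.0)    /* double-precision implementation */
#undef  SCALE
#define SCALE(x) SCALEf(x)       /* after the "single" header: float  */
/* ... emit a single-precision kernel ... */
#undef  SCALE
#define SCALE(x) SCALEd(x)       /* after the "double" header: double */
// ---------------------------------------------------------------------------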

849
lib/simd/Intel512wilson.h Normal file

@ -0,0 +1,849 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H
//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for the Wilson kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define result_00 %zmm0
#define result_01 %zmm1
#define result_02 %zmm2
#define result_10 %zmm3
#define result_11 %zmm4
#define result_12 %zmm5
#define result_20 %zmm6
#define result_21 %zmm7
#define result_22 %zmm8
#define result_30 %zmm9
#define result_31 %zmm10
#define result_32 %zmm11
#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14
#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17
#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20
#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23
#define Uir %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25
#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31
#define TMP Chi_00
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>
//////////////////////////////////////////////////////////////////
// Macros used to build the Wilson kernel -- could be rationalised and simplified
// a little, as some duplication developed while trying different
// variants during optimisation. Could cut back to only those used.
//////////////////////////////////////////////////////////////////
// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR)
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
#define LOAD_CHIMUi \
LOAD_CHIMU01i \
LOAD_CHIMU23i );
#define LOAD_CHIMU01i\
VLOAD(0,%r8,Chimu_00) \
VLOAD(1,%r8,Chimu_01) \
VLOAD(2,%r8,Chimu_02) \
VLOAD(3,%r8,Chimu_10) \
VLOAD(4,%r8,Chimu_11) \
VLOAD(5,%r8,Chimu_12)
#define LOAD_CHIMU23i\
VLOAD(6,%r8,Chimu_20) \
VLOAD(7,%r8,Chimu_21) \
VLOAD(8,%r8,Chimu_22) \
VLOAD(9,%r8,Chimu_30) \
VLOAD(10,%r8,Chimu_31) \
VLOAD(11,%r8,Chimu_32)
#define SHUF_CHIMU23i\
VSHUFMEM(6,%r8,Chimu_20) \
VSHUFMEM(7,%r8,Chimu_21) \
VSHUFMEM(8,%r8,Chimu_22) \
VSHUFMEM(9,%r8,Chimu_30) \
VSHUFMEM(10,%r8,Chimu_31) \
VSHUFMEM(11,%r8,Chimu_32)
// const SiteHalfSpinor *ptr = &buf[offset];
#define LOAD_CHIi \
VLOAD(0,%r8,Chi_00) \
VLOAD(1,%r8,Chi_01) \
VLOAD(2,%r8,Chi_02) \
VLOAD(3,%r8,Chi_10) \
VLOAD(4,%r8,Chi_11) \
VLOAD(5,%r8,Chi_12)
#define SAVE_UCHIi(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
VSTORE(0,%r8,UChi_00) \
VSTORE(1,%r8,UChi_01) \
VSTORE(2,%r8,UChi_02) \
VSTORE(3,%r8,UChi_10) \
VSTORE(4,%r8,UChi_11) \
VSTORE(5,%r8,UChi_12) \
);
#define SAVE_CHIi(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
VSTORE(0,%r8,Chi_00) \
VSTORE(1,%r8,Chi_01) \
VSTORE(2,%r8,Chi_02) \
VSTORE(3,%r8,Chi_10) \
VSTORE(4,%r8,Chi_11) \
VSTORE(5,%r8,Chi_12) \
);
#define SAVE_RESULTi(PTR)\
LOAD64(%r8,PTR) \
__asm__ ( \
VSTORE(0,%r8,result_00) \
VSTORE(1,%r8,result_01) \
VSTORE(2,%r8,result_02) \
VSTORE(3,%r8,result_10) \
VSTORE(4,%r8,result_11) \
VSTORE(5,%r8,result_12) \
VSTORE(6,%r8,result_20) \
VSTORE(7,%r8,result_21) \
VSTORE(8,%r8,result_22) \
VSTORE(9,%r8,result_30) \
VSTORE(10,%r8,result_31) \
VSTORE(11,%r8,result_32) \
);
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
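// Scalar reference sketch (illustrative only; not used by the kernel) of the
// X+ projection performed by XP_PROJMEM above, following the comment
// hspin(0)=fspin(0)+timesI(fspin(3)); hspin(1)=fspin(1)+timesI(fspin(2)),
// applied to each of the three colour components.
#include <complex>
inline void xp_proj_sketch(const std::complex<double> f[4][3],
                           std::complex<double> h[2][3]) {
  const std::complex<double> I(0.0, 1.0);
  for (int c = 0; c < 3; c++) {
    h[0][c] = f[0][c] + I * f[3][c];   // hspin(0) = fspin(0) + i fspin(3)
    h[1][c] = f[1][c] + I * f[2][c];   // hspin(1) = fspin(1) + i fspin(2)
  }
}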
#define YP_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
VSUBMEM(10,%r8,Chimu_01,Chi_01) \
VSUBMEM(11,%r8,Chimu_02,Chi_02) \
VADDMEM(6,%r8,Chimu_10,Chi_10) \
VADDMEM(7,%r8,Chimu_11,Chi_11) \
VADDMEM(8,%r8,Chimu_12,Chi_12) );
#define ZP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
#define TP_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
VADDMEM(7,%r8,Chimu_01,Chi_01) \
VADDMEM(8,%r8,Chimu_02,Chi_02) \
VADDMEM(9,%r8,Chimu_10,Chi_10) \
VADDMEM(10,%r8,Chimu_11,Chi_11) \
VADDMEM(11,%r8,Chimu_12,Chi_12) );
// hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJMEM(PTR) \
LOAD64(%r8,PTR)\
__asm__ ( \
SHUF_CHIMU23i \
LOAD_CHIi \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
#define YM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VADDMEM(9,%r8 ,Chimu_00,Chi_00) \
VADDMEM(10,%r8,Chimu_01,Chi_01) \
VADDMEM(11,%r8,Chimu_02,Chi_02) \
VSUBMEM(6,%r8,Chimu_10,Chi_10) \
VSUBMEM(7,%r8,Chimu_11,Chi_11) \
VSUBMEM(8,%r8,Chimu_12,Chi_12) );
#define ZM_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
SHUF_CHIMU23i \
LOAD_CHIi \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
#define TM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(6,%r8,Chimu_00,Chi_00) \
VSUBMEM(7,%r8,Chimu_01,Chi_01) \
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
VSUBMEM(10,%r8,Chimu_11,Chi_11) \
VSUBMEM(11,%r8,Chimu_12,Chi_12) );
// fspin(0)=hspin(0)
// fspin(1)=hspin(1)
// fspin(2)=timesMinusI(hspin(1))
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
VZERO(TMP) \
VTIMESMINUSI0(UChi_00,result_30,TMP) \
VTIMESMINUSI0(UChi_10,result_20,TMP) \
VTIMESMINUSI0(UChi_01,result_31,TMP) \
VTIMESMINUSI0(UChi_11,result_21,TMP) \
VTIMESMINUSI0(UChi_02,result_32,TMP) \
VTIMESMINUSI0(UChi_12,result_22,TMP) \
VMOV(UChi_00,result_00) \
VMOV(UChi_10,result_10) \
VMOV(UChi_01,result_01) \
VMOV(UChi_11,result_11) \
VMOV(UChi_02,result_02) \
VMOV(UChi_12,result_12) \
VTIMESMINUSI1(UChi_10,result_20,TMP) \
VTIMESMINUSI1(UChi_11,result_21,TMP) \
VTIMESMINUSI1(UChi_12,result_22,TMP) \
VTIMESMINUSI1(UChi_00,result_30,TMP) \
VTIMESMINUSI1(UChi_01,result_31,TMP) \
VTIMESMINUSI1(UChi_02,result_32,TMP) \
VTIMESMINUSI2(UChi_10,result_20,TMP) \
VTIMESMINUSI2(UChi_11,result_21,TMP) \
VTIMESMINUSI2(UChi_12,result_22,TMP) \
VTIMESMINUSI2(UChi_00,result_30,TMP) \
VTIMESMINUSI2(UChi_01,result_31,TMP) \
VTIMESMINUSI2(UChi_02,result_32,TMP) \
);
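// Scalar sketch (illustrative only) of the X+ reconstruction performed by
// XP_RECON above: fspin(0)=hspin(0); fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1)); fspin(3)=timesMinusI(hspin(0)).
#include <complex>
inline void xp_recon_sketch(const std::complex<double> h[2][3],
                            std::complex<double> r[4][3]) {
  const std::complex<double> mI(0.0, -1.0);
  for (int c = 0; c < 3; c++) {
    r[0][c] = h[0][c];
    r[1][c] = h[1][c];
    r[2][c] = mI * h[1][c];
    r[3][c] = mI * h[0][c];
  }
}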
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
VZERO(TMP)\
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
VACCTIMESMINUSI2(UChi_00,result_30,Z3)\
VACCTIMESMINUSI2(UChi_01,result_31,Z4)\
VACCTIMESMINUSI2(UChi_02,result_32,Z5)\
);
#define XM_RECON __asm__ ( \
VZERO(TMP)\
VTIMESI0(UChi_00,result_30,TMP)\
VTIMESI0(UChi_10,result_20,TMP)\
VTIMESI0(UChi_01,result_31,TMP)\
VTIMESI0(UChi_11,result_21,TMP)\
VTIMESI0(UChi_02,result_32,TMP)\
VTIMESI0(UChi_12,result_22,TMP)\
VMOV(UChi_00,result_00)\
VMOV(UChi_10,result_10)\
VMOV(UChi_01,result_01)\
VMOV(UChi_11,result_11)\
VMOV(UChi_02,result_02)\
VMOV(UChi_12,result_12)\
VTIMESI1(UChi_00,result_30,TMP)\
VTIMESI1(UChi_10,result_20,TMP)\
VTIMESI1(UChi_01,result_31,TMP)\
VTIMESI1(UChi_11,result_21,TMP)\
VTIMESI1(UChi_02,result_32,TMP)\
VTIMESI1(UChi_12,result_22,TMP)\
VTIMESI2(UChi_10,result_20,TMP)\
VTIMESI2(UChi_11,result_21,TMP)\
VTIMESI2(UChi_12,result_22,TMP)\
VTIMESI2(UChi_00,result_30,TMP)\
VTIMESI2(UChi_01,result_31,TMP)\
VTIMESI2(UChi_02,result_32,TMP)\
);
#define XM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_10,result_20,Z0)\
VACCTIMESI0(UChi_00,result_30,Z3)\
VACCTIMESI0(UChi_11,result_21,Z1)\
VACCTIMESI0(UChi_01,result_31,Z4)\
VACCTIMESI0(UChi_12,result_22,Z2)\
VACCTIMESI0(UChi_02,result_32,Z5)\
\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_02,result_02,result_02)\
\
VACCTIMESI1(UChi_10,result_20,Z0)\
VACCTIMESI1(UChi_00,result_30,Z3)\
VACCTIMESI1(UChi_11,result_21,Z1)\
VACCTIMESI1(UChi_01,result_31,Z4)\
VACCTIMESI1(UChi_12,result_22,Z2)\
VACCTIMESI1(UChi_02,result_32,Z5)\
VACCTIMESI2(UChi_10,result_20,Z0)\
VACCTIMESI2(UChi_11,result_21,Z1)\
VACCTIMESI2(UChi_12,result_22,Z2)\
VACCTIMESI2(UChi_00,result_30,Z3)\
VACCTIMESI2(UChi_01,result_31,Z4)\
VACCTIMESI2(UChi_02,result_32,Z5)\
);
#define YP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_10,result_20,result_20)\
VADD(UChi_11,result_21,result_21)\
VADD(UChi_12,result_22,result_22)\
VSUB(UChi_00,result_30,result_30)\
VSUB(UChi_01,result_31,result_31)\
VSUB(UChi_02,result_32,result_32) );
#define YM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_10,result_20,result_20)\
VSUB(UChi_11,result_21,result_21)\
VSUB(UChi_12,result_22,result_22)\
VADD(UChi_00,result_30,result_30)\
VADD(UChi_01,result_31,result_31)\
VADD(UChi_02,result_32,result_32) );
#define ZP_RECON_ACCUM __asm__ ( \
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
VACCTIMESI0(UChi_10,result_30,Z3)\
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
VACCTIMESI0(UChi_11,result_31,Z4)\
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
VACCTIMESI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
VACCTIMESI1(UChi_10,result_30,Z3)\
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
VACCTIMESI1(UChi_11,result_31,Z4)\
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
VACCTIMESI1(UChi_12,result_32,Z5)\
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
VACCTIMESMINUSI2(UChi_02,result_22,Z2)\
VACCTIMESI2(UChi_10,result_30,Z3)\
VACCTIMESI2(UChi_11,result_31,Z4)\
VACCTIMESI2(UChi_12,result_32,Z5)\
);
#define ZM_RECON_ACCUM __asm__ ( \
VACCTIMESI0(UChi_00,result_20,Z0)\
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
VACCTIMESI0(UChi_01,result_21,Z1)\
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
VACCTIMESI0(UChi_02,result_22,Z2)\
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI1(UChi_00,result_20,Z0)\
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
VACCTIMESI1(UChi_01,result_21,Z1)\
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
VACCTIMESI1(UChi_02,result_22,Z2)\
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
VACCTIMESI2(UChi_00,result_20,Z0)\
VACCTIMESI2(UChi_01,result_21,Z1)\
VACCTIMESI2(UChi_02,result_22,Z2)\
VACCTIMESMINUSI2(UChi_10,result_30,Z3)\
VACCTIMESMINUSI2(UChi_11,result_31,Z4)\
VACCTIMESMINUSI2(UChi_12,result_32,Z5)\
);
#define TP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_00,result_20,result_20)\
VADD(UChi_10,result_30,result_30)\
VADD(UChi_01,result_21,result_21)\
VADD(UChi_11,result_31,result_31)\
VADD(UChi_02,result_22,result_22)\
VADD(UChi_12,result_32,result_32) );
#define TM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_00,result_20,result_20)\
VSUB(UChi_10,result_30,result_30)\
VSUB(UChi_01,result_21,result_21)\
VSUB(UChi_11,result_31,result_31)\
VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_12,result_32,result_32) );
#define PREFETCH_CHIMU(A) \
LOAD64(%r9,A) \
__asm__ ( \
VPREFETCHG(12,%r9)\
VPREFETCHG(13,%r9)\
VPREFETCHG(14,%r9)\
VPREFETCHG(15,%r9)\
VPREFETCHG(16,%r9)\
VPREFETCHG(17,%r9)\
VPREFETCHG(18,%r9)\
VPREFETCHG(19,%r9)\
VPREFETCHG(20,%r9)\
VPREFETCHG(21,%r9)\
VPREFETCHG(22,%r9)\
VPREFETCHG(23,%r9));
#define PERMUTE_DIR0 __asm__ ( \
VPERM0(Chi_00,Chi_00) \
VPERM0(Chi_01,Chi_01) \
VPERM0(Chi_02,Chi_02) \
VPERM0(Chi_10,Chi_10) \
VPERM0(Chi_11,Chi_11) \
VPERM0(Chi_12,Chi_12) );
#define PERMUTE_DIR1 __asm__ ( \
VPERM1(Chi_00,Chi_00) \
VPERM1(Chi_01,Chi_01) \
VPERM1(Chi_02,Chi_02) \
VPERM1(Chi_10,Chi_10) \
VPERM1(Chi_11,Chi_11) \
VPERM1(Chi_12,Chi_12));
#define PERMUTE_DIR2 __asm__ ( \
VPERM2(Chi_00,Chi_00) \
VPERM2(Chi_01,Chi_01) \
VPERM2(Chi_02,Chi_02) \
VPERM2(Chi_10,Chi_10) \
VPERM2(Chi_11,Chi_11) \
VPERM2(Chi_12,Chi_12) );
#define PERMUTE_DIR3 __asm__ ( \
VPERM3(Chi_00,Chi_00) \
VPERM3(Chi_01,Chi_01) \
VPERM3(Chi_02,Chi_02) \
VPERM3(Chi_10,Chi_10) \
VPERM3(Chi_11,Chi_11) \
VPERM3(Chi_12,Chi_12) );
#define MULT_ADDSUB_2SPIN(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VPREFETCH2(9,%r8) \
VPREFETCH2(10,%r8) \
VPREFETCH2(11,%r8) \
VPREFETCH2(12,%r8) \
VPREFETCH2(13,%r8) \
VPREFETCH2(14,%r8) \
VPREFETCH2(15,%r8) \
VPREFETCH2(16,%r8) \
VPREFETCH2(17,%r8) \
VSHUF(Chi_00,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
/*6*/ \
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
VPREFETCHG(0,%r9) \
VPREFETCHG(1,%r9) \
VPREFETCHG(2,%r9) \
VPREFETCHG(3,%r9) \
/*18*/ \
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
VMADDSUB(Z3,Chi_10,UChi_10) \
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
VMADDSUB(Z5,Chi_10,UChi_12) \
VPREFETCHG(4,%r9) \
VPREFETCHG(5,%r9) \
VPREFETCHG(6,%r9) \
VPREFETCHG(7,%r9) \
/*28*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
VPREFETCH2(12,%r9) \
VPREFETCH2(13,%r9) \
VPREFETCH2(14,%r9) \
VPREFETCH2(15,%r9) \
VPREFETCH2(16,%r9) \
VPREFETCH2(17,%r9) \
VPREFETCH2(18,%r9) \
VPREFETCH2(19,%r9) \
VPREFETCH2(20,%r9) \
VPREFETCH2(21,%r9) \
VPREFETCH2(22,%r9) \
VPREFETCH2(23,%r9) \
/*38*/ \
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) \
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12) \
VPREFETCHG(9,%r8) \
VPREFETCHG(10,%r8) \
VPREFETCHG(11,%r8) \
VPREFETCHG(12,%r8) \
VPREFETCHG(13,%r8) \
VPREFETCHG(14,%r8) \
VPREFETCHG(15,%r8) \
VPREFETCHG(16,%r8) \
VPREFETCHG(17,%r8) \
/*48*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) \
VMADDSUB(Z2,T2,UChi_12) \
VPREFETCHG(8,%r9) \
VPREFETCHG(9,%r9) \
VPREFETCHG(10,%r9) \
VPREFETCHG(11,%r9) \
/*55*/ \
VMADDSUB(Z3,Chi_02,UChi_00) \
VMADDSUB(Z3,Chi_12,UChi_10) \
VMADDSUB(Z4,Chi_02,UChi_01) \
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMADDSUB(Z5,Chi_12,UChi_12) \
/*61 insns*/ );
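// Scalar sketch (illustrative; the macros above keep an interleaved
// real/imaginary layout with duplicated elements, which this ignores) of what
// MULT_ADDSUB_2SPIN computes: the SU(3) link applied to both spin rows of the
// half spinor.
#include <complex>
inline void mult_2spin_sketch(const std::complex<double> U[3][3],
                              const std::complex<double> chi[2][3],
                              std::complex<double> uchi[2][3]) {
  for (int s = 0; s < 2; s++)
    for (int i = 0; i < 3; i++) {
      uchi[s][i] = 0.0;
      for (int j = 0; j < 3; j++)
        uchi[s][i] += U[i][j] * chi[s][j];   // complex 3x3 matrix times vector
    }
}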
#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
VPREFETCHG(0,%r9) \
VPREFETCHG(1,%r9) \
VPREFETCHG(2,%r9) \
VPREFETCHG(3,%r9) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
VPREFETCHG(4,%r9) \
VPREFETCHG(5,%r9) \
VPREFETCHG(6,%r9) \
VPREFETCHG(7,%r9) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
VPREFETCHG(8,%r9) \
VPREFETCHG(9,%r9) \
VPREFETCHG(10,%r9) \
VPREFETCHG(11,%r9) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
VPREFETCH2(12,%r9) \
VPREFETCH2(13,%r9) \
VPREFETCH2(14,%r9) \
VPREFETCH2(15,%r9) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VPREFETCH2(16,%r9) \
VPREFETCH2(17,%r9) \
VPREFETCH2(18,%r9) \
VPREFETCH2(19,%r9) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
VPREFETCH2(20,%r9) \
VPREFETCH2(21,%r9) \
VPREFETCH2(22,%r9) \
VPREFETCH2(23,%r9) \
VPREFETCHG(2,%r8) \
VPREFETCHG(3,%r8) \
VPREFETCH2(4,%r8) \
VPREFETCH2(5,%r8) \
/*42 insns*/ );
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
/* VPREFETCHG(2,%r8)*/ \
/* VPREFETCHG(3,%r8)*/ \
/*42 insns*/ );
#define Z6 Chi_00
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
LOAD64(%r8,ptr) \
__asm__ ( \
VSHUFMEM(0,%r8,Z0) \
VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
VSHUFMEM(3,%r8,Z0) \
VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
VSHUFMEM(6,%r8,Z0) \
VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
/*11 cycles*/ \
VSHUFMEM(1,%r8,Z0) \
VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
VSHUFMEM(4,%r8,Z0) \
VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
VSHUFMEM(7,%r8,Z0) \
VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
/*22 cycles*/ \
VSHUFMEM(2,%r8,Z0) \
VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
VSHUFMEM(5,%r8,Z0) \
VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
VSHUFMEM(8,%r8,Z0) \
VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
/*33 cycles*/ \
VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
/*stall*/ \
/*stall*/ \
/*stall*/ \
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )
#endif


@ -103,9 +103,11 @@ void LebesgueOrder::IterateI(int ND,
} else {
for(int d=0;d<ND;d++){
x[d]=xi[d]+xo[d];
// std::cout << x[d]<<" ";
}
// std::cout << "\n";
IndexInteger index;
grid->IndexFromCoor(x,index,grid->_rdimensions);
Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
_LebesgueReorder.push_back(index);
}
}
@ -188,6 +190,7 @@ void LebesgueOrder::ZGraph(void)
}
assert( _LebesgueReorder.size() == vol );
/*
std::vector<int> coor(4);
for(IndexInteger asite=0;asite<vol;asite++){
grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@ -198,5 +201,6 @@ void LebesgueOrder::ZGraph(void)
<< coor[3]<<"]"
<<std::endl;
}
*/
}
}
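The hunk above routes indexing through the static Lexicographic helpers (the cshift tests below make the matching CoorFromIndex change). A minimal sketch of the pair, assuming the usual x-fastest convention that matches the slex arithmetic in those tests:

#include <vector>
// Hypothetical stand-ins for Lexicographic::IndexFromCoor / CoorFromIndex.
inline void IndexFromCoorSketch(const std::vector<int> &coor, int &index,
                                const std::vector<int> &dims) {
  index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; d--)
    index = index * dims[d] + coor[d];   // coor[0] varies fastest
}
inline void CoorFromIndexSketch(std::vector<int> &coor, int index,
                                const std::vector<int> &dims) {
  coor.resize(dims.size());
  for (size_t d = 0; d < dims.size(); d++) {
    coor[d] = index % dims[d];
    index  /= dims[d];
  }
}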


@ -44,8 +44,8 @@ template<class vsimd,class scalar>
inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y,
std::vector<scalar *> &extracted,int offset){
// FIXME: bounce off memory is painful
static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
scalar*buf = (scalar *)y;
@ -59,8 +59,10 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
template<class vsimd,class scalar>
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y,
std::vector<scalar *> &extracted,int offset){
static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
// replicate n-fold. Used to allow Integer masks to
// predicate floating point assignments of various widths and remain conformable.
@ -85,6 +87,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
scalar *buf = (scalar *)&y;
for(int i=0;i<Nextr;i++){
extracted[i]=buf[i*s];
#ifdef PARANOID
for(int ii=1;ii<s;ii++){
if ( buf[i*s]!=buf[i*s+ii] ){
std::cout<<GridLogMessage << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
@ -96,6 +99,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
}
assert(buf[i*s]==buf[i*s+ii]);
}
#endif
}
};
@ -106,7 +110,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
template<class vsimd,class scalar>
inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type &y,std::vector<scalar> &extracted){
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
static const int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
scalar *buf = (scalar *)&y;
@ -125,9 +129,9 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int Nsimd=vobj::vector_type::Nsimd();
static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
static const int words=sizeof(vobj)/sizeof(vector_type);
int Nextr=extracted.size();
const int words=sizeof(vobj)/sizeof(vector_type);
int s=Nsimd/Nextr;
std::vector<scalar_type *> pointers(Nextr);
@ -148,8 +152,8 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int words=sizeof(vobj)/sizeof(vector_type);
const int Nsimd=vobj::vector_type::Nsimd();
static const int words=sizeof(vobj)/sizeof(vector_type);
static const int Nsimd=vobj::vector_type::Nsimd();
int Nextr=extracted.size();
int s = Nsimd/Nextr;
@ -172,8 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int Nsimd=vobj::vector_type::Nsimd();
const int words=sizeof(vobj)/sizeof(vector_type);
static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
static const int words=sizeof(vobj)/sizeof(vector_type);
int Nextr = extracted.size();
int splat=Nsimd/Nextr;
@ -197,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int Nsimd=vobj::vector_type::Nsimd();
const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
const int words=sizeof(vobj)/sizeof(vector_type);
int Nextr=extracted.size();
@ -224,20 +228,17 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int Nsimd=vobj::vector_type::Nsimd();
const int words=sizeof(vobj)/sizeof(vector_type);
static const int Nsimd=vobj::vector_type::Nsimd();
static const int words=sizeof(vobj)/sizeof(vector_type);
scalar_type *pointer;
scalar_type *vp = (scalar_type *)&vec;
// assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
for(int w=0;w<words;w++){
for(int i=0;i<Nsimd;i++){
pointer=(scalar_type *)&extracted[i][offset];
for(int w=0;w<words;w++){
vp[w*Nsimd+i] = pointer[w];
}
}
vp[w*Nsimd+i] = ((scalar_type *)&extracted[i][offset])[w];
}}
}
template<class vobj> inline
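These extract/merge hunks make the vector width a compile-time constant. The underlying contract — fan one SIMD vector out to Nextr scalar streams with splat factor s = Nsimd/Nextr, replicating s-fold on the way back, which is what the PARANOID check verifies — can be sketched with a toy vector type (hypothetical, not Grid's vsimd):

#include <vector>
struct vtoy { float v[4]; static int Nsimd() { return 4; } };

inline void extract_sketch(const vtoy &y, std::vector<float *> &out, int off) {
  int Nextr = (int)out.size();
  int s = vtoy::Nsimd() / Nextr;       // sparse occupation when Nextr < Nsimd
  for (int i = 0; i < Nextr; i++) out[i][off] = y.v[i * s];
}
inline void merge_sketch(vtoy &y, std::vector<float *> &in, int off) {
  int Nextr = (int)in.size();
  int s = vtoy::Nsimd() / Nextr;
  for (int i = 0; i < Nextr; i++)
    for (int ii = 0; ii < s; ii++)
      y.v[i * s + ii] = in[i][off];    // replicate s-fold
}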


@ -1,5 +1,5 @@
#!/bin/bash
cd lib
HFILES=`find . -type f -name '*.h'`
@ -18,7 +18,7 @@ TESTS=`ls T*.cc`
TESTLIST=`echo ${TESTS} | sed s/.cc//g `
echo > Make.inc
echo bin_PROGRAMS = ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
echo >> Make.inc
for f in $TESTS


@ -1,5 +1,5 @@
bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd
bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd
Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@ -50,6 +50,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
Test_cshift_red_black_LDADD=-lGrid
Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
Test_cshift_red_black_rotate_LDADD=-lGrid
Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
Test_cshift_rotate_LDADD=-lGrid
Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
Test_dwf_cg_prec_LDADD=-lGrid
@ -90,6 +98,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
Test_dwf_lanczos_LDADD=-lGrid
Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
Test_dwf_rb5d_LDADD=-lGrid
Test_gamma_SOURCES=Test_gamma.cc
Test_gamma_LDADD=-lGrid


@ -8,8 +8,20 @@ endif
AM_CXXFLAGS = -I$(top_srcdir)/lib
AM_LDFLAGS = -L$(top_builddir)/lib
if USE_LAPACK
AM_CXXFLAGS += -DUSE_LAPACK
if USE_LAPACK_LIB
#if test "X${ac_LAPACK}X" != XyesX
AM_CXXFLAGS += -I$(ac_LAPACK)/include
AM_LDFLAGS += -L$(ac_LAPACK)/lib
#fi
endif
endif
if BUILD_ZMM
bin_PROGRAMS=Test_zmm
else
bin_PROGRAMS=
endif
include Make.inc


@ -96,13 +96,13 @@ int main (int argc, char ** argv)
std::vector<int> peer(4);
Complex tmp =cm;
Integer index=real(tmp);
Fine.CoorFromIndex(peer,index,latt_size);
Lexicographic::CoorFromIndex(peer,index,latt_size);
if (nrm > 0){
std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Fine.CoorFromIndex(peer,index,latt_size);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
}
}}}}


@ -132,7 +132,7 @@ int main (int argc, char ** argv)
std::vector<int> peer(4);
Complex ctmp = cm;
Integer index=real(ctmp);
Fine.CoorFromIndex(peer,index,latt_size);
Lexicographic::CoorFromIndex(peer,index,latt_size);
if (nrm > 0){
std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
@ -140,7 +140,7 @@ int main (int argc, char ** argv)
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Fine.CoorFromIndex(peer,index,latt_size);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exit(-1);
}
@ -180,7 +180,7 @@ int main (int argc, char ** argv)
std::vector<int> peer(4);
Complex ctmp=cmeo;
Integer index=real(ctmp);
Fine.CoorFromIndex(peer,index,latt_size);
Lexicographic::CoorFromIndex(peer,index,latt_size);
double nrm = abs(cmeo()()()-scm);
if (nrm != 0) {
@ -189,7 +189,7 @@ int main (int argc, char ** argv)
<< cmeo()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Fine.CoorFromIndex(peer,index,latt_size);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exx=1;
@ -205,7 +205,7 @@ int main (int argc, char ** argv)
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Fine.CoorFromIndex(peer,index,latt_size);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exx=1;
} else if (1) {


@ -0,0 +1,223 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_cshift_red_black.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
int Nd = latt_size.size();
std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
std::vector<int> mpi_layout = GridDefaultMpi();
std::vector<int> mask(Nd,1);
mask[0]=0;
GridCartesian Fine (latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice();
LatticeComplex U(&Fine);
LatticeComplex ShiftU(&Fine);
LatticeComplex rbShiftU(&Fine);
LatticeComplex Ue(&RBFine);
LatticeComplex Uo(&RBFine);
LatticeComplex ShiftUe(&RBFine);
LatticeComplex ShiftUo(&RBFine);
LatticeComplex lex(&Fine);
lex=zero;
Integer stride =1;
{
double nrm;
LatticeComplex coor(&Fine);
for(int d=0;d<Nd;d++){
// Integer i=10000;
Integer i=0;
LatticeCoordinate(coor,d);
lex = lex + coor*stride+i;
stride=stride*latt_size[d];
}
U=lex;
}
pickCheckerboard(Even,Ue,U);
pickCheckerboard(Odd,Uo,U);
// std::cout<<GridLogMessage << U<<std::endl;
std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;
TComplex cm;
TComplex cmeo;
for(int dir=0;dir<Nd;dir++){
// if ( dir!=1 ) continue;
for(int shift=0;shift<latt_size[dir];shift++){
std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction "<<dir<<std::endl;
std::cout<<GridLogMessage<<"Even grid"<<std::endl;
ShiftUe = Cshift(Ue,dir,shift); // Shift everything cb by cb
std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
ShiftUo = Cshift(Uo,dir,shift);
std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
setCheckerboard(rbShiftU,ShiftUe);
setCheckerboard(rbShiftU,ShiftUo);
std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
ShiftU = Cshift(U,dir,shift); // Shift everything
std::cout<<GridLogMessage << "\tShiftU " <<norm2(ShiftU)<<std::endl;
std::vector<int> coor(4);
std::cout<<GridLogMessage << "Checking the non-checkerboard shift"<<std::endl;
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
peekSite(cm,ShiftU,coor);
///////// double nrm=norm2(U);
std::vector<int> scoor(coor);
scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
Integer slex = scoor[0]
+ latt_size[0]*scoor[1]
+ latt_size[0]*latt_size[1]*scoor[2]
+ latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
Complex scm(slex);
double nrm = abs(scm-cm()()());
std::vector<int> peer(4);
Complex ctmp = cm;
Integer index=real(ctmp);
Lexicographic::CoorFromIndex(peer,index,latt_size);
if (nrm > 0){
std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exit(-1);
}
}}}}
int exx=0;
std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
peekSite(cm,rbShiftU,coor);
Integer checkerboard = RBFine.CheckerBoard(coor);
// std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
// std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
// std::cout << "Uo " << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
if ( checkerboard == ShiftUo.checkerboard ) {
peekSite(cmeo,ShiftUo,coor);
} else {
peekSite(cmeo,ShiftUe,coor);
}
std::vector<int> scoor(coor);
scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
Integer slex = scoor[0]
+ latt_size[0]*scoor[1]
+ latt_size[0]*latt_size[1]*scoor[2]
+ latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
Complex scm(slex);
std::vector<int> peer(4);
Complex ctmp=cmeo;
Integer index=real(ctmp);
Lexicographic::CoorFromIndex(peer,index,latt_size);
double nrm = abs(cmeo()()()-scm);
if (nrm != 0) {
std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
<< cmeo()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exx=1;
}
ctmp=cm;
index=real(ctmp);
nrm = abs(scm-cm()()());
if (nrm > 0){
std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exx=1;
} else if (1) {
std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
}
}}}}
if (exx) exit(-1);
}
}
Grid_finalize();
}
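The checkerboard lookup RBFine.CheckerBoard(coor) used above can be sketched assuming the parity is the sum of the masked coordinates mod 2; the test sets mask[0]=0, so dimension 0 does not contribute:

#include <vector>
// Hypothetical stand-in for GridRedBlackCartesian::CheckerBoard.
inline int CheckerBoardSketch(const std::vector<int> &coor,
                              const std::vector<int> &mask) {
  int sum = 0;
  for (size_t d = 0; d < coor.size(); d++)
    if (mask[d]) sum += coor[d];
  return sum & 0x1;   // 0 = even sites, 1 = odd sites
}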

125
tests/Test_cshift_rotate.cc Normal file

@ -0,0 +1,125 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_cshift.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice();
LatticeComplex U(&Fine);
LatticeComplex ShiftU(&Fine);
LatticeComplex lex(&Fine);
lex=zero;
Integer stride =1;
{
double nrm;
LatticeComplex coor(&Fine);
for(int d=0;d<4;d++){
LatticeCoordinate(coor,d);
lex = lex + coor*stride;
stride=stride*latt_size[d];
}
U=lex;
}
TComplex cm;
for(int dir=0;dir<4;dir++){
for(int shift=0;shift<latt_size[dir];shift++){
if ( Fine.IsBoss() )
std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction "<<dir<<std::endl;
ShiftU = Cshift(U,dir,shift); // Shift everything
/*
std::cout << "U[0]" << U[0]<<std::endl;
std::cout << "U[1]" << U[1]<<std::endl;
std::cout << "ShiftU[0]" << ShiftU[0]<<std::endl;
std::cout << "ShiftU[1]" << ShiftU[1]<<std::endl;
*/
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
peekSite(cm,ShiftU,coor);
double nrm=norm2(U);
std::vector<int> scoor(coor);
scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
Integer slex = scoor[0]
+ latt_size[0]*scoor[1]
+ latt_size[0]*latt_size[1]*scoor[2]
+ latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
Complex scm(slex);
nrm = abs(scm-cm()()());
std::vector<int> peer(4);
Complex tmp =cm;
Integer index=real(tmp);
Lexicographic::CoorFromIndex(peer,index,latt_size);
if (nrm > 0){
std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Lexicographic::CoorFromIndex(peer,index,latt_size);
std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
}
/*
else {
std::cerr<<"PASS shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
}
*/
}}}}
}
}
Grid_finalize();
}


@ -42,6 +42,8 @@ public:
int, domaindecompose,
int, domainsize,
int, order,
int, Ls,
double, mq,
double, lo,
double, hi,
int, steps);
@ -263,11 +265,6 @@ public:
resid = norm2(r) /norm2(src);
std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
// Npoly*outer*2 1/2 vol matmuls.
// 71 iters => 20*71 = 1400 matmuls.
// 2*71 = 140 comms.
// Even domain solve
r= where(subset==(Integer)0,r,zz);
_SmootherOperator.AdjOp(r,vec1);
@ -332,7 +329,7 @@ public:
CoarseVector Ctmp(_CoarseOperator.Grid());
CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
ConjugateGradient<CoarseVector> CG(1.0e-3,100000);
ConjugateGradient<CoarseVector> CG(3.0e-3,100000);
// ConjugateGradient<FineField> fCG(3.0e-2,1000);
HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
@ -345,14 +342,14 @@ public:
// Chebyshev<FineField> Cheby (0.5,70.0,30,InverseApproximation);
// Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
Chebyshev<FineField> Cheby (2.0,70.0,15,InverseApproximation);
Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
Chebyshev<FineField> Cheby (params.lo,params.hi,params.order,InverseApproximation);
Chebyshev<FineField> ChebyAccu(params.lo,params.hi,params.order,InverseApproximation);
// Cheby.JacksonSmooth();
// ChebyAccu.JacksonSmooth();
_Aggregates.ProjectToSubspace (Csrc,in);
_Aggregates.PromoteFromSubspace(Csrc,out);
std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
// _Aggregates.ProjectToSubspace (Csrc,in);
// _Aggregates.PromoteFromSubspace(Csrc,out);
// std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
// ofstream fout("smoother");
// Cheby.csv(fout);
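The smoother is now driven by params.lo, params.hi and params.order instead of hard-coded bounds. For reference, a minimal sketch (hypothetical names; assumes InverseApproximation is f(x)=1/x) of a Chebyshev approximation on [lo,hi] built from the standard node formula and evaluated by the Clenshaw recurrence:

#include <cmath>
#include <vector>
struct ChebySketch {
  double lo, hi;
  std::vector<double> c;
  ChebySketch(double lo_, double hi_, int order, double (*f)(double))
      : lo(lo_), hi(hi_), c(order) {
    for (int j = 0; j < order; j++) {                 // Chebyshev coefficients of f
      double s = 0;
      for (int k = 0; k < order; k++) {
        double t = std::cos(M_PI * (k + 0.5) / order);   // node in [-1,1]
        s += f(0.5 * (hi + lo) + 0.5 * (hi - lo) * t) *
             std::cos(M_PI * j * (k + 0.5) / order);
      }
      c[j] = 2.0 * s / order;
    }
  }
  double eval(double x) const {                       // Clenshaw recurrence
    double t = (2.0 * x - lo - hi) / (hi - lo);
    double b1 = 0, b2 = 0;
    for (int j = (int)c.size() - 1; j >= 1; j--) {
      double b = 2.0 * t * b1 - b2 + c[j];
      b2 = b1; b1 = b;
    }
    return t * b1 - b2 + 0.5 * c[0];
  }
};
// Usage: ChebySketch cheby(2.0, 70.0, 15, [](double x) { return 1.0 / x; });
//        double y = cheby.eval(10.0);   // roughly 0.1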
@ -479,7 +476,7 @@ int main (int argc, char ** argv)
read(RD,"params",params);
std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
const int Ls=8;
const int Ls=params.Ls;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -490,10 +487,12 @@ int main (int argc, char ** argv)
///////////////////////////////////////////////////
// Construct a coarsened grid; utility for this?
///////////////////////////////////////////////////
const int block=2;
std::vector<int> block ({2,2,2,2});
const int nbasis= 32;
std::vector<int> clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/block;
clatt[d] = clatt[d]/block[d];
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
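The blocking factor is now a per-dimension vector rather than a single integer. A minimal sketch of the coarsening arithmetic in the loop above (hypothetical helper):

#include <cassert>
#include <vector>
inline std::vector<int> coarsen_sketch(std::vector<int> latt,
                                       const std::vector<int> &block) {
  for (size_t d = 0; d < latt.size(); d++) {
    assert(latt[d] % block[d] == 0);   // blocking must divide the lattice
    latt[d] /= block[d];
  }
  return latt;   // e.g. {16,16,16,16} with block {2,2,2,2} -> {8,8,8,8}
}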
@ -539,7 +538,7 @@ int main (int argc, char ** argv)
// SU3::HotConfiguration(RNG4,Umu);
// Umu=zero;
RealD mass=0.01;
RealD mass=params.mq;
RealD M5=1.8;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@ -548,9 +547,6 @@ int main (int argc, char ** argv)
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
const int nbasis = 32;
// const int nbasis = 4;
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> CoarseOperator;
typedef CoarseOperator::CoarseVector CoarseVector;
@ -564,7 +560,8 @@ int main (int argc, char ** argv)
assert ( (nbasis & 0x1)==0);
int nb=nbasis/2;
std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
// Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
for(int n=0;n<nb;n++){
G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
@ -600,7 +597,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
CG(PosdefLdop,c_src,c_res);
// CG(PosdefLdop,c_src,c_res);
// std::cout<<GridLogMessage << "**************************************************"<< std::endl;
// std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
@ -625,17 +622,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
Precon.SmootherTest(src);
// Precon.SmootherTest(src);
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
PreconDD.SmootherTest(src);
// PreconDD.SmootherTest(src);
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
PreconDD.SAP(src,result);
// PreconDD.SAP(src,result);
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
@ -663,18 +660,18 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
result=zero;
std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
PGCRDD(HermIndefOp,src,result);
// PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
// result=zero;
// std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
// PGCRDD(HermIndefOp,src,result);
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
// PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
// std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
// result=zero;
// PGCR(HermIndefOp,src,result);
PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,8);
std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
result=zero;
PGCR(HermIndefOp,src,result);
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;

Some files were not shown because too many files have changed in this diff.