Merge branch 'master' into hadrons

2026-02-01 12:53:28 +00:00 · 2016-04-30 00:18:31 -07:00
parent 1869d28429 f6c53e5039
commit dc5f32e5f0
33 changed files with 1549 additions and 520 deletions
--- a/benchmarks/Benchmark_zmm.cc
+++ b/benchmarks/Benchmark_zmm.cc
@@ -127,7 +127,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  t1=usecond();
  mfa = flops*ncall/(t1-t0);
  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
-
+  /*
  int dag=DaggerNo;
  t0=usecond();
  for(int i=0;i<1;i++){
@@ -144,12 +144,12 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  t1=usecond();
  mfl1= flops*100/(t1-t0);
  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
-
  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
     << mfc<<" "
     << mfa<<" "
     << mfo<<" "
     << mfl1<<std::endl;
+  */

 #if 0
  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
@@ -166,6 +166,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
  diff = resulto-resulta;
  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
  std::cout<<std::endl;
+  return 0;
 }


--- a/configure.ac
+++ b/configure.ac
@@ -55,15 +55,6 @@ echo :::::::::::::::::::::::::::::::::::::::::::

 AC_CHECK_FUNCS([gettimeofday])

-#AC_CHECK_LIB([gmp],[__gmpf_init],,
-#        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.gmplib.org)])
-
-#AC_CHECK_LIB([mpfr],[mpfr_init],,
-#        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
-#Please install or provide the correct path to your installation
-#Info at: http://www.mpfr.org/)])

 #
 # SIMD instructions selection
@@ -124,7 +115,7 @@ case ${ac_SIMD} in
       echo Configuring for IMCI
       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
       supported="cross compilation"
-       ac_ZMM=yes;
+       ac_ZMM=no;
     ;;
     NEONv8)
       echo Configuring for experimental ARMv8a support 
--- a/lib/Make.inc
+++ b/lib/Make.inc
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -47,6 +47,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 #define _MM_SELECT_TWO_TWO  (A,B)     _MM_SELECT_FOUR_TWO(0,0,A,B)

+#define RotateBit (0x100)
+
 namespace Grid {

  typedef uint32_t Integer;
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -321,6 +321,9 @@ PARALLEL_FOR_LOOP
 	int simd_layout     = _grid->_simd_layout[dimension];
 	int comm_dim        = _grid->_processors[dimension] >1 ;
 	int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
+	int rotate_dim      = _grid->_simd_layout[dimension]>2;
+
+	assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported

 	int sshift[2];
 	
@@ -368,6 +371,7 @@ PARALLEL_FOR_LOOP
      int rd = _grid->_rdimensions[dimension];
      int ld = _grid->_ldimensions[dimension];
      int gd = _grid->_gdimensions[dimension];
+      int ly = _grid->_simd_layout[dimension];

      // Map to always positive shift modulo global full dimension.
      int shift = (shiftpm+fd)%fd;
@@ -398,7 +402,7 @@ PARALLEL_FOR_LOOP
 	  int wrap = sshift/rd;
 	  int  num = sshift%rd;
 	  if ( x< rd-num ) permute_slice=wrap;
-	  else permute_slice = 1-wrap;
+	  else permute_slice = (wrap+1)%ly;
 	}

  	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
@@ -418,7 +422,6 @@ PARALLEL_FOR_LOOP
      int simd_layout     = _grid->_simd_layout[dimension];
      int comm_dim        = _grid->_processors[dimension] >1 ;
      
-      //      assert(simd_layout==1); // Why?
      assert(comm_dim==1);
      int shift = (shiftpm + fd) %fd;
      assert(shift>=0);
@@ -591,8 +594,11 @@ PARALLEL_FOR_LOOP
      template<class compressor>
      void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
      {
-	auto thr = HaloExchangeBegin(source,compress);
-        HaloExchangeComplete(thr);
+	Mergers.resize(0); 
+	Packets.resize(0);
+	HaloGather(source,compress);
+	Communicate();
+	CommsMerge();
      }

      void HaloExchangeComplete(std::thread &thr) 
@@ -749,6 +755,7 @@ PARALLEL_FOR_LOOP
 	  int comm_dim        = _grid->_processors[dimension] >1 ;

 	  assert(comm_dim==1);
+	  // This will not work with a rotate dim
 	  assert(simd_layout==2);
 	  assert(shift>=0);
 	  assert(shift<fd);
@@ -794,6 +801,8 @@ PARALLEL_FOR_LOOP

 	      for(int i=0;i<Nsimd;i++){
 		
+		// FIXME 
+		// This logic is hard coded to simd_layout ==2 and not allowing >2
 		//		std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;

 		int inner_bit = (Nsimd>>(permute_type+1));
--- a/lib/algorithms/approx/Remez.h
+++ b/lib/algorithms/approx/Remez.h
@@ -16,9 +16,13 @@
 #define INCLUDED_ALG_REMEZ_H

 #include <stddef.h>
+#include <Config.h>

-//#include <algorithms/approx/bigfloat.h>
+#ifdef HAVE_GMP_H
+#include <algorithms/approx/bigfloat.h>
+#else
 #include <algorithms/approx/bigfloat_double.h>
+#endif

 #define JMAX 10000 //Maximum number of iterations of Newton's approximation
 #define SUM_MAX 10 // Maximum number of terms in exponential
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -564,6 +564,7 @@ until convergence
 	  
 	  for(int j=k1-1; j<k2+1; ++j){
 	    for(int k=0; k<Nm; ++k){
+	    B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+Nm*j] * evec[k];
 	    }
 	  }
@@ -592,6 +593,7 @@ until convergence
 	  
 	  for(int j = 0; j<Nk; ++j){
 	    for(int k = 0; k<Nk; ++k){
+	    B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+j*Nm] * evec[k];
 	    }
 //	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -138,6 +138,25 @@ public:
    }
    inline int PermuteType(int dimension){
      int permute_type=0;
+      //
+      // FIXME:
+      //
+      // Best way to encode this would be to present a mask 
+      // for which simd dimensions are rotated, and the rotation
+      // size. If there is only one simd dimension rotated, this is just 
+      // a permute. 
+      //
+      // Cases: PermuteType == 1,2,4,8
+      // Distance should be either 0,1,2..
+      //
+      if ( _simd_layout[dimension] > 2 ) { 
+	for(int d=0;d<_ndimension;d++){
+	  if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
+	}
+	permute_type = RotateBit; // How to specify distance; this is not just direction.
+	return permute_type;
+      }
+
      for(int d=_ndimension-1;d>dimension;d--){
 	if (_simd_layout[d]>1 ) permute_type++;
      }
@@ -147,12 +166,12 @@ public:
    // Array sizing queries
    ////////////////////////////////////////////////////////////////

-    inline int iSites(void) { return _isites; };
-    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
-    inline int oSites(void) { return _osites; };
-    inline int lSites(void) { return _isites*_osites; }; 
-    inline int gSites(void) { return _isites*_osites*_Nprocessors; }; 
-    inline int Nd    (void) { return _ndimension;};
+    inline int iSites(void) const { return _isites; };
+    inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
+    inline int oSites(void) const { return _osites; };
+    inline int lSites(void) const { return _isites*_osites; }; 
+    inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
+    inline int Nd    (void) const { return _ndimension;};

    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
@@ -165,6 +184,9 @@ public:
    void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
      Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
    }
+    void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ // decode the local index lidx into the coordinate vector lcoor
+      Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); // same call as GlobalIndexToGlobalCoor, but over _ldimensions
+    }
    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
      gidx=0;
      int mult=1;
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -324,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
  int rd = grid->_rdimensions[dimension];
  int ld = grid->_ldimensions[dimension];
  int gd = grid->_gdimensions[dimension];
+  int ly = grid->_simd_layout[dimension];

  // Map to always positive shift modulo global full dimension.
  shift = (shift+fd)%fd;
@@ -332,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
  // the permute type
  int permute_dim =grid->PermuteDim(dimension);
  int permute_type=grid->PermuteType(dimension);
+  int permute_type_dist;

  for(int x=0;x<rd;x++){       

@@ -343,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
    int sx     = (x+sshift)%rd;

+    // FIXME : This must change where we have a
+    // Rotate slice.
+    
+    // How the permute slice is chosen (this was not documented when first written):
+    // wrap = sshift/rd : non-zero once sshift >= rd.
+    //  num = sshift%rd : the remainder of sshift modulo rd.
+    // 
    int permute_slice=0;
    if(permute_dim){
      int wrap = sshift/rd;
      int  num = sshift%rd;
+
      if ( x< rd-num ) permute_slice=wrap;
-      else permute_slice = 1-wrap;
+      else permute_slice = (wrap+1)%ly;
+
+      if ( (ly>2) && (permute_slice) ) {
+	assert(permute_type & RotateBit);
+	permute_type_dist = permute_type|permute_slice;
+      } else {
+	permute_type_dist = permute_type;
      }
      
-    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
+    }
+
+    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 

  
--- a/lib/lattice/Lattice_peekpoke.h
+++ b/lib/lattice/Lattice_peekpoke.h
@@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
    // Peek a scalar object from the SIMD array
    //////////////////////////////////////////////////////////
    template<class vobj,class sobj>
-    void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
        
      GridBase *grid=l._grid;

--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -142,6 +142,10 @@ namespace Grid {
        mult(&phi(),&U(mu),&chi());
      }

+      template<class ref>
+      inline void loadLinkElement(Simd & reg,ref &memory){ // load one link element from memory into reg
+	reg = memory; // plain assignment, no conversion step in this implementation
+      }
      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
      {
        conformable(Uds._grid,GaugeGrid);
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP

    };

+
+
+    ///////
+    // Single flavour four spinors with colour index, 5d redblack
+    ///////
+    template<class S,int Nrepresentation=Nc>
+    class DomainWallRedBlack5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { 
+    public:
+      // Gauge links are kept as scalar site objects and expanded into Simd values with vsplat on use.
+      typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
+
+      INHERIT_GIMPL_TYPES(Gimpl);
+      
+      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+      template<typename vtype> using iImplHalfSpinor         = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+      template<typename vtype> using iImplDoubledGaugeField  = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
+      template<typename vtype> using iImplGaugeField         = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
+      template<typename vtype> using iImplGaugeLink          = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+    
+      typedef iImplSpinor    <Simd>           SiteSpinor;
+      typedef iImplHalfSpinor<Simd>           SiteHalfSpinor;
+      typedef Lattice<SiteSpinor>             FermionField;
+
+      // Make the doubled gauge field a *scalar*
+      typedef iImplDoubledGaugeField<typename Simd::scalar_type>    SiteDoubledGaugeField; // This is a scalar
+      typedef iImplGaugeField<typename Simd::scalar_type>           SiteScalarGaugeField;  // scalar
+      typedef iImplGaugeLink <typename Simd::scalar_type>           SiteScalarGaugeLink;   // scalar
+
+      typedef Lattice<SiteDoubledGaugeField>                  DoubledGaugeField;
+
+      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+      typedef WilsonImplParams ImplParams;
+      typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
+
+      ImplParams Params;
+
+      DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
+
+      bool overlapCommsCompute(void) { return false; };
+      // Load one element from memory into the Simd value reg using vsplat.
+      template<class ref>
+      inline void loadLinkElement(Simd & reg,ref &memory){
+	vsplat(reg,memory);
+      }
+      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+      { // Fills a local SiteGaugeLink from the scalar U(mu) with vsplat, then calls mult(phi,UU,chi); SE and St are not read.
+	SiteGaugeLink UU;
+	for(int i=0;i<Nrepresentation;i++){
+	  for(int j=0;j<Nrepresentation;j++){
+	    vsplat(UU()()(i,j),U(mu)()(i,j));
+	  }
+	}
+        mult(&phi(),&UU(),&chi());
+      }
+      // Fill Uds site by site: components [0,Nd) come from Umu, components [Nd,2*Nd) from adj(Cshift(U,mu,-1)).
+      inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+      {
+	SiteScalarGaugeField  ScalarUmu;
+	SiteDoubledGaugeField ScalarUds;
+
+        GaugeLinkField U   (Umu._grid);
+	GaugeField     Uadj(Umu._grid);
+        for(int mu=0;mu<Nd;mu++){
+  	  U = PeekIndex<LorentzIndex>(Umu,mu);
+	  U = adj(Cshift(U,mu,-1));
+	  PokeIndex<LorentzIndex>(Uadj,U,mu);
+	}
+
+	for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
+	  std::vector<int> lcoor;
+	  GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
+
+	  peekLocalSite(ScalarUmu,Umu,lcoor);
+	  for(int mu=0;mu<Nd;mu++) ScalarUds(mu) = ScalarUmu(mu); // Nd, not a literal 4: matches the loop above and the Nd-sized site type
+
+	  peekLocalSite(ScalarUmu,Uadj,lcoor);
+	  for(int mu=0;mu<Nd;mu++) ScalarUds(mu+Nd) = ScalarUmu(mu); // second half holds the shifted, adjointed links
+
+	  pokeLocalSite(ScalarUds,Uds,lcoor);
+	}
+
+      }
+      // Force insertion is not implemented for this implementation: both variants below trip assert(0).
+      inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+	assert(0);
+      }   
+
+      inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+	assert(0);
+      }
+
+    };
+
+
    ////////////////////////////////////////////////////////////////////////////////////////
    // Flavour doubled spinors; is Gparity the only? what about C*?
    ////////////////////////////////////////////////////////////////////////////////////////
@@ -290,8 +388,8 @@ PARALLEL_FOR_LOOP
 	conformable(Uds._grid,GaugeGrid);
 	conformable(Umu._grid,GaugeGrid);
 	
-	GaugeLinkField Utmp(GaugeGrid);
-	GaugeLinkField U(GaugeGrid);
+	GaugeLinkField Utmp (GaugeGrid);
+	GaugeLinkField U    (GaugeGrid);
 	GaugeLinkField Uconj(GaugeGrid);
 	
 	Lattice<iScalar<vInteger> > coor(GaugeGrid);
@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
    typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
    typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

+    typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
+    typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
+    typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double
+
    typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
    typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
    typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -288,12 +288,8 @@ PARALLEL_FOR_LOOP
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag) 
  {
-    if ( Impl::overlapCommsCompute () ) { 
-      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
-    } else { 
    DhopInternalCommsThenCompute(st,U,in,out,dag);
  }
-  }
  template<class Impl>
  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 							 const FermionField &in, FermionField &out,int dag) {
@@ -331,15 +327,6 @@ PARALLEL_FOR_LOOP
  };

 
-  template<class Impl>
-  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
-						     const FermionField &in, FermionField &out,int dag) {
-
-    assert(0);
-
-  };
-
- 
  FermOpTemplateInstantiate(WilsonFermion);
  GparityFermOpTemplateInstantiate(WilsonFermion);

--- a/lib/qcd/action/fermion/WilsonFermion.h
+++ b/lib/qcd/action/fermion/WilsonFermion.h
@@ -116,9 +116,6 @@ namespace Grid {

      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 				    const FermionField &in, FermionField &out,int dag) ;
-      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
-				    const FermionField &in, FermionField &out,int dag) ;
-

      // Constructor
      WilsonFermion(GaugeField &_Umu,
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -68,10 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
-  
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
-
  assert(FiveDimRedBlackGrid._checker_dim==1);

  // Dimension zero of the five-d is the Ls direction
@@ -106,6 +104,70 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  dslashtime=0;
  dslash1time=0;
 }  
+
+template<class Impl>
+WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
+				       GridCartesian         &FiveDimGrid,
+				       GridRedBlackCartesian &FiveDimRedBlackGrid,
+				       GridCartesian         &FourDimGrid,
+				       GridRedBlackCartesian &FourDimRedBlackGrid,
+				       RealD _M5,const ImplParams &p) :
+  Kernels(p),
+  _FiveDimGrid        (&FiveDimGrid),
+  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
+  _FourDimGrid        (&FourDimGrid),
+  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
+  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
+  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
+  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
+  M5(_M5),
+  Umu(_FourDimGrid),
+  UmuEven(_FourDimRedBlackGrid),
+  UmuOdd (_FourDimRedBlackGrid),
+  Lebesgue(_FourDimGrid),
+  LebesgueEvenOdd(_FourDimRedBlackGrid)
+{
+  int nsimd = Simd::Nsimd();
+  // Variant for grids whose SIMD lanes all lie along dimension 0 (the s-direction); the simd argument is not read.
+  // some assertions
+  assert(FiveDimGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._ndimension==5);
+  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
+  assert(FourDimGrid._ndimension==4);
+  assert(FourDimRedBlackGrid._ndimension==4);
+
+  // Dimension zero of the five-d is the Ls direction
+  Ls=FiveDimGrid._fdimensions[0];
+  assert(FiveDimGrid._processors[0]         ==1);
+  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
+
+  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
+  assert(FiveDimRedBlackGrid._processors[0] ==1);
+  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
+
+  // Other dimensions must match the decomposition of the four-D fields 
+  for(int d=0;d<4;d++){
+    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
+
+    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
+    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
+
+    // The four space-time directions must carry no SIMD decomposition (compare with ==; never assign inside assert).
+    assert(FourDimGrid._simd_layout[d]          ==1);
+    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
+    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
+
+    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
+    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
+    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
+  }
+  alltime=0; commtime=0; jointime=0; dslashtime=0; dslash1time=0; // reset timing accumulators, as the other constructor does for dslashtime/dslash1time
+  // Import the gauge field into this operator
+  ImportGauge(_Umu);
+}  
+
+
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
@@ -294,15 +356,12 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
+  int LLs = in._grid->_rdimensions[0];
  
  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
+  //  auto handle = st.HaloExchangeBegin(in,compressor);
+  //  st.HaloExchangeComplete(handle);
+  st.HaloExchange(in,compressor);
  commtime +=usecond();

  jointime -=usecond();
@@ -318,97 +377,48 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
    if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
+	for(int s=0;s<LLs;s++){
 	  int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
+	  int sF = s+LLs*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	  }
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
+	for(int s=0;s<LLs;s++){
 	  int sU=ss;
-	    int sF = sd+Ls*sU;
+	  int sF = s+LLs*sU;
 	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    }
-    }
  } else {
    if( this->AsmOptDslash ) {
-      //      for(int i=0;i<1;i++){
-      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-      //	PerformanceCounter Counter(i);
-      //	Counter.Start();
-
-#pragma omp parallel for 
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  for(int s=soff;s<soff+swork;s++){
-
-	    sU=ss+ ssoff;
-
-	    if ( LebesgueOrder::UseLebesgueOrder ) {
-	      sU = lo.Reorder(sU);
-	    }
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      //      Counter.Stop();
-      //      Counter.Report();
-      //      }
-    } else if( this->HandOptDslash ) {
-      /*
-
-#pragma omp parallel for schedule(static)
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  sU=ss+ ssoff;
-	  for(int s=soff;s<soff+swork;s++){
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      */

 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
+	for(int s=0;s<LLs;s++){
 	  int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
+	  int sF = s+LLs*sU;
+	  Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+	}
+      }
+    } else if( this->HandOptDslash ) {
+PARALLEL_FOR_LOOP     
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	for(int s=0;s<LLs;s++){
+	  int sU=ss;
+	  int sF = s+LLs*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
+	for(int s=0;s<LLs;s++){
 	  int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
+	  int sF = s+LLs*sU; 
 	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 	}
      }
@@ -418,251 +428,6 @@ PARALLEL_FOR_LOOP
  alltime+=usecond();
 }

-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
-						 DoubledGaugeField & U,
-						 const FermionField &in, FermionField &out,int dag)
-{
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  alltime-=usecond();
-  Compressor compressor(dag);
-
-  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
-  
-  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
-  commtime +=usecond();
-
-  jointime -=usecond();
-  jointime +=usecond();
-  
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  // Not loop ordering and data layout.
-  // Designed to create 
-  // - per thread reuse in L1 cache for U
-  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-
-#pragma omp parallel 
-  {
-  for(int jjj=0;jjj<100;jjj++){
-#pragma omp barrier
-  dslashtime -=usecond();
-  if ( dag == DaggerYes ) {
-    if( this->HandOptDslash ) {
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-      }
-    } else { 
-
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=ss;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-    }
-  } else {
-    if( this->AsmOptDslash ) {
-      //      for(int i=0;i<1;i++){
-      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-      //	PerformanceCounter Counter(i);
-      //	Counter.Start();
-
-#pragma omp for
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  for(int s=soff;s<soff+swork;s++){
-
-	    sU=ss+ ssoff;
-
-	    if ( LebesgueOrder::UseLebesgueOrder ) {
-	      sU = lo.Reorder(sU);
-	    }
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      //      Counter.Stop();
-      //      Counter.Report();
-      //      }
-    } else if( this->HandOptDslash ) {
-#pragma omp for
-
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    } else { 
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=ss;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    }
-  }
-  }
-  }
-  dslashtime +=usecond();
-  alltime+=usecond();
-}
-
-
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
-						DoubledGaugeField & U,
-						const FermionField &in, FermionField &out,int dag)
-{
-  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  alltime-=usecond();
-  Compressor compressor(dag);
-
-  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
-
-  int threads = GridThread::GetThreads();
-  int HT      = GridThread::GetHyperThreads();
-  int cores   = GridThread::GetCores();
-  int nwork = U._grid->oSites();
-  
-  commtime -=usecond();
-  auto handle = st.HaloExchangeBegin(in,compressor);
-  st.HaloExchangeComplete(handle);
-  commtime +=usecond();
-
-  jointime -=usecond();
-  jointime +=usecond();
-  
-  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-  // Not loop ordering and data layout.
-  // Designed to create 
-  // - per thread reuse in L1 cache for U
-  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-
-#pragma omp parallel 
-  {
-  for(int jjj=0;jjj<100;jjj++){
-#pragma omp barrier
-  dslashtime -=usecond();
-  if ( dag == DaggerYes ) {
-    if( this->HandOptDslash ) {
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=0;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-      }
-    } else { 
-
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	{
-	  int sd;
-	  for(sd=0;sd<Ls;sd++){
-	    int sU=0;
-	    int sF = sd+Ls*sU;
-	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-    }
-  } else {
-    if( this->AsmOptDslash ) {
-      //      for(int i=0;i<1;i++){
-      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-      //	PerformanceCounter Counter(i);
-      //	Counter.Start();
-
-#pragma omp for
-      for(int t=0;t<threads;t++){
-
-	int hyperthread = t%HT;
-	int core        = t/HT;
-
-        int sswork, swork,soff,ssoff,  sU,sF;
-	
-	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
-	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
-
-	for(int ss=0;ss<sswork;ss++){
-	  for(int s=soff;s<soff+swork;s++){
-
-	    sU=0;
-	    sF = s+Ls*sU;
-	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	  }
-	}
-      }
-      //      Counter.Stop();
-      //      Counter.Report();
-      //      }
-    } else if( this->HandOptDslash ) {
-#pragma omp for
-
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=0;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    } else { 
-#pragma omp for
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	int sU=0;
-	for(int s=0;s<Ls;s++){
-	  int sF = s+Ls*sU; 
-	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-	}
-      }
-    }
-  }
-  }
-  }
-  dslashtime +=usecond();
-  alltime+=usecond();
-}
-
-
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
-						     DoubledGaugeField & U,
-						     const FermionField &in, FermionField &out,int dag)
-{
-  assert(0);
-}

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
@@ -706,6 +471,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag

 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
+template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
+template class WilsonFermion5D<DomainWallRedBlack5dImplD>;

 }}

--- a/lib/qcd/action/fermion/WilsonFermion5D.h
+++ b/lib/qcd/action/fermion/WilsonFermion5D.h
@@ -87,6 +87,7 @@ namespace Grid {
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
+      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

      // These can be overridden by fancy 5d chiral action
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@@ -121,32 +122,12 @@ namespace Grid {
 			FermionField &out,
 			int dag);

-      void DhopInternalOMPbench(StencilImpl & st,
-				LebesgueOrder &lo,
-				DoubledGaugeField &U,
-				const FermionField &in, 
-				FermionField &out,
-				int dag);
-
-      void DhopInternalL1bench(StencilImpl & st,
-				LebesgueOrder &lo,
-				DoubledGaugeField &U,
-				const FermionField &in, 
-				FermionField &out,
-				int dag);
-
      void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,
 			const FermionField &in, 
 			FermionField &out,
 			int dag);
-      void DhopInternalCommsOverlapCompute(StencilImpl & st,
-			LebesgueOrder &lo,
-			DoubledGaugeField &U,
-			const FermionField &in, 
-			FermionField &out,
-			int dag);

      // Constructors
      WilsonFermion5D(GaugeField &_Umu,
@@ -156,6 +137,15 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      double _M5,const ImplParams &p= ImplParams());

+      // Constructors
+      WilsonFermion5D(int simd, 
+		      GaugeField &_Umu,
+		      GridCartesian         &FiveDimGrid,
+		      GridRedBlackCartesian &FiveDimRedBlackGrid,
+		      GridCartesian         &FourDimGrid,
+		      GridRedBlackCartesian &FourDimRedBlackGrid,
+		      double _M5,const ImplParams &p= ImplParams());
+
      // DoubleStore
      void ImportGauge(const GaugeField &_Umu);

--- a/lib/qcd/action/fermion/WilsonKernels.cc
+++ b/lib/qcd/action/fermion/WilsonKernels.cc
@@ -529,5 +529,7 @@ void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 #endif

  FermOpTemplateInstantiate(WilsonKernels);
+template class WilsonKernels<DomainWallRedBlack5dImplF>;		
+template class WilsonKernels<DomainWallRedBlack5dImplD>;

 }}
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -27,7 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid.h>
-#if defined(AVX512) || defined (IMCI)
+#if defined(AVX512) 
 //#if defined (IMCI)

 #include <simd/Intel512wilson.h>
@@ -256,5 +256,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
  template class WilsonKernels<WilsonImplD>; 
  template class WilsonKernels<GparityWilsonImplF>;
  template class WilsonKernels<GparityWilsonImplD>;
+  template class WilsonKernels<DomainWallRedBlack5dImplF>;
+  template class WilsonKernels<DomainWallRedBlack5dImplD>;
 }}
 #endif
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chi_11 = ref()(1)(1);\
    Chi_12 = ref()(1)(2);

+// To splat or not to splat depends on the implementation
 #define MULT_2SPIN(A)\
   auto & ref(U._odata[sU](A));	\
-    U_00 = ref()(0,0);\
-    U_10 = ref()(1,0);\
-    U_20 = ref()(2,0);\
-    U_01 = ref()(0,1);\
-    U_11 = ref()(1,1);				\
-    U_21 = ref()(2,1);\
+   Impl::loadLinkElement(U_00,ref()(0,0));	\
+   Impl::loadLinkElement(U_10,ref()(1,0));	\
+   Impl::loadLinkElement(U_20,ref()(2,0));	\
+   Impl::loadLinkElement(U_01,ref()(0,1));	\
+   Impl::loadLinkElement(U_11,ref()(1,1));	\
+   Impl::loadLinkElement(U_21,ref()(2,1));	\
    UChi_00 = U_00*Chi_00;\
    UChi_10 = U_00*Chi_10;\
    UChi_01 = U_10*Chi_00;\
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_11+= U_11*Chi_11;\
    UChi_02+= U_21*Chi_01;\
    UChi_12+= U_21*Chi_11;\
-    U_00 = ref()(0,2);\
-    U_10 = ref()(1,2);\
-    U_20 = ref()(2,2);\
+    Impl::loadLinkElement(U_00,ref()(0,2));	\
+    Impl::loadLinkElement(U_10,ref()(1,2));	\
+    Impl::loadLinkElement(U_20,ref()(2,2));	\
    UChi_00+= U_00*Chi_02;\
    UChi_10+= U_00*Chi_12;\
    UChi_01+= U_10*Chi_02;\
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    UChi_02+= U_20*Chi_02;\
    UChi_12+= U_20*Chi_12;

+
 #define PERMUTE_DIR(dir)			\
      permute##dir(Chi_00,Chi_00);\
      permute##dir(Chi_01,Chi_01);\
@@ -809,7 +811,6 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
 							     int sF,int sU,const FermionField &in, FermionField &out)
 {
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
-  //check consistency of return types between these functions and the ones in WilsonKernels.cc
  return 0;
  
 }
@@ -843,6 +844,47 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D



+  //////////////
+/*
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
+  return 0;
+  
+}
+
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+  return 0;
+}
+
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+							     int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+  return 0;
+}
+
+template<>
+int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								int sF,int sU,const FermionField &in, FermionField &out)
+{
+  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
+  return 0;
+}
+
+*/
+
 template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 							       int ss,int sU,const FermionField &in, FermionField &out);
@@ -870,4 +912,21 @@ template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilI
 									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 									 int ss,int sU,const FermionField &in, FermionField &out);

+
+
+
+template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+								      int ss,int sU,const FermionField &in, FermionField &out);
+template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+									 int ss,int sU,const FermionField &in, FermionField &out);
+template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
+									 int ss,int sU,const FermionField &in, FermionField &out);
+
+
 }}
--- a/lib/qcd/action/gauge/GaugeImpl.h
+++ b/lib/qcd/action/gauge/GaugeImpl.h
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 #define INHERIT_GIMPL_TYPES(GImpl) \
    typedef typename GImpl::Simd                           Simd;\
    typedef typename GImpl::GaugeLinkField       GaugeLinkField;\
-    typedef typename GImpl::GaugeField               GaugeField;	
+    typedef typename GImpl::GaugeField               GaugeField;\
+    typedef typename GImpl::SiteGaugeField       SiteGaugeField;\
+    typedef typename GImpl::SiteGaugeLink         SiteGaugeLink;


    // 
--- a/lib/qcd/utils/SpaceTimeGrid.cc
+++ b/lib/qcd/utils/SpaceTimeGrid.cc
@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 {
  return new GridRedBlackCartesian(FourDimGrid); 
 }
-
+GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
+{ // Convenience wrapper around makeFourDimGrid with a fixed SIMD layout
+  std::vector<int> simd(4,1); // SIMD layout of 1 in each of the four dimensions
+  return makeFourDimGrid(latt,simd,mpi); // delegate to the general 4D factory
+}
 GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;
@@ -58,6 +62,7 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
  return new GridCartesian(latt5,simd5,mpi5); 
 }

+
 GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
 }

+
+GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
+{ // Builds a 5D GridCartesian from a 4D grid; the result is allocated with new
+  int N4=FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+  // Dimension 0 is the extra dimension: extent Ls, SIMD layout nsimd, processor count 1.
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int>  mpi5(1,1);
+  // Append the 4D extents and processor layout; the SIMD layout is 1 in each of those dimensions.
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+     mpi5.push_back(FourDimGrid->_processors[d]);
+  }
+  return new GridCartesian(latt5,simd5,mpi5); 
+}
+
+GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
+{ // Red-black counterpart of makeFiveDimDWFGrid; the result is allocated with new
+  int N4=FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+  int cbd=0; // NOTE(review): presumably the index of the checkerboarded dimension - confirm against GridRedBlackCartesian
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int>  mpi5(1,1);
+  std::vector<int>   cb5(1,1); // NOTE(review): presumably a per-dimension checkerboard mask, 1 in every dimension here - confirm
+  // Append the 4D extents and processor layout; SIMD layout and cb5 entry are 1 for each of those dimensions.
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+     mpi5.push_back(FourDimGrid->_processors[d]);
+      cb5.push_back(1);
+    }
+  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd); 
+}
+
+
 }}
--- a/lib/qcd/utils/SpaceTimeGrid.h
+++ b/lib/qcd/utils/SpaceTimeGrid.h
@@ -35,9 +35,14 @@ class SpaceTimeGrid {

  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);
+
  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
  static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);

+  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
+  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
+  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
+
 };

 }}
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -410,22 +410,22 @@ namespace Optimization {
  struct Permute{

    static inline __m256 Permute0(__m256 in){
-      return _mm256_permute2f128_ps(in,in,0x01);
+      return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
    };
    static inline __m256 Permute1(__m256 in){
-      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
    };
    static inline __m256 Permute2(__m256 in){
-      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
    };
    static inline __m256 Permute3(__m256 in){
      return in;
    };

    static inline __m256d Permute0(__m256d in){
-      return _mm256_permute2f128_pd(in,in,0x01);
+      return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
    };
-    static inline __m256d Permute1(__m256d in){
+    static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
      return _mm256_shuffle_pd(in,in,0x5);
    };
    static inline __m256d Permute2(__m256d in){
@@ -437,6 +437,111 @@ namespace Optimization {

  };

+#if defined (AVX2)
+#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,((n)*4)%16)
+#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,((n)*8)%16)
+#endif
+// _mm256_alignr_epi8 is an AVX2 instruction: AVX1 and AVXFMA4 builds emulate it on each 128-bit half.
+#if defined (AVX1) || defined (AVXFMA4)
+
+#define _mm256_alignr_epi32(ret,a,b,n) {	\
+    __m128 aa, bb;				\
+						\
+    aa  = _mm256_extractf128_ps(a,1);		\
+    bb  = _mm256_extractf128_ps(b,1);		\
+    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*4)%16);	\
+    ret = _mm256_insertf128_ps(ret,aa,1);	\
+						\
+    aa  = _mm256_extractf128_ps(a,0);		\
+    bb  = _mm256_extractf128_ps(b,0);		\
+    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*4)%16);	\
+    ret = _mm256_insertf128_ps(ret,aa,0);	\
+  }
+
+#define _mm256_alignr_epi64(ret,a,b,n) {	\
+    __m128d aa, bb;				\
+						\
+    aa  = _mm256_extractf128_pd(a,1);		\
+    bb  = _mm256_extractf128_pd(b,1);		\
+    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*8)%16);	\
+    ret = _mm256_insertf128_pd(ret,aa,1);	\
+						\
+    aa  = _mm256_extractf128_pd(a,0);		\
+    bb  = _mm256_extractf128_pd(b,0);		\
+    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,((n)*8)%16);	\
+    ret = _mm256_insertf128_pd(ret,aa,0);	\
+  }
+
+#endif
+
+    inline std::ostream & operator << (std::ostream& stream, const __m256 a)
+    { // Debug printer: writes the eight floats of the register as {p0,...,p7}
+      const float *p=(const float *)&a; // view the register as its eight packed floats
+      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
+      return stream;
+    };
+    inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
+    { // Debug printer: writes the four doubles of the register as {p0,...,p3}
+      const double *p=(const double *)&a; // view the register as its four packed doubles
+      stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
+      return stream;
+    };
+
+  struct Rotate{
+    // rotate(in,n): runtime n is dispatched to tRotate<n>, whose alignr count is a template constant.
+    static inline __m256 rotate(__m256 in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0); return in; // with NDEBUG the assert vanishes: never fall off the end of a non-void function
+      }
+    }
+    static inline __m256d rotate(__m256d in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      default: assert(0); return in; // with NDEBUG the assert vanishes: never fall off the end of a non-void function
+      }
+    }
+    // tRotate<n>: tmp is `in` with its 128-bit halves swapped (Permute0); the alignr macro then
+    // combines in and tmp, with the operand order chosen by whether n reaches the second half.
+    template<int n>
+    static inline __m256 tRotate(__m256 in){ 
+      __m256 tmp = Permute::Permute0(in);
+      __m256 ret = in; // initialised because the AVX1 alignr macro reads ret in its first insertf128
+      if ( n > 3 ) { 
+	_mm256_alignr_epi32(ret,in,tmp,n);  
+      } else {
+        _mm256_alignr_epi32(ret,tmp,in,n);          
+      }
+      //      std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
+      return ret;
+    };
+
+    template<int n>
+    static inline __m256d tRotate(__m256d in){ 
+      __m256d tmp = Permute::Permute0(in);
+      __m256d ret = in; // initialised because the AVX1 alignr macro reads ret in its first insertf128
+      if ( n > 1 ) {
+	_mm256_alignr_epi64(ret,in,tmp,n);          
+      } else {
+        _mm256_alignr_epi64(ret,tmp,in,n);          
+      }
+      //      std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
+      return ret;
+    };
+
+  };
+
+

  //Complex float Reduce
  template<>
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -309,6 +309,54 @@ namespace Optimization {
  };


+  struct Rotate{
+    // rotate(in,n): runtime n is dispatched to tRotate<n>, whose alignr count is a template constant.
+    static inline __m512 rotate(__m512 in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+
+      case 8 : return tRotate<8>(in);break;
+      case 9 : return tRotate<9>(in);break;
+      case 10: return tRotate<10>(in);break;
+      case 11: return tRotate<11>(in);break;
+      case 12: return tRotate<12>(in);break;
+      case 13: return tRotate<13>(in);break;
+      case 14: return tRotate<14>(in);break;
+      case 15: return tRotate<15>(in);break;
+      default: assert(0); return in; // with NDEBUG the assert vanishes: never fall off the end of a non-void function
+      }
+    }
+    static inline __m512d rotate(__m512d in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0); return in; // with NDEBUG the assert vanishes: never fall off the end of a non-void function
+      }
+    }
+    // tRotate<n>: alignr is given `in` as both operands, with n as the element count.
+    template<int n> static inline __m512 tRotate(__m512 in){ 
+      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
+    };
+
+    template<int n> static inline __m512d tRotate(__m512d in){ 
+      return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);          
+    };
+
+  };
+
  //////////////////////////////////////////////
  // Some Template specialization
  
--- a/lib/simd/Grid_empty.h
+++ b/lib/simd/Grid_empty.h
@@ -55,51 +55,67 @@ namespace Optimization {
  
  struct Vsplat{
    //Complex float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(float a, float b){
+      u128f out; 
+      out.f[0] = a;
+      out.f[1] = b;
+      out.f[2] = a;
+      out.f[3] = b;
+      return out;
    }
    // Real float
-    inline float operator()(float a){
-      return 0;
+    inline u128f operator()(float a){
+      u128f out; 
+      out.f[0] = a;
+      out.f[1] = a;
+      out.f[2] = a;
+      out.f[3] = a;
+      return out;
    }
    //Complex double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(double a, double b){
+      u128d out; 
+      out.f[0] = a;
+      out.f[1] = b;
+      return out;
    }
    //Real double
-    inline double operator()(double a){
-      return 0;
+    inline u128d operator()(double a){
+      u128d out; 
+      out.f[0] = a;
+      out.f[1] = a;
+      return out;
    }
    //Integer
    inline int operator()(Integer a){
-      return 0;
+      return a;
    }
  };

  struct Vstore{
    //Float 
-    inline void operator()(float a, float* F){
-      
+    inline void operator()(u128f a, float* F){
+      memcpy(F,a.f,4*sizeof(float));
    }
    //Double
-    inline void operator()(double a, double* D){
-     
+    inline void operator()(u128d a, double* D){
+      memcpy(D,a.f,2*sizeof(double));
    }
    //Integer
    inline void operator()(int a, Integer* I){
-      
+      I[0] = a;
    }

  };

  struct Vstream{
    //Float
-    inline void operator()(float * a, float b){
-     
+    inline void operator()(float * a, u128f b){
+      memcpy(a,b.f,4*sizeof(float));
    }
    //Double
-    inline void operator()(double * a, double b){
-     
+    inline void operator()(double * a, u128d b){
+      memcpy(a,b.f,2*sizeof(double));
    }


@@ -107,24 +123,40 @@ namespace Optimization {

  struct Vset{
    // Complex float 
-    inline float operator()(Grid::ComplexF *a){
-      return 0;
+    inline u128f operator()(Grid::ComplexF *a){
+      u128f out; 
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      out.f[2] = a[1].real();
+      out.f[3] = a[1].imag();
+      return out;
    }
    // Complex double 
-    inline double operator()(Grid::ComplexD *a){
-      return 0;
+    inline u128d operator()(Grid::ComplexD *a){
+      u128d out; 
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      return out;
    }
    // Real float 
-    inline float operator()(float *a){
-      return  0;
+    inline u128f operator()(float *a){
+      u128f out; 
+      out.f[0] = a[0];
+      out.f[1] = a[1];
+      out.f[2] = a[2];
+      out.f[3] = a[3];
+      return out;
    }
    // Real double
-    inline double operator()(double *a){
-      return 0;
+    inline u128d operator()(double *a){
+      u128d out; 
+      out.f[0] = a[0];
+      out.f[1] = a[1];
+      return out;
    }
    // Integer
    inline int operator()(Integer *a){
-      return 0;
+      return a[0];
    }


@@ -146,129 +178,198 @@ namespace Optimization {
  /////////////////////////////////////////////////////
  struct Sum{
    //Complex/Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      out.f[2] = a.f[2] + b.f[2];
+      out.f[3] = a.f[3] + b.f[3];
+      return out;
    }
    //Complex/Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      return out;
    }
    //Integer
    inline int operator()(int a, int b){
-      return 0;
+      return a + b;
    }
  };

  struct Sub{
    //Complex/Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      out.f[2] = a.f[2] - b.f[2];
+      out.f[3] = a.f[3] - b.f[3];
+      return out;
    }
    //Complex/Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      return out;
    }
    //Integer
    inline int operator()(int a, int b){
-      return 0;
+      return a-b;
    }
  };

  struct MultComplex{
    // Complex float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
+      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
+      return out;
    }
    // Complex double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      return out;
    }
  };

  struct Mult{
-    inline float  mac(float a, float b,double c){
-      return 0;
-    }
-    inline double mac(double a, double b,double c){
-      return 0;
-    }
+    //CK: These mac overloads appear to be unneeded
+    // inline float  mac(float a, float b,double c){
+    //   return 0;
+    // }
+    // inline double mac(double a, double b,double c){
+    //   return 0;
+    // }
+
    // Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      out.f[2] = a.f[2]*b.f[2];
+      out.f[3] = a.f[3]*b.f[3];
+      return out;
    }
    // Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      return out;
    }
    // Integer
    inline int operator()(int a, int b){
-      return 0;
+      return a*b;
    }
  };

  struct Conj{
    // Complex single
-    inline float operator()(float in){
-      return 0;
+    inline u128f operator()(u128f in){
+      u128f out;
+      out.f[0] = in.f[0];
+      out.f[1] = -in.f[1];
+      out.f[2] = in.f[2];
+      out.f[3] = -in.f[3];
+      return out;
    }
    // Complex double
-    inline double operator()(double in){
-      return 0;
+    inline u128d operator()(u128d in){
+      u128d out;
+      out.f[0] = in.f[0];
+      out.f[1] = -in.f[1];
+      return out;
    }
    // do not define for integer input
  };

  struct TimesMinusI{
    //Complex single
-    inline float operator()(float in, float ret){
-      return 0;
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] = in.f[1];
+      out.f[1] = -in.f[0];
+      out.f[2] = in.f[3];
+      out.f[3] = -in.f[2];
+      return out;
    }
    //Complex double
-    inline double operator()(double in, double ret){
-      return 0;
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] = in.f[1];
+      out.f[1] = -in.f[0];
+      return out;
    }
-
-
  };

  struct TimesI{
    //Complex single
-    inline float operator()(float in, float ret){
-      return 0;
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] = -in.f[1];
+      out.f[1] = in.f[0];
+      out.f[2] = -in.f[3];
+      out.f[3] = in.f[2];
+      return out;
    }
    //Complex double
-    inline double operator()(double in, double ret){
-      return 0;
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] = -in.f[1];
+      out.f[1] = in.f[0];
+      return out;
    }
  };

  //////////////////////////////////////////////
  // Some Template specialization
  struct Permute{
-
-    static inline float Permute0(float in){
+    //We just have to mirror the permutes of Grid_sse4.h
+    static inline u128f Permute0(u128f in){ //AB CD -> CD AB
+      u128f out;
+      out.f[0] = in.f[2];
+      out.f[1] = in.f[3];
+      out.f[2] = in.f[0];
+      out.f[3] = in.f[1];
+      return out;
+    };
+    static inline u128f Permute1(u128f in){ //AB CD -> BA DC
+      u128f out;
+      out.f[0] = in.f[1];
+      out.f[1] = in.f[0];
+      out.f[2] = in.f[3];
+      out.f[3] = in.f[2];
+      return out;
+    };
+    static inline u128f Permute2(u128f in){
      return in;
    };
-    static inline float Permute1(float in){
-      return in;
-    };
-    static inline float Permute2(float in){
-      return in;
-    };
-    static inline float Permute3(float in){
+    static inline u128f Permute3(u128f in){
      return in;
    };

-    static inline double Permute0(double in){
+    static inline u128d Permute0(u128d in){ //AB -> BA
+      u128d out;
+      out.f[0] = in.f[1];
+      out.f[1] = in.f[0];
+      return out;      
+    };
+    static inline u128d Permute1(u128d in){
      return in;
    };
-    static inline double Permute1(double in){
+    static inline u128d Permute2(u128d in){
      return in;
    };
-    static inline double Permute2(double in){
-      return in;
-    };
-    static inline double Permute3(double in){
+    static inline u128d Permute3(u128d in){
      return in;
    };

@@ -280,26 +381,26 @@ namespace Optimization {

  //Complex float Reduce
  template<>
-  inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
-    return 0;
+  inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
+    return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
  }
  //Real float Reduce
  template<>
-  inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
-    return 0;
+  inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
+    return in.f[0] + in.f[1] + in.f[2] + in.f[3];
  }
  
  
  //Complex double Reduce
  template<>
-  inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
-    return 0;
+  inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
+    return Grid::ComplexD(in.f[0],in.f[1]);
  }
  
  //Real double Reduce
  template<>
-  inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
-    return 0;
+  inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
+    return in.f[0] + in.f[1];
  }

  //Integer Reduce
@@ -314,8 +415,8 @@ namespace Optimization {
 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 

-  typedef float SIMD_Ftype;  // Single precision type
-  typedef double SIMD_Dtype; // Double precision type
+  typedef Optimization::u128f SIMD_Ftype;  // Single precision type
+  typedef Optimization::u128d SIMD_Dtype; // Double precision type
  typedef int SIMD_Itype; // Integer type

  // prefetch utilities
--- a/lib/simd/Grid_imci.h
+++ b/lib/simd/Grid_imci.h
@@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 //----------------------------------------------------------------------

 #include <immintrin.h>
+#include <zmmintrin.h>

+namespace Grid{
 namespace Optimization {
  
  struct Vsplat{
@@ -316,6 +318,54 @@ namespace Optimization {

  };
 
+  struct Rotate{
+    // rotate(in,n): runtime n is dispatched to tRotate<n>, whose alignr count is a template constant.
+    static inline __m512 rotate(__m512 in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+
+      case 8 : return tRotate<8>(in);break;
+      case 9 : return tRotate<9>(in);break;
+      case 10: return tRotate<10>(in);break;
+      case 11: return tRotate<11>(in);break;
+      case 12: return tRotate<12>(in);break;
+      case 13: return tRotate<13>(in);break;
+      case 14: return tRotate<14>(in);break;
+      case 15: return tRotate<15>(in);break;
+      default: assert(0); return in; // with NDEBUG the assert vanishes: never fall off the end of a non-void function
+      }
+    }
+    static inline __m512d rotate(__m512d in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0); return in; // with NDEBUG the assert vanishes: never fall off the end of a non-void function
+      }
+    }
+    // tRotate<n>: alignr is given `in` as both operands, with n as the element count.
+    template<int n> static inline __m512 tRotate(__m512 in){ 
+      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);          
+    };
+    // The double-precision variant reuses the 32-bit alignr with a count of 2*n.
+    template<int n> static inline __m512d tRotate(__m512d in){ 
+      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);          
+    };
+
+  };
+


  //////////////////////////////////////////////
@@ -358,7 +408,7 @@ namespace Optimization {

 //////////////////////////////////////////////////////////////////////////////////////
 // Here assign types 
-namespace Grid {
+
  typedef __m512 SIMD_Ftype;  // Single precision type
  typedef __m512d SIMD_Dtype; // Double precision type
  typedef __m512i SIMD_Itype; // Integer type
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -267,10 +267,10 @@ namespace Optimization {
  struct Permute{

    static inline __m128 Permute0(__m128 in){
-      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
    };
    static inline __m128 Permute1(__m128 in){
-      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
    };
    static inline __m128 Permute2(__m128 in){
      return in;
@@ -279,7 +279,7 @@ namespace Optimization {
      return in;
    };

-    static inline __m128d Permute0(__m128d in){
+    static inline __m128d Permute0(__m128d in){ //AB -> BA
      return _mm_shuffle_pd(in,in,0x1);
    };
    static inline __m128d Permute1(__m128d in){
@@ -294,6 +294,32 @@ namespace Optimization {

  };

+  struct Rotate{
+    // Lane rotation of a whole SSE register; n is counted in elements of the vector type.
+    static inline __m128 rotate(__m128 in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);
+      case 1: return tRotate<1>(in);
+      case 2: return tRotate<2>(in);
+      case 3: return tRotate<3>(in);
+      default: assert(0); return in; // invalid n: traps in debug builds, stays well-defined under NDEBUG
+      }
+    }
+    static inline __m128d rotate(__m128d in,int n){ 
+      switch(n){
+      case 0: return tRotate<0>(in);
+      case 1: return tRotate<1>(in);
+      default: assert(0); return in; // invalid n: traps in debug builds, stays well-defined under NDEBUG
+      }
+    }
+    // Element-granular wrappers over the byte-granular alignr; arguments parenthesised for macro safety.
+#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8((a),(b),(((n)*4)%16))
+#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8((a),(b),(((n)*8)%16))
+    // Compile-time rotate: the register is paired with itself and shifted by n elements.
+    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
+    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
+
+  };
  //////////////////////////////////////////////
  // Some Template specialization

--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -299,16 +299,44 @@ namespace Grid {
    }
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
    {
-      if      (perm==3) permute3(y,b);
-      else if (perm==2) permute2(y,b);
-      else if (perm==1) permute1(y,b);
-      else if (perm==0) permute0(y,b);
+      // A set RotateBit selects a rotate; the low four bits of perm hold the distance.
+      if ( perm & RotateBit ) {
+        y = rotate(b, perm & 0xF);
+        return;
+      }
+      switch(perm){
+      case 0: permute0(y,b); break;
+      case 1: permute1(y,b); break;
+      case 2: permute2(y,b); break;
+      case 3: permute3(y,b); break;
+      default: assert(0);   // not a recognised permute level
+      }
    }
    
-
-    
  };// end of Grid_simd class definition 

+  ////////////////////////////////////////////////////////////////////
+  // General rotate
+  ////////////////////////////////////////////////////////////////////
+  template <class S, class V, IfNotComplex<S> =0> 
+  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
+  {
+    // Real lanes: wrap the distance to the lane count, then use the arch-specific rotate.
+    typedef Grid_simd<S,V> simd_t;
+    simd_t rotated;
+    rotated.v = Optimization::Rotate::rotate(b.v, nrot % simd_t::Nsimd());
+    return rotated;
+  }
+  template <class S, class V, IfComplex<S> =0> 
+  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
+  {
+    // Complex lanes: wrap to the lane count; the arch-specific rotate is handed twice that distance.
+    typedef Grid_simd<S,V> simd_t;
+    simd_t rotated;
+    rotated.v = Optimization::Rotate::rotate(b.v, 2*(nrot % simd_t::Nsimd()));
+    return rotated;
+  }
+
  ///////////////////////
  // Splat
  ///////////////////////
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -44,7 +44,7 @@ template<class vsimd,class scalar>
 inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y, 
 		    std::vector<scalar *> &extracted,int offset){
  // FIXME: bounce off memory is painful
-  static const int Nsimd=vsimd::Nsimd();
+  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
  int Nextr=extracted.size();
  int s=Nsimd/Nextr;

@@ -59,7 +59,9 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y, 
 		  std::vector<scalar *> &extracted,int offset){
-  static const int Nsimd=vsimd::Nsimd();
+
+  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
+
  int Nextr=extracted.size();
  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
                     // replicate n-fold. Use to allow Integer masks to 
@@ -127,7 +129,7 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

-  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
  static const int words=sizeof(vobj)/sizeof(vector_type);
  int Nextr=extracted.size();
  int s=Nsimd/Nextr;
@@ -174,7 +176,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;
  
-  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
  static const int words=sizeof(vobj)/sizeof(vector_type);

  int Nextr = extracted.size();
@@ -199,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;
  
-  const int Nsimd=vobj::vector_type::Nsimd();
+  const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
  const int words=sizeof(vobj)/sizeof(vector_type);

  int Nextr=extracted.size();
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,5 +1,5 @@

-bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 
+bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd 


 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
@@ -58,6 +58,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 Test_cshift_red_black_LDADD=-lGrid


+Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
+Test_cshift_red_black_rotate_LDADD=-lGrid
+
+
+Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
+Test_cshift_rotate_LDADD=-lGrid
+
+
 Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
 Test_dwf_cg_prec_LDADD=-lGrid

@@ -98,6 +106,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 Test_dwf_lanczos_LDADD=-lGrid


+Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
+Test_dwf_rb5d_LDADD=-lGrid
+
+
 Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid

--- a/tests/Test_cshift_red_black_rotate.cc
+++ b/tests/Test_cshift_red_black_rotate.cc
@@ -0,0 +1,223 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_cshift_red_black_rotate.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  int Nd = latt_size.size();
+  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  std::vector<int> mask(Nd,1);
+  mask[0]=0;
+
+  GridCartesian         Fine  (latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);
+
+  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
+
+  LatticeComplex U(&Fine);
+  LatticeComplex ShiftU(&Fine);
+  LatticeComplex rbShiftU(&Fine);
+  LatticeComplex Ue(&RBFine); 
+  LatticeComplex Uo(&RBFine);
+  LatticeComplex ShiftUe(&RBFine);
+  LatticeComplex ShiftUo(&RBFine);
+  LatticeComplex lex(&Fine);
+  lex=zero;
+  Integer stride =1;
+  {
+    // Build U = sum_d coor[d]*stride_d, the value the checks below recompute as `slex`.
+    LatticeComplex coor(&Fine);
+
+    for(int d=0;d<Nd;d++){
+      //      Integer i=10000;
+      Integer i=0;
+      LatticeCoordinate(coor,d);
+      lex = lex + coor*stride+i;
+      stride=stride*latt_size[d];
+    }
+    U=lex;
+  }
+
+  pickCheckerboard(Even,Ue,U);
+  pickCheckerboard(Odd,Uo,U);
+
+  //  std::cout<<GridLogMessage << U<<std::endl;
+  std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
+  std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;
+
+
+  TComplex cm;
+  TComplex cmeo;
+  for(int dir=0;dir<Nd;dir++){
+    //    if ( dir!=1 ) continue;
+    for(int shift=0;shift<latt_size[dir];shift++){
+
+      std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
+
+      std::cout<<GridLogMessage<<"Even grid"<<std::endl;
+      ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
+      std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
+
+      std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
+      ShiftUo = Cshift(Uo,dir,shift);    
+      std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
+
+      std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
+      setCheckerboard(rbShiftU,ShiftUe);
+      setCheckerboard(rbShiftU,ShiftUo);
+      std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
+
+      std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
+      ShiftU  = Cshift(U,dir,shift);    // Shift everything
+      std::cout<<GridLogMessage << "\tShiftU " <<norm2(ShiftU)<<std::endl; // norm of the full-grid shift itself
+
+      std::vector<int> coor(4);
+
+      std::cout<<GridLogMessage << "Checking the non-checkerboard shift"<<std::endl;
+      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
+      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
+      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
+      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
+
+        peekSite(cm,ShiftU,coor);
+
+        /////////	  double nrm=norm2(U);
+
+        std::vector<int> scoor(coor);
+        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
+
+        Integer slex = scoor[0]
+          + latt_size[0]*scoor[1]
+          + latt_size[0]*latt_size[1]*scoor[2]
+          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
+
+        Complex scm(slex);
+
+        double nrm = abs(scm-cm()()());
+        std::vector<int> peer(4);
+        Complex ctmp = cm;
+        Integer index=real(ctmp);
+        Lexicographic::CoorFromIndex(peer,index,latt_size);
+
+        if (nrm > 0){
+          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
+                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
+                   << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
+          std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+          index=real(scm);
+          Lexicographic::CoorFromIndex(peer,index,latt_size);
+          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+          exit(-1);
+        }
+      }}}}
+
+      int exx=0;
+      std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
+      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
+      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
+      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
+      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
+
+        peekSite(cm,rbShiftU,coor);
+
+        Integer checkerboard = RBFine.CheckerBoard(coor);
+
+        //	  std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
+        //	  std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
+        //	  std::cout << "Uo "   << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
+        if ( checkerboard == ShiftUo.checkerboard ) {
+          peekSite(cmeo,ShiftUo,coor);
+        } else { 
+          peekSite(cmeo,ShiftUe,coor);
+        }
+
+
+        std::vector<int> scoor(coor);
+        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
+
+        Integer slex = scoor[0]
+          + latt_size[0]*scoor[1]
+          + latt_size[0]*latt_size[1]*scoor[2]
+          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
+
+        Complex scm(slex);
+
+        std::vector<int> peer(4);
+        Complex ctmp=cmeo;
+        Integer index=real(ctmp);
+        Lexicographic::CoorFromIndex(peer,index,latt_size);
+
+        double nrm = abs(cmeo()()()-scm);
+        if (nrm != 0) {
+          std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
+                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
+                   << cmeo()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
+          std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+          index=real(scm);
+          Lexicographic::CoorFromIndex(peer,index,latt_size);
+          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+          exx=1;
+
+        }
+
+        ctmp=cm;
+        index=real(ctmp);
+        Lexicographic::CoorFromIndex(peer,index,latt_size); // refresh peer so "Got" below reports cm's site
+        nrm = abs(scm-cm()()());
+        if (nrm > 0){
+          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
+                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
+                   << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
+          std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+          index=real(scm);
+          Lexicographic::CoorFromIndex(peer,index,latt_size);
+          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+          exx=1;
+        } else if (1) { 
+          std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
+                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
+                   << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
+        }
+      }}}}
+      if (exx) exit(-1);
+
+    }
+  }
+
+  Grid_finalize();
+}
--- a/tests/Test_cshift_rotate.cc
+++ b/tests/Test_cshift_rotate.cc
@@ -0,0 +1,125 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_cshift_rotate.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size   = GridDefaultLatt();
+  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  GridCartesian        Fine(latt_size,simd_layout,mpi_layout);
+
+  GridParallelRNG      FineRNG(&Fine);  FineRNG.SeedRandomDevice();
+
+  LatticeComplex U(&Fine);
+  LatticeComplex ShiftU(&Fine);
+
+  LatticeComplex lex(&Fine);
+  lex=zero;
+  Integer stride =1;
+  {
+    // Build U = sum_d coor[d]*stride_d, the value the check below recompute as `slex`.
+    LatticeComplex coor(&Fine);
+
+    for(int d=0;d<4;d++){
+      LatticeCoordinate(coor,d);
+      lex = lex + coor*stride;
+      stride=stride*latt_size[d];
+    }
+    U=lex;
+  }
+
+
+  TComplex cm;
+  
+  for(int dir=0;dir<4;dir++){
+    for(int shift=0;shift<latt_size[dir];shift++){
+      if ( Fine.IsBoss() ) 
+        std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
+
+      ShiftU  = Cshift(U,dir,shift);    // Shift everything
+
+      /*
+	std::cout << "U[0]" << U[0]<<std::endl;
+	std::cout << "U[1]" << U[1]<<std::endl;
+	std::cout << "ShiftU[0]" << ShiftU[0]<<std::endl;
+	std::cout << "ShiftU[1]" << ShiftU[1]<<std::endl;
+      */
+      std::vector<int> coor(4);
+
+      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
+      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
+      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
+      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
+
+        peekSite(cm,ShiftU,coor);
+
+        // Expected value: the index of the site `shift` steps along `dir`, with wrap-around.
+
+        std::vector<int> scoor(coor);
+        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];
+
+        Integer slex = scoor[0]
+          + latt_size[0]*scoor[1]
+          + latt_size[0]*latt_size[1]*scoor[2]
+          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];
+
+        Complex scm(slex);
+
+        double nrm = abs(scm-cm()()());   // per-site mismatch; no whole-lattice norm needed here
+        std::vector<int> peer(4);
+        Complex tmp  =cm;
+        Integer index=real(tmp);
+        Lexicographic::CoorFromIndex(peer,index,latt_size);
+
+        if (nrm > 0){
+          std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
+          std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+          index=real(scm);
+          Lexicographic::CoorFromIndex(peer,index,latt_size);
+          std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+        }
+        /*
+	  else {
+	    std::cerr<<"PASS shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
+	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
+	  }
+        */
+      }}}}
+    }
+  }
+
+  Grid_finalize();
+}
--- a/tests/Test_dwf_rb5d.cc
+++ b/tests/Test_dwf_rb5d.cc
@@ -0,0 +1,138 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_dwf_rb5d.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+// Thin wrapper exposing one public member, `internal`, of the template type.
+template<class value_t> struct scal {
+  value_t internal;
+};
+
+  // Gamma-matrix identifiers for the four directions,
+  // stored in X, Y, Z, T order.
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX, Gamma::GammaY,
+    Gamma::GammaZ, Gamma::GammaT
+  };
+
+typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+
+typedef WilsonFermion5D<WilsonImplR> WilsonFermion5D_OKR;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+
+  const int Ls=32;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+
+  LatticeFermion src   (FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  // `src` is the random input and `result` receives the output of Dw.Dhop.
+  // Both are copied site by site further down onto the grid used by the
+  // reference operator, so that the two Dhop results can be subtracted.
+
+  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+
+  // Only one non-zero (y)
+  /*
+  Umu=zero;
+  for(int nn=0;nn<Nd;nn++){
+    random(RNG4,U[nn]);
+    PokeIndex<LorentzIndex>(Umu,U[nn],nn);
+  }
+  */
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  typename WilsonFermion5DR::ImplParams params;
+
+  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params);
+
+  Dw.Dhop(src,result,0);
+
+  std::cout << "Norm src = "<<norm2(src)<<" Norm res = "<<norm2(result) << std::endl;
+
+
+
+  GridCartesian         * FokGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FokrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  WilsonFermion5D_OKR Dok(Umu,*FokGrid,*FokrbGrid,*UGrid,*UrbGrid,M5,params);
+  
+  LatticeFermion src_ok   (FokGrid);
+  LatticeFermion ref_ok(FokGrid); 
+  LatticeFermion result_ok(FokGrid);
+  
+  // Copy the input and the Dw result onto the reference grid, one local site at a time.
+  for(int lidx=0;lidx<FGrid->lSites();lidx++){
+    std::vector<int> lcoor;
+    FGrid->LocalIndexToLocalCoor(lidx,lcoor);
+    
+    SpinColourVector siteSrc;
+
+    peekLocalSite(siteSrc,src,lcoor);
+    pokeLocalSite(siteSrc,src_ok,lcoor);
+
+    peekLocalSite(siteSrc,result,lcoor);
+    pokeLocalSite(siteSrc,result_ok,lcoor);
+  }
+  
+  Dok.Dhop(src_ok,ref_ok,0);
+  
+  std::cout << "Reference = "<<norm2(src_ok)<<" res = "<<norm2(ref_ok) << std::endl;
+  ref_ok = ref_ok - result_ok;
+  std::cout << "Result norm = "<<norm2(result_ok)<< std::endl;   // norm of the copied Dw result, not a difference
+  std::cout << "Reference diff = "<<norm2(ref_ok)<< std::endl;
+
+  Grid_finalize();
+}
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -69,7 +69,6 @@ public:
  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesI(i1);}
  std::string name(void) const { return std::string("timesI"); }
 };
-
 class funcTimesMinusI {
 public:
  funcTimesMinusI() {};
@@ -97,6 +96,7 @@ public:
 //  zeroit
 //  permute

+
 class funcReduce {
 public:
  funcReduce() {};
@@ -208,6 +208,100 @@ void ReductionTester(const functor &func)



+// Functor pairing the SIMD permute at level n with a scalar reference model.
+class funcPermute {
+public:
+  int n;   // permute level handed to permute()
+  funcPermute(int _n) : n(_n) {}
+  template<class vec>    void operator()(vec &rr,vec &i1,vec &i2) const { permute(rr,i1,n);}
+  // Reference: output element i is the input element at index i XOR (size>>(n+1)).
+  template<class scal>   void apply(std::vector<scal> &rr,std::vector<scal> &in)  const { 
+    const int count = in.size();
+    const int partner = count>>(n+1);
+    for(int idx=0;idx<count;idx++) rr[idx] = in[ idx^partner ];
+  }
+  std::string name(void) const { return std::string("Permute"); }
+};
+// Functor pairing the SIMD rotate by n with a scalar reference model.
+class funcRotate {
+public:
+  int n;   // rotation distance handed to rotate()
+  funcRotate(int _n) : n(_n) {}
+  template<class vec>    void operator()(vec &rr,vec &i1,vec &i2) const { rr=rotate(i1,n);}
+  // Reference: output element i is input element (i+n) modulo the vector length.
+  template<class scal>   void apply(std::vector<scal> &rr,std::vector<scal> &in)  const { 
+    const int count = in.size();
+    for(int idx=0;idx<count;idx++) rr[idx] = in[(idx+n)%count];
+  }
+  std::string name(void) const { return std::string("Rotate"); }
+};
+
+
+template<class scal, class vec,class functor > 
+void PermTester(const functor &func)
+{
+  // Checks a SIMD shuffle functor against its scalar reference model on random data.
+  GridSerialRNG          rng;
+  rng.SeedRandomDevice();
+  const int lanes = vec::Nsimd();
+
+  std::vector<scal> input1(lanes);
+  std::vector<scal> input2(lanes);
+  std::vector<scal> result(lanes);
+  std::vector<scal> reference(lanes);
+
+  std::vector<vec,alignedAllocator<vec> > buf(3);
+  vec & v_input1 = buf[0];
+  vec & v_input2 = buf[1];
+  vec & v_result = buf[2];
+
+  for(int lane=0;lane<lanes;lane++){
+    random(rng,input1[lane]);
+    random(rng,input2[lane]);
+    random(rng,result[lane]);
+  }
+
+  merge<vec,scal>(v_input1,input1);
+  merge<vec,scal>(v_input2,input2);
+  merge<vec,scal>(v_result,result);
+
+  // Vector implementation first, then the scalar model on the same first input.
+  func(v_result,v_input1,v_input2);
+  func.apply(reference,input1);
+
+  extract<vec,scal>(v_result,result);
+  std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;
+
+  // Optional dump of input, SIMD result and reference; compiled in but disabled.
+  const bool dump = false;
+  if (dump) {
+    std::cout<<GridLogMessage<< "*****" << std::endl;
+    for(int lane=0;lane<lanes;lane++) std::cout<< input1[lane]<<" ";
+    std::cout <<std::endl; 
+    for(int lane=0;lane<lanes;lane++) std::cout<< result[lane]<<" ";
+    std::cout <<std::endl; 
+    for(int lane=0;lane<lanes;lane++) std::cout<< reference[lane]<<" ";
+    std::cout <<std::endl; 
+    std::cout<<GridLogMessage<< "*****" << std::endl;
+  }
+
+  // Count lanes whose SIMD result differs from the reference by more than 1.0e-7.
+  int mismatches = 0;
+  for(int lane=0;lane<lanes;lane++){
+    const auto delta = abs(reference[lane]-result[lane]);
+    if ( !(delta>1.0e-7) ) continue;
+    std::cout<<GridLogMessage<< "*****" << std::endl;      
+    std::cout<<GridLogMessage<< "["<<lane<<"] "<< delta << " " <<reference[lane]<< " " << result[lane]<<std::endl;
+    mismatches++;
+  }
+
+  if ( mismatches==0 ) {
+    std::cout<<GridLogMessage << " OK!" <<std::endl;
+  }
+
+  assert(mismatches==0);
+}
+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -235,6 +329,24 @@ int main (int argc, char ** argv)
  Tester<RealF,vRealF>(funcInnerProduct());
  ReductionTester<RealF,RealF,vRealF>(funcReduce());

+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealF permutes "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+
+  // Log2 iteration
+  for(int i=0;(1<<i)< vRealF::Nsimd();i++){
+    PermTester<RealF,vRealF>(funcPermute(i));
+  }
+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealF rotate "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  for(int r=0;r<vRealF::Nsimd();r++){
+    PermTester<RealF,vRealF>(funcRotate(r));
+  }
+
+
  std::cout << GridLogMessage <<"==================================="<<  std::endl;
  std::cout << GridLogMessage <<"Testing vRealD "<<std::endl;
  std::cout << GridLogMessage <<"==================================="<<  std::endl;
@@ -247,6 +359,25 @@ int main (int argc, char ** argv)
  Tester<RealD,vRealD>(funcInnerProduct());
  ReductionTester<RealD,RealD,vRealD>(funcReduce());

+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealD permutes "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+
+  // Log2 iteration
+  for(int i=0;(1<<i)< vRealD::Nsimd();i++){
+    PermTester<RealD,vRealD>(funcPermute(i));
+  }
+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealD rotate "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  for(int r=0;r<vRealD::Nsimd();r++){
+    PermTester<RealD,vRealD>(funcRotate(r));
+  }
+
+
+
  std::cout << GridLogMessage <<"==================================="<<  std::endl;
  std::cout << GridLogMessage <<"Testing vComplexF "<<std::endl;
  std::cout << GridLogMessage <<"==================================="<<  std::endl;
@@ -261,6 +392,23 @@ int main (int argc, char ** argv)
  Tester<ComplexF,vComplexF>(funcInnerProduct());
  ReductionTester<ComplexF,ComplexF,vComplexF>(funcReduce());

+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexF permutes "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+
+  // Log2 iteration
+  for(int i=0;(1<<i)< vComplexF::Nsimd();i++){
+    PermTester<ComplexF,vComplexF>(funcPermute(i));
+  }
+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexF rotate "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  for(int r=0;r<vComplexF::Nsimd();r++){
+    PermTester<ComplexF,vComplexF>(funcRotate(r));
+  }
+
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
  std::cout<<GridLogMessage << "Testing vComplexD "<<std::endl;
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
@@ -276,5 +424,23 @@ int main (int argc, char ** argv)
  Tester<ComplexD,vComplexD>(funcInnerProduct());
  ReductionTester<ComplexD,ComplexD,vComplexD>(funcReduce());

+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexD permutes "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+
+  // Log2 iteration
+  for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
+    PermTester<ComplexD,vComplexD>(funcPermute(i));
+  }
+
+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexD rotate "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  for(int r=0;r<vComplexD::Nsimd();r++){
+    PermTester<ComplexD,vComplexD>(funcRotate(r));
+  }
+
  Grid_finalize();
 }