From 9f280b82c4e56d8b32034dfbb83187e15add41e9 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Tue, 25 Jul 2017 11:30:41 -0400
Subject: [PATCH 01/14] Added mixed-precision CG with reliable updates

---
 lib/algorithms/Algorithms.h                   |   1 +
 .../ConjugateGradientReliableUpdate.h         | 231 ++++++++++++++++++
 tests/Test_dwf_mixedcg_prec_halfcomms.cc      |  40 ++-
 3 files changed, 260 insertions(+), 12 deletions(-)
 create mode 100644 lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
diff --git a/lib/algorithms/Algorithms.h b/lib/algorithms/Algorithms.h
index 5123c7a1..f8dc2dc2 100644
--- a/lib/algorithms/Algorithms.h
+++ b/lib/algorithms/Algorithms.h
@@ -44,6 +44,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 
 // Lanczos support
 //#include <Grid/algorithms/iterative/MatrixUtils.h>
diff --git a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
new file mode 100644
index 00000000..1aab064d
--- /dev/null
+++ b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
@@ -0,0 +1,231 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
+#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
+
+namespace Grid {
+
+  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
+  class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
+  public:
+    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+    // Defaults true.
+    RealD Tolerance;
+    Integer MaxIterations;
+    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
+    Integer ReliableUpdatesPerformed;
+
+    bool DoFinalCleanup; //Final DP cleanup, defaults to true
+    Integer IterationsToCleanup; //Final DP cleanup step iterations
+    
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+    GridBase* SinglePrecGrid;
+    RealD Delta; //reliable update parameter
+    
+    ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
+      : Tolerance(tol),
+        MaxIterations(maxit),
+	Delta(_delta),
+	Linop_f(_Linop_f),
+	Linop_d(_Linop_d),
+	SinglePrecGrid(_sp_grid),
+        ErrorOnNoConverge(err_on_no_conv),
+	DoFinalCleanup(true)
+    {};
+
+    void operator()(const FieldD &src, FieldD &psi) {
+      psi.checkerboard = src.checkerboard;
+      conformable(psi, src);
+
+      RealD cp, c, a, d, b, ssq, qq, b_pred;
+
+      FieldD p(src);
+      FieldD mmp(src);
+      FieldD r(src);
+
+      // Initial residual computation & set up
+      RealD guess = norm2(psi);
+      assert(std::isnan(guess) == 0);
+    
+      Linop_d.HermOpAndNorm(psi, mmp, d, b);
+    
+      r = src - mmp;
+      p = r;
+
+      a = norm2(p);
+      cp = a;
+      ssq = norm2(src);
+
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl;
+      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl;
+
+      RealD rsq = Tolerance * Tolerance * ssq;
+
+      // Check if guess is really REALLY good :)
+      if (cp <= rsq) {
+	return;
+      }
+
+      //Single prec initialization
+      FieldF r_f(SinglePrecGrid);
+      r_f.checkerboard = r.checkerboard;
+      precisionChange(r_f, r);
+
+      FieldF psi_f(r_f);
+      psi_f = zero;
+
+      FieldF p_f(r_f);
+      FieldF mmp_f(r_f);
+
+      RealD MaxResidSinceLastRelUp = cp; //initial residual    
+    
+      std::cout << GridLogIterative << std::setprecision(4)
+		<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
+
+      GridStopWatch LinalgTimer;
+      GridStopWatch MatrixTimer;
+      GridStopWatch SolverTimer;
+
+      SolverTimer.Start();
+      int k = 0;
+      int l = 0;
+    
+      for (k = 1; k <= MaxIterations; k++) {
+	c = cp;
+
+	MatrixTimer.Start();
+	Linop_f.HermOpAndNorm(p_f, mmp_f, d, qq);
+	MatrixTimer.Stop();
+
+	LinalgTimer.Start();
+
+	a = c / d;
+	b_pred = a * (a * qq - d) / c;
+
+	cp = axpy_norm(r_f, -a, mmp_f, r_f);
+	b = cp / c;
+
+	// Fuse these loops ; should be really easy
+	psi_f = a * p_f + psi_f;
+	//p_f = p_f * b + r_f;
+
+	LinalgTimer.Stop();
+
+	std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
+		  << " residual " << cp << " target " << rsq << std::endl;
+	std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
+	std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;
+
+	if(cp > MaxResidSinceLastRelUp){
+	  std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
+	  MaxResidSinceLastRelUp = cp;
+	}
+	  
+	// Stopping condition
+	if (cp <= rsq) {
+	  //Although not written in the paper, I assume that I have to add on the final solution
+	  precisionChange(mmp, psi_f);
+	  psi = psi + mmp;
+	
+	
+	  SolverTimer.Stop();
+	  Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+	  p = mmp - src;
+
+	  RealD srcnorm = sqrt(norm2(src));
+	  RealD resnorm = sqrt(norm2(p));
+	  RealD true_residual = resnorm / srcnorm;
+
+	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
+	  std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
+	  std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
+	  std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
+
+	  std::cout << GridLogMessage << "Time breakdown "<<std::endl;
+	  std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
+	  std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
+	  std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
+
+	  IterationsToComplete = k;	
+	  ReliableUpdatesPerformed = l;
+	  
+	  if(DoFinalCleanup){
+	    //Do a final CG to cleanup
+	    std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
+	    ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
+	    CG.ErrorOnNoConverge = ErrorOnNoConverge;
+	    CG(Linop_d,src,psi);
+	    IterationsToCleanup = CG.IterationsToComplete;
+	  }
+	  else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+
+	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
+	  return;
+	}
+	else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
+	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
+		    << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
+	  precisionChange(mmp, psi_f);
+	  psi = psi + mmp;
+
+	  Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+	  r = src - mmp;
+
+	  psi_f = zero;
+	  precisionChange(r_f, r);
+	  cp = norm2(r);
+	  MaxResidSinceLastRelUp = cp;
+
+	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
+	  
+	  l = l+1;
+	}
+
+	p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
+	
+      }
+      std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
+		<< std::endl;
+      
+      if (ErrorOnNoConverge) assert(0);
+      IterationsToComplete = k;
+      ReliableUpdatesPerformed = l;      
+    }    
+  };
+
+
+};
+
+
+
+#endif
diff --git a/tests/Test_dwf_mixedcg_prec_halfcomms.cc b/tests/Test_dwf_mixedcg_prec_halfcomms.cc
index 9cc935d9..d6aaa21e 100644
--- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc
+++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc
@@ -80,31 +80,47 @@ int main (int argc, char ** argv)
 
 
   LatticeFermionD    src_o(FrbGrid);
-  LatticeFermionD result_o(FrbGrid);
-  LatticeFermionD result_o_2(FrbGrid);
+  LatticeFermionD result_cg(FrbGrid);
   pickCheckerboard(Odd,src_o,src);
-  result_o.checkerboard = Odd;
-  result_o = zero;
-  result_o_2.checkerboard = Odd;
-  result_o_2 = zero;
+  result_cg.checkerboard = Odd;
+  result_cg = zero;
+  LatticeFermionD result_mcg(result_cg);
+  LatticeFermionD result_rlcg(result_cg);
 
   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
   SchurDiagMooeeOperator<DomainWallFermionFH,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
+  //#define DO_MIXED_CG
+#define DO_RLUP_CG
+
+#ifdef DO_MIXED_CG
   std::cout << "Starting mixed CG" << std::endl;
   MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
   mCG.InnerTolerance = 3.0e-5;
-  mCG(src_o,result_o);
+  mCG(src_o,result_mcg);
+#endif
 
+#ifdef DO_RLUP_CG
+  std::cout << "Starting reliable update CG" << std::endl;
+  ConjugateGradientReliableUpdate<LatticeFermionD,LatticeFermionF> rlCG(1.e-8, 10000, 0.1, FrbGrid_f, HermOpEO_f, HermOpEO);
+  rlCG(src_o,result_rlcg);
+#endif
+  
   std::cout << "Starting regular CG" << std::endl;
   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  CG(HermOpEO,src_o,result_o_2);
+  CG(HermOpEO,src_o,result_cg);
 
-  LatticeFermionD diff_o(FrbGrid);
-  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
-
-  std::cout << "Diff between mixed and regular CG: " << diff << std::endl;
+#ifdef DO_MIXED_CG
+  LatticeFermionD diff_mcg(FrbGrid);
+  RealD vdiff_mcg = axpy_norm(diff_mcg, -1.0, result_cg, result_mcg);
+  std::cout << "Diff between mixed and regular CG: " << vdiff_mcg << std::endl;
+#endif
 
+#ifdef DO_RLUP_CG
+  LatticeFermionD diff_rlcg(FrbGrid);
+  RealD vdiff_rlcg = axpy_norm(diff_rlcg, -1.0, result_cg, result_rlcg);
+  std::cout << "Diff between reliable update and regular CG: " << vdiff_rlcg << std::endl;
+#endif
   
   Grid_finalize();
 }

From 9939b267d26f13cf5c152705741ea7655badece9 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Mon, 31 Jul 2017 13:39:44 -0400
Subject: [PATCH 02/14] Added switching to fallback linear operator in reliable
 update CG, and added recalculation of b parameter on update.

---
 .../ConjugateGradientReliableUpdate.h         | 29 +++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
index 1aab064d..13666f97 100644
--- a/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
+++ b/lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
@@ -47,6 +47,11 @@ namespace Grid {
     LinearOperatorBase<FieldD> &Linop_d;
     GridBase* SinglePrecGrid;
     RealD Delta; //reliable update parameter
+
+    //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
+    LinearOperatorBase<FieldF> *Linop_fallback;
+    RealD fallback_transition_tol;
+
     
     ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
       : Tolerance(tol),
@@ -56,10 +61,19 @@ namespace Grid {
 	Linop_d(_Linop_d),
 	SinglePrecGrid(_sp_grid),
         ErrorOnNoConverge(err_on_no_conv),
-	DoFinalCleanup(true)
+	DoFinalCleanup(true),
+	Linop_fallback(NULL)
     {};
 
+    void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
+      Linop_fallback = &_Linop_fallback;
+      fallback_transition_tol = _fallback_transition_tol;      
+    }
+    
     void operator()(const FieldD &src, FieldD &psi) {
+      LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
+      bool using_fallback = false;
+      
       psi.checkerboard = src.checkerboard;
       conformable(psi, src);
 
@@ -93,6 +107,8 @@ namespace Grid {
 
       // Check if guess is really REALLY good :)
       if (cp <= rsq) {
+	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
+	std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
 	return;
       }
 
@@ -124,7 +140,7 @@ namespace Grid {
 	c = cp;
 
 	MatrixTimer.Start();
-	Linop_f.HermOpAndNorm(p_f, mmp_f, d, qq);
+	Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
 	MatrixTimer.Stop();
 
 	LinalgTimer.Start();
@@ -206,12 +222,21 @@ namespace Grid {
 	  cp = norm2(r);
 	  MaxResidSinceLastRelUp = cp;
 
+	  b = cp/c;
+	  
 	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
 	  
 	  l = l+1;
 	}
 
 	p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
+
+	if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
+	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
+	  Linop_f_use = Linop_fallback;
+	  using_fallback = true;
+	}
+
 	
       }
       std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"

From ab50145001d7a2f27c9f724440e0bfd88a6373e2 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Tue, 22 Aug 2017 17:12:25 -0400
Subject: [PATCH 03/14] Implemented first, unoptimized version of hand-unrolled
 G-parity kernels Improved Test_gparity

---
 lib/qcd/action/fermion/FermionOperatorImpl.h |  81 +++-
 lib/qcd/action/fermion/WilsonKernelsHand.cc  | 456 +++++++++++++------
 tests/core/Test_gparity.cc                   | 148 +++---
 3 files changed, 485 insertions(+), 200 deletions(-)

diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 524179f5..1e344521 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -425,6 +425,22 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
     ////////////////////////////////////////////////////////////////////////////////////////
     // Flavour doubled spinors; is Gparity the only? what about C*?
     ////////////////////////////////////////////////////////////////////////////////////////
+namespace GparityWilsonImpl_helper{
+  template<typename Get, typename A,typename B>
+  struct getAB;
+  
+  template<typename A,typename B>
+  struct getAB<A,A,B>{
+    static inline A & ref(A &a, B &b){ return a; }
+  };
+  template<typename A,typename B>
+  struct getAB<B,A,B>{
+    static inline B & ref(A &a, B &b){ return b; }
+  };
+};
+  
+
+  
 template <class S, int Nrepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
  public:
@@ -462,7 +478,10 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
       
  ImplParams Params;
 
- GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
+ std::vector<SiteSpinor, alignedAllocator<SiteSpinor> > tmp_full;
+ std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > tmp_half;
+
+ GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p), tmp_full(GridThread::GetThreads()), tmp_half(GridThread::GetThreads()){};
 
  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
 
@@ -538,6 +557,66 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
    
  }
 
+
+ template <class ref>
+ inline void loadLinkElement(Simd &reg, ref &memory) {
+   reg = memory;
+ }
+
+ template<typename SiteSpinorType>
+ void GparityTwistPermute(SiteSpinorType &into,  const SiteSpinorType &from, const int direction, const int distance, const int perm, GridBase* grid){
+   typedef typename SiteSpinorType::scalar_object sobj;
+   sobj stmp;
+   std::vector<sobj> vals(grid->Nsimd());
+   extract(from,vals);
+   std::vector<int> icoor;
+   for(int s=0;s<grid->Nsimd();s++){
+     grid->iCoorFromIindex(icoor,s);
+     assert((icoor[direction]==0)||(icoor[direction]==1));
+
+     int permute_lane;
+     if ( distance == 1) {
+       permute_lane = icoor[direction]?1:0;
+     } else {
+       permute_lane = icoor[direction]?0:1;
+     }
+     if(perm) permute_lane = !permute_lane;                                                                                                                                                                                                
+
+     if ( permute_lane ) {
+       stmp(0) = vals[s](1);
+       stmp(1) = vals[s](0);
+       vals[s] = stmp;
+     }
+   }
+   merge(into,vals);
+ }
+
+
+ template<typename SiteSpinorType>
+ const SiteSpinorType & GparityGetChi(int &g, SiteSpinorType const* in, const int dir, const int f, StencilEntry *SE, StencilImpl &st){
+   const int mmu = dir % 4;
+   const int direction = st._directions[dir];
+   const int sl = st._grid->_simd_layout[direction];
+   const int perm = SE->_permute;
+   g = f;
+   
+   if(SE->_around_the_world && Params.twists[mmu]){
+     if(sl == 1){ //not SIMD vectorized in G-parity direction so just change the flavor index accessed to implement the twist
+       g = (f+1) % 2;
+       return in[SE->_offset];
+     }else{ //SIMD vectorized in Gparity direction
+       const int me = omp_get_thread_num();
+       const int distance = st._distances[dir];
+       assert(distance == -1 || distance == 1);
+       SiteSpinorType &tmp = GparityWilsonImpl_helper::getAB<SiteSpinorType, SiteSpinor, SiteHalfSpinor>::ref(tmp_full[me], tmp_half[me]);
+       GparityTwistPermute<SiteSpinorType>(tmp, in[SE->_offset], direction, distance, perm, st._grid);
+       return tmp;
+     }
+   }else return in[SE->_offset];
+ }
+
+
+ 
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds._grid,GaugeGrid);
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index 96b8ab0a..866e30d2 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -30,60 +30,77 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #define REGISTER
 
-#define LOAD_CHIMU \
-  {const SiteSpinor & ref (in._odata[offset]);	\
-    Chimu_00=ref()(0)(0);\
-    Chimu_01=ref()(0)(1);\
-    Chimu_02=ref()(0)(2);\
-    Chimu_10=ref()(1)(0);\
-    Chimu_11=ref()(1)(1);\
-    Chimu_12=ref()(1)(2);\
-    Chimu_20=ref()(2)(0);\
-    Chimu_21=ref()(2)(1);\
-    Chimu_22=ref()(2)(2);\
-    Chimu_30=ref()(3)(0);\
-    Chimu_31=ref()(3)(1);\
-    Chimu_32=ref()(3)(2);}
+#define LOAD_CHIMU_BODY(F)			\
+  Chimu_00=ref(F)(0)(0);			\
+  Chimu_01=ref(F)(0)(1);			\
+  Chimu_02=ref(F)(0)(2);			\
+  Chimu_10=ref(F)(1)(0);			\
+  Chimu_11=ref(F)(1)(1);			\
+  Chimu_12=ref(F)(1)(2);			\
+  Chimu_20=ref(F)(2)(0);			\
+  Chimu_21=ref(F)(2)(1);			\
+  Chimu_22=ref(F)(2)(2);			\
+  Chimu_30=ref(F)(3)(0);			\
+  Chimu_31=ref(F)(3)(1);			\
+  Chimu_32=ref(F)(3)(2)
+
+#define LOAD_CHIMU(DIR,F)						\
+  { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
+
+#define LOAD_CHIMU_GPARITY(DIR,F)					\
+  { int g; const SiteSpinor & ref = GparityGetChi<SiteSpinor>(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); }
+
+#define LOAD_CHI_BODY(F)				\
+    Chi_00 = ref(F)(0)(0);\
+    Chi_01 = ref(F)(0)(1);\
+    Chi_02 = ref(F)(0)(2);\
+    Chi_10 = ref(F)(1)(0);\
+    Chi_11 = ref(F)(1)(1);\
+    Chi_12 = ref(F)(1)(2)
+
+#define LOAD_CHI(DIR,F)						\
+  {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
+
+#define LOAD_CHI_GPARITY(DIR,F)						\
+  { int g; const SiteHalfSpinor &ref = GparityGetChi<SiteHalfSpinor>(g,buf,DIR,F,SE,st); LOAD_CHI_BODY(g); }
 
-#define LOAD_CHI\
-  {const SiteHalfSpinor &ref(buf[offset]);	\
-    Chi_00 = ref()(0)(0);\
-    Chi_01 = ref()(0)(1);\
-    Chi_02 = ref()(0)(2);\
-    Chi_10 = ref()(1)(0);\
-    Chi_11 = ref()(1)(1);\
-    Chi_12 = ref()(1)(2);}
 
 // To splat or not to splat depends on the implementation
-#define MULT_2SPIN(A)\
-  {auto & ref(U._odata[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));	\
-   Impl::loadLinkElement(U_10,ref()(1,0));	\
-   Impl::loadLinkElement(U_20,ref()(2,0));	\
-   Impl::loadLinkElement(U_01,ref()(0,1));	\
-   Impl::loadLinkElement(U_11,ref()(1,1));	\
-   Impl::loadLinkElement(U_21,ref()(2,1));	\
-    UChi_00 = U_00*Chi_00;\
-    UChi_10 = U_00*Chi_10;\
-    UChi_01 = U_10*Chi_00;\
-    UChi_11 = U_10*Chi_10;\
-    UChi_02 = U_20*Chi_00;\
-    UChi_12 = U_20*Chi_10;\
-    UChi_00+= U_01*Chi_01;\
-    UChi_10+= U_01*Chi_11;\
-    UChi_01+= U_11*Chi_01;\
-    UChi_11+= U_11*Chi_11;\
-    UChi_02+= U_21*Chi_01;\
-    UChi_12+= U_21*Chi_11;\
-    Impl::loadLinkElement(U_00,ref()(0,2));	\
-    Impl::loadLinkElement(U_10,ref()(1,2));	\
-    Impl::loadLinkElement(U_20,ref()(2,2));	\
-    UChi_00+= U_00*Chi_02;\
-    UChi_10+= U_00*Chi_12;\
-    UChi_01+= U_10*Chi_02;\
-    UChi_11+= U_10*Chi_12;\
-    UChi_02+= U_20*Chi_02;\
-    UChi_12+= U_20*Chi_12;}
+#define MULT_2SPIN_BODY \
+  Impl::loadLinkElement(U_00,ref()(0,0));	\
+  Impl::loadLinkElement(U_10,ref()(1,0));	\
+  Impl::loadLinkElement(U_20,ref()(2,0));	\
+  Impl::loadLinkElement(U_01,ref()(0,1));	\
+  Impl::loadLinkElement(U_11,ref()(1,1));	\
+  Impl::loadLinkElement(U_21,ref()(2,1));	\
+  UChi_00 = U_00*Chi_00;			\
+  UChi_10 = U_00*Chi_10;			\
+  UChi_01 = U_10*Chi_00;			\
+  UChi_11 = U_10*Chi_10;			\
+  UChi_02 = U_20*Chi_00;			\
+  UChi_12 = U_20*Chi_10;			\
+  UChi_00+= U_01*Chi_01;			\
+  UChi_10+= U_01*Chi_11;			\
+  UChi_01+= U_11*Chi_01;			\
+  UChi_11+= U_11*Chi_11;			\
+  UChi_02+= U_21*Chi_01;			\
+  UChi_12+= U_21*Chi_11;			\
+  Impl::loadLinkElement(U_00,ref()(0,2));	\
+  Impl::loadLinkElement(U_10,ref()(1,2));	\
+  Impl::loadLinkElement(U_20,ref()(2,2));	\
+  UChi_00+= U_00*Chi_02;			\
+  UChi_10+= U_00*Chi_12;			\
+  UChi_01+= U_10*Chi_02;			\
+  UChi_11+= U_10*Chi_12;			\
+  UChi_02+= U_20*Chi_02;			\
+  UChi_12+= U_20*Chi_12
+
+
+#define MULT_2SPIN(A,F)					\
+  {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
+
+#define MULT_2SPIN_GPARITY(A,F)				\
+  {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
 
 
 #define PERMUTE_DIR(dir)			\
@@ -307,84 +324,85 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   result_31-= UChi_11;	\
   result_32-= UChi_12;
 
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
   SE=st.GetEntry(ptype,DIR,ss);			\
   offset = SE->_offset;				\
   local  = SE->_is_local;			\
   perm   = SE->_permute;			\
   if ( local ) {				\
-    LOAD_CHIMU;					\
+    LOAD_CHIMU_IMPL(DIR,F);			\
     PROJ;					\
     if ( perm) {				\
       PERMUTE_DIR(PERM);			\
     }						\
   } else {					\
-    LOAD_CHI;					\
+    LOAD_CHI_IMPL(DIR,F);			\
   }						\
-  MULT_2SPIN(DIR);				\
+  MULT_2SPIN_IMPL(DIR,F);			\
   RECON;					
 
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
+
+#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
   SE=st.GetEntry(ptype,DIR,ss);			\
   offset = SE->_offset;				\
   local  = SE->_is_local;			\
   perm   = SE->_permute;			\
   if ( local ) {				\
-    LOAD_CHIMU;					\
+    LOAD_CHIMU_IMPL(DIR,F);			\
     PROJ;					\
     if ( perm) {				\
       PERMUTE_DIR(PERM);			\
     }						\
   } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI;					\
+    LOAD_CHI_IMPL(DIR,F);			\
   }						\
   if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN(DIR);				\
+    MULT_2SPIN_IMPL(DIR,F);			\
     RECON;					\
   }
 
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
   SE=st.GetEntry(ptype,DIR,ss);			\
   offset = SE->_offset;				\
   if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI;					\
-    MULT_2SPIN(DIR);				\
+    LOAD_CHI_IMPL(DIR,F);			\
+    MULT_2SPIN_IMPL(DIR,F);			\
     RECON;					\
     nmu++;					\
   }
 
-#define HAND_RESULT(ss)				\
+#define HAND_RESULT(ss,F)			\
   {						\
     SiteSpinor & ref (out._odata[ss]);		\
-    vstream(ref()(0)(0),result_00);		\
-    vstream(ref()(0)(1),result_01);		\
-    vstream(ref()(0)(2),result_02);		\
-    vstream(ref()(1)(0),result_10);		\
-    vstream(ref()(1)(1),result_11);		\
-    vstream(ref()(1)(2),result_12);		\
-    vstream(ref()(2)(0),result_20);		\
-    vstream(ref()(2)(1),result_21);		\
-    vstream(ref()(2)(2),result_22);		\
-    vstream(ref()(3)(0),result_30);		\
-    vstream(ref()(3)(1),result_31);		\
-    vstream(ref()(3)(2),result_32);		\
+    vstream(ref(F)(0)(0),result_00);		\
+    vstream(ref(F)(0)(1),result_01);		\
+    vstream(ref(F)(0)(2),result_02);		\
+    vstream(ref(F)(1)(0),result_10);		\
+    vstream(ref(F)(1)(1),result_11);		\
+    vstream(ref(F)(1)(2),result_12);		\
+    vstream(ref(F)(2)(0),result_20);		\
+    vstream(ref(F)(2)(1),result_21);		\
+    vstream(ref(F)(2)(2),result_22);		\
+    vstream(ref(F)(3)(0),result_30);		\
+    vstream(ref(F)(3)(1),result_31);		\
+    vstream(ref(F)(3)(2),result_32);		\
   }
 
-#define HAND_RESULT_EXT(ss)			\
+#define HAND_RESULT_EXT(ss,F)			\
   if (nmu){					\
     SiteSpinor & ref (out._odata[ss]);		\
-    ref()(0)(0)+=result_00;		\
-    ref()(0)(1)+=result_01;		\
-    ref()(0)(2)+=result_02;		\
-    ref()(1)(0)+=result_10;		\
-    ref()(1)(1)+=result_11;		\
-    ref()(1)(2)+=result_12;		\
-    ref()(2)(0)+=result_20;		\
-    ref()(2)(1)+=result_21;		\
-    ref()(2)(2)+=result_22;		\
-    ref()(3)(0)+=result_30;		\
-    ref()(3)(1)+=result_31;		\
-    ref()(3)(2)+=result_32;		\
+    ref(F)(0)(0)+=result_00;		\
+    ref(F)(0)(1)+=result_01;		\
+    ref(F)(0)(2)+=result_02;		\
+    ref(F)(1)(0)+=result_10;		\
+    ref(F)(1)(1)+=result_11;		\
+    ref(F)(1)(2)+=result_12;		\
+    ref(F)(2)(0)+=result_20;		\
+    ref(F)(2)(1)+=result_21;		\
+    ref(F)(2)(2)+=result_22;		\
+    ref(F)(3)(0)+=result_30;		\
+    ref(F)(3)(1)+=result_31;		\
+    ref(F)(3)(2)+=result_32;		\
   }
 
 
@@ -463,15 +481,18 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
   int offset,local,perm, ptype;
   StencilEntry *SE;
 
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
+#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+
+  HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 }
 
 template<class Impl>
@@ -485,16 +506,19 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
 
   StencilEntry *SE;
   int offset,local,perm, ptype;
-  
-  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
-  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
+
+#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+
+  HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 }
 
 template<class Impl> void 
@@ -509,16 +533,20 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
 
   int offset,local,perm, ptype;
   StencilEntry *SE;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
+
+#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT(ss,F)
+
+  HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 }
 
 template<class Impl>
@@ -532,16 +560,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
 
   StencilEntry *SE;
   int offset,local,perm, ptype;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT(ss);
+
+#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\
+  ZERO_RESULT;							\
+  HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\
+  HAND_RESULT(ss,F)
+  
+  HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 }
 
 template<class Impl> void 
@@ -557,16 +589,20 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
   int offset,local,perm, ptype;
   StencilEntry *SE;
   int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
+
+#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT_EXT(ss,F)
+
+  HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 }
 
 template<class Impl>
@@ -581,16 +617,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
   StencilEntry *SE;
   int offset,local,perm, ptype;
   int nmu=0;
-  ZERO_RESULT;
-  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
-  HAND_RESULT_EXT(ss);
+
+#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
+  ZERO_RESULT; \
+  HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
+  HAND_RESULT_EXT(ss,F)
+
+  HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
 }
 
   ////////////////////////////////////////////////
@@ -647,10 +687,130 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 				    FermionField &out){ assert(0); }	\
 
   HAND_SPECIALISE_EMPTY(GparityWilsonImplF);
-  HAND_SPECIALISE_EMPTY(GparityWilsonImplD);
+  //HAND_SPECIALISE_EMPTY(GparityWilsonImplD);
   HAND_SPECIALISE_EMPTY(GparityWilsonImplFH);
   HAND_SPECIALISE_EMPTY(GparityWilsonImplDF);
 
+
+
+
+
+template<> void 
+WilsonKernels<GparityWilsonImplD>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+  typedef GparityWilsonImplD Impl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+  HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+}
+
+template<>
+void WilsonKernels<GparityWilsonImplD>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{
+  typedef GparityWilsonImplD Impl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+  HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+}
+
+template<> void 
+WilsonKernels<GparityWilsonImplD>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+  typedef GparityWilsonImplD Impl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+  HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+}
+
+template<>
+void WilsonKernels<GparityWilsonImplD>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{
+  typedef GparityWilsonImplD Impl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype;  
+  HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+  HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+}
+
+template<> void 
+WilsonKernels<GparityWilsonImplD>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionField &in, FermionField &out)
+{
+// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
+  typedef GparityWilsonImplD Impl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  int nmu=0;
+  HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+  nmu = 0;
+  HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);  
+}
+
+template<>
+void WilsonKernels<GparityWilsonImplD>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionField &in, FermionField &out)
+{
+  typedef GparityWilsonImplD Impl;
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+
+  HAND_DECLARATIONS(ignore);
+
+  StencilEntry *SE;
+  int offset,local,perm, ptype;
+  int nmu=0;
+  HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+  nmu = 0;
+  HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+  
 ////////////// Wilson ; uses this implementation /////////////////////
 
 #define INSTANTIATE_THEM(A) \
diff --git a/tests/core/Test_gparity.cc b/tests/core/Test_gparity.cc
index cfb5d2c3..81091e9e 100644
--- a/tests/core/Test_gparity.cc
+++ b/tests/core/Test_gparity.cc
@@ -33,22 +33,68 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 
-typedef typename GparityDomainWallFermionR::FermionField FermionField;
+//typedef GparityDomainWallFermionD GparityDiracOp;
+//typedef DomainWallFermionD StandardDiracOp;
+//#define DOP_PARAMS
 
+typedef GparityMobiusFermionD GparityDiracOp;
+typedef MobiusFermionD StandardDiracOp;
+#define DOP_PARAMS ,1.5, 0.5
+
+
+typedef typename GparityDiracOp::FermionField GparityFermionField;
+typedef typename GparityDiracOp::GaugeField GparityGaugeField;
+typedef typename GparityFermionField::vector_type vComplexType;
+
+typedef typename StandardDiracOp::FermionField StandardFermionField;
+typedef typename StandardDiracOp::GaugeField StandardGaugeField;
+
+enum{ same_vComplex = std::is_same<vComplexType, typename StandardFermionField::vector_type>::value };
+static_assert(same_vComplex == 1, "Dirac Operators must have same underlying SIMD complex type");
 
 int main (int argc, char ** argv)
 {
-  const int nu = 3;
+  int nu = 0;
 
   Grid_init(&argc,&argv);
 
+  for(int i=1;i<argc;i++){
+    if(std::string(argv[i]) == "--Gparity-dir"){
+      std::stringstream ss; ss << argv[i+1]; ss >> nu;
+      std::cout << GridLogMessage << "Set Gparity direction to " << nu << std::endl;
+    }
+  }
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Testing Gparity Dirac operator                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexType::Nsimd()<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
   const int Ls=4;
-  const int L =4;
-  std::vector<int> latt_2f(Nd,L);
-  std::vector<int> latt_1f(Nd,L); latt_1f[nu] = 2*L;
+  //const int L =4;
+  //std::vector<int> latt_2f(Nd,L);
 
-  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> latt_2f = GridDefaultLatt();
+  std::vector<int> latt_1f(latt_2f); latt_1f[nu] = 2*latt_2f[nu];
+  int L = latt_2f[nu];
+
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexType::Nsimd());
+
+  std::cout << GridLogMessage << "SIMD layout: ";
+  for(int i=0;i<simd_layout.size();i++) std::cout << simd_layout[i] << " ";
+  std::cout << std::endl;
+  
   std::vector<int> mpi_layout  = GridDefaultMpi(); //node layout
 
   GridCartesian         * UGrid_1f   = SpaceTimeGrid::makeFourDimGrid(latt_1f, simd_layout, mpi_layout);
@@ -67,13 +113,13 @@ int main (int argc, char ** argv)
   GridParallelRNG          RNG5_2f(FGrid_2f);  RNG5_2f.SeedFixedIntegers(seeds5);
   GridParallelRNG          RNG4_2f(UGrid_2f);  RNG4_2f.SeedFixedIntegers(seeds4);
 
-  LatticeGaugeField Umu_2f(UGrid_2f);
+  GparityGaugeField Umu_2f(UGrid_2f);
   SU3::HotConfiguration(RNG4_2f,Umu_2f);
 
-  LatticeFermion    src   (FGrid_2f); 
-  LatticeFermion    tmpsrc(FGrid_2f); 
-  FermionField      src_2f(FGrid_2f); 
-  LatticeFermion    src_1f(FGrid_1f); 
+  StandardFermionField    src   (FGrid_2f); 
+  StandardFermionField    tmpsrc(FGrid_2f); 
+  GparityFermionField      src_2f(FGrid_2f); 
+  StandardFermionField    src_1f(FGrid_1f); 
 
   // Replicate fermion source
   random(RNG5_2f,src);
@@ -81,8 +127,8 @@ int main (int argc, char ** argv)
   tmpsrc=src*2.0;
   PokeIndex<0>(src_2f,tmpsrc,1);
 
-  LatticeFermion result_1f(FGrid_1f); result_1f=zero;
-  LatticeGaugeField Umu_1f(UGrid_1f); 
+  StandardFermionField result_1f(FGrid_1f); result_1f=zero;
+  StandardGaugeField Umu_1f(UGrid_1f); 
   Replicate(Umu_2f,Umu_1f);
 
   //Coordinate grid for reference
@@ -92,7 +138,7 @@ int main (int argc, char ** argv)
   //Copy-conjugate the gauge field
   //First C-shift the lattice by Lx/2
   {
-    LatticeGaugeField Umu_shift = conjugate( Cshift(Umu_1f,nu,L) );
+    StandardGaugeField Umu_shift = conjugate( Cshift(Umu_1f,nu,L) );
     Umu_1f = where( xcoor_1f >= Integer(L), Umu_shift, Umu_1f );
 
     // hack test to check the same
@@ -101,7 +147,7 @@ int main (int argc, char ** argv)
     cout << GridLogMessage << "Umu diff " << norm2(Umu_shift)<<std::endl;
 
     //Make the gauge field antiperiodic in nu-direction
-    LatticeColourMatrix Unu(UGrid_1f);
+    decltype(PeekIndex<LorentzIndex>(Umu_1f,nu)) Unu(UGrid_1f);
     Unu = PeekIndex<LorentzIndex>(Umu_1f,nu);
     Unu = where(xcoor_1f == Integer(2*L-1), -Unu, Unu);
     PokeIndex<LorentzIndex>(Umu_1f,Unu,nu);
@@ -115,33 +161,33 @@ int main (int argc, char ** argv)
 
   RealD mass=0.0;
   RealD M5=1.8;
-  DomainWallFermionR Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5);
+  StandardDiracOp Ddwf(Umu_1f,*FGrid_1f,*FrbGrid_1f,*UGrid_1f,*UrbGrid_1f,mass,M5 DOP_PARAMS);
 
-  LatticeFermion    src_o_1f(FrbGrid_1f);
-  LatticeFermion result_o_1f(FrbGrid_1f);
+  StandardFermionField    src_o_1f(FrbGrid_1f);
+  StandardFermionField result_o_1f(FrbGrid_1f);
   pickCheckerboard(Odd,src_o_1f,src_1f);
   result_o_1f=zero;
 
-  SchurDiagMooeeOperator<DomainWallFermionR,LatticeFermion> HermOpEO(Ddwf);
-  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
+  SchurDiagMooeeOperator<StandardDiracOp,StandardFermionField> HermOpEO(Ddwf);
+  ConjugateGradient<StandardFermionField> CG(1.0e-8,10000);
   CG(HermOpEO,src_o_1f,result_o_1f);
   
   //  const int nu = 3;
   std::vector<int> twists(Nd,0);
   twists[nu] = 1;
-  GparityDomainWallFermionR::ImplParams params;
+  GparityDiracOp::ImplParams params;
   params.twists = twists;
-  GparityDomainWallFermionR GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5,params);
+  GparityDiracOp GPDdwf(Umu_2f,*FGrid_2f,*FrbGrid_2f,*UGrid_2f,*UrbGrid_2f,mass,M5 DOP_PARAMS,params);
 
   for(int disp=-1;disp<=1;disp+=2)
   for(int mu=0;mu<5;mu++)
   { 
-    FermionField Dsrc_2f(FGrid_2f);
+    GparityFermionField Dsrc_2f(FGrid_2f);
 
-    LatticeFermion Dsrc_1f(FGrid_1f);
-    LatticeFermion Dsrc_2freplica(FGrid_1f);
-    LatticeFermion Dsrc_2freplica0(FGrid_1f);
-    LatticeFermion Dsrc_2freplica1(FGrid_1f);
+    StandardFermionField Dsrc_1f(FGrid_1f);
+    StandardFermionField Dsrc_2freplica(FGrid_1f);
+    StandardFermionField Dsrc_2freplica0(FGrid_1f);
+    StandardFermionField Dsrc_2freplica1(FGrid_1f);
 
     if ( mu ==0 ) {
       std::cout << GridLogMessage<< " Cross checking entire hopping term"<<std::endl;
@@ -156,8 +202,8 @@ int main (int argc, char ** argv)
     std::cout << GridLogMessage << "S norms "<< norm2(src_2f) << " " << norm2(src_1f)  <<std::endl;
     std::cout << GridLogMessage << "D norms "<< norm2(Dsrc_2f)<< " " << norm2(Dsrc_1f) <<std::endl;
 
-    LatticeFermion Dsrc_2f0(FGrid_2f); Dsrc_2f0 = PeekIndex<0>(Dsrc_2f,0);
-    LatticeFermion Dsrc_2f1(FGrid_2f); Dsrc_2f1 = PeekIndex<0>(Dsrc_2f,1);
+    StandardFermionField Dsrc_2f0(FGrid_2f); Dsrc_2f0 = PeekIndex<0>(Dsrc_2f,0);
+    StandardFermionField Dsrc_2f1(FGrid_2f); Dsrc_2f1 = PeekIndex<0>(Dsrc_2f,1);
 
     //    Dsrc_2f1 = Dsrc_2f1 - Dsrc_2f0;
     //    std::cout << GridLogMessage << " Cross check two halves " <<norm2(Dsrc_2f1)<<std::endl;
@@ -174,20 +220,20 @@ int main (int argc, char ** argv)
   }
 
   {
-    FermionField chi   (FGrid_2f); gaussian(RNG5_2f,chi);
-    FermionField phi   (FGrid_2f); gaussian(RNG5_2f,phi);
+    GparityFermionField chi   (FGrid_2f); gaussian(RNG5_2f,chi);
+    GparityFermionField phi   (FGrid_2f); gaussian(RNG5_2f,phi);
   
-    FermionField chi_e   (FrbGrid_2f);
-    FermionField chi_o   (FrbGrid_2f);
+    GparityFermionField chi_e   (FrbGrid_2f);
+    GparityFermionField chi_o   (FrbGrid_2f);
     
-    FermionField dchi_e  (FrbGrid_2f);
-    FermionField dchi_o  (FrbGrid_2f);
+    GparityFermionField dchi_e  (FrbGrid_2f);
+    GparityFermionField dchi_o  (FrbGrid_2f);
     
-    FermionField phi_e   (FrbGrid_2f);
-    FermionField phi_o   (FrbGrid_2f);
+    GparityFermionField phi_e   (FrbGrid_2f);
+    GparityFermionField phi_o   (FrbGrid_2f);
     
-    FermionField dphi_e  (FrbGrid_2f);
-    FermionField dphi_o  (FrbGrid_2f);
+    GparityFermionField dphi_e  (FrbGrid_2f);
+    GparityFermionField dphi_o  (FrbGrid_2f);
 
     pickCheckerboard(Even,chi_e,chi);
     pickCheckerboard(Odd ,chi_o,chi);
@@ -212,14 +258,14 @@ int main (int argc, char ** argv)
 
   }
 
-  FermionField result_2f(FGrid_2f); result_2f=zero;
-  FermionField    src_o_2f(FrbGrid_2f);
-  FermionField result_o_2f(FrbGrid_2f);
+  GparityFermionField result_2f(FGrid_2f); result_2f=zero;
+  GparityFermionField    src_o_2f(FrbGrid_2f);
+  GparityFermionField result_o_2f(FrbGrid_2f);
   pickCheckerboard(Odd,src_o_2f,src_2f);
   result_o_2f=zero;
 
-  ConjugateGradient<FermionField> CG2f(1.0e-8,10000);
-  SchurDiagMooeeOperator<GparityDomainWallFermionR,FermionField> HermOpEO2f(GPDdwf);
+  ConjugateGradient<GparityFermionField> CG2f(1.0e-8,10000);
+  SchurDiagMooeeOperator<GparityDiracOp,GparityFermionField> HermOpEO2f(GPDdwf);
   CG2f(HermOpEO2f,src_o_2f,result_o_2f);
 
   std::cout << "2f cb "<<result_o_2f.checkerboard<<std::endl;
@@ -227,10 +273,10 @@ int main (int argc, char ** argv)
 
   std::cout << " result norms " <<norm2(result_o_2f)<<" " <<norm2(result_o_1f)<<std::endl;
 
-  LatticeFermion    res0o  (FrbGrid_2f); 
-  LatticeFermion    res1o  (FrbGrid_2f); 
-  LatticeFermion    res0  (FGrid_2f); 
-  LatticeFermion    res1  (FGrid_2f); 
+  StandardFermionField    res0o  (FrbGrid_2f); 
+  StandardFermionField    res1o  (FrbGrid_2f); 
+  StandardFermionField    res0  (FGrid_2f); 
+  StandardFermionField    res1  (FGrid_2f); 
 
   res0=zero;
   res1=zero;
@@ -244,9 +290,9 @@ int main (int argc, char ** argv)
   setCheckerboard(res0,res0o);
   setCheckerboard(res1,res1o);
 
-  LatticeFermion replica (FGrid_1f);
-  LatticeFermion replica0(FGrid_1f);
-  LatticeFermion replica1(FGrid_1f);
+  StandardFermionField replica (FGrid_1f);
+  StandardFermionField replica0(FGrid_1f);
+  StandardFermionField replica1(FGrid_1f);
   Replicate(res0,replica0);
   Replicate(res1,replica1);
 

From 061e48fd73c63d924e8d777ab74287f7d69d7c2a Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Tue, 22 Aug 2017 18:12:12 -0400
Subject: [PATCH 04/14] Replaced slow unpack-repack in G-parity BC twist with
 intrinsics version

---
 lib/qcd/action/fermion/FermionOperatorImpl.h | 26 ++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 1e344521..5300063b 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -565,6 +565,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
 
  template<typename SiteSpinorType>
  void GparityTwistPermute(SiteSpinorType &into,  const SiteSpinorType &from, const int direction, const int distance, const int perm, GridBase* grid){
+#if 0
    typedef typename SiteSpinorType::scalar_object sobj;
    sobj stmp;
    std::vector<sobj> vals(grid->Nsimd());
@@ -589,6 +590,31 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
      }
    }
    merge(into,vals);
+#else
+   int permute_type = grid->PermuteType(direction);
+   typedef typename SiteSpinorType::vector_type vtype;
+   vtype tmp1, tmp2, tmp3, tmp4;
+
+   for(int s=0;s<Ns;s++){
+     for(int c=0;c<Nrepresentation;c++){
+       permute(tmp1, from(1)(s)(c), permute_type); //1l 1h -> 1h 1l
+       exchange(tmp2,tmp3, from(0)(s)(c), tmp1, permute_type); // 0l 0h , 1h 1l -> 0l 1h 0h,1l
+       permute(tmp4, tmp3, permute_type); //0h,1l -> 1l,0h
+              
+       if( (distance == 1 && !perm) || (distance == -1 && perm) ){
+	 //Pulled fermion through forwards face, GPBC on upper component
+	 //Need 0= 0l 1h   1= 1l 0h
+	 into(0)(s)(c) = tmp2;
+	 into(1)(s)(c) = tmp4;
+       }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){
+	 //Pulled fermion through backwards face, GPBC on lower component
+	 //Need 0= 1l 0h   1= 0l 1h	 
+	 into(0)(s)(c) = tmp4;
+	 into(1)(s)(c) = tmp2;
+       }else assert(0);
+     }
+   }
+#endif
  }
 
 

From b61835c1a53ab6077840aabdb3ef8e77ea161008 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Wed, 23 Aug 2017 12:33:48 -0400
Subject: [PATCH 05/14] Added inplace version of intrinsic G-parity twist to
 hand-unrolled kernel

---
 lib/qcd/action/fermion/WilsonKernelsHand.cc | 90 ++++++++++++++++++++-
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index 866e30d2..045a2cda 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -47,9 +47,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define LOAD_CHIMU(DIR,F)						\
   { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
 
-#define LOAD_CHIMU_GPARITY(DIR,F)					\
-  { int g; const SiteSpinor & ref = GparityGetChi<SiteSpinor>(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); }
-
 #define LOAD_CHI_BODY(F)				\
     Chi_00 = ref(F)(0)(0);\
     Chi_01 = ref(F)(0)(1);\
@@ -61,10 +58,95 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define LOAD_CHI(DIR,F)						\
   {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
 
-#define LOAD_CHI_GPARITY(DIR,F)						\
+
+//G-parity implementations using implementation method
+#define LOAD_CHIMU_GPARITY_IMPL(DIR,F)					\
+  { int g; const SiteSpinor & ref = GparityGetChi<SiteSpinor>(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); }
+
+#define LOAD_CHI_GPARITY_IMPL(DIR,F)					\
   { int g; const SiteHalfSpinor &ref = GparityGetChi<SiteHalfSpinor>(g,buf,DIR,F,SE,st); LOAD_CHI_BODY(g); }
 
 
+//G-parity implementations using in-place intrinsic ops
+
+//1l 1h -> 1h 1l
+//0l 0h , 1h 1l -> 0l 1h 0h,1l
+//0h,1l -> 1l,0h
+//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
+//Pulled fermion through forwards face, GPBC on upper component
+//Need 0= 0l 1h   1= 1l 0h
+//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
+//Pulled fermion through backwards face, GPBC on lower component
+//Need 0= 1l 0h   1= 0l 1h
+#define DO_TWIST(INTO,S,C,F, tmp1, tmp2, tmp3, tmp4)			\
+  permute(tmp1, ref(1)(S)(C), permute_type);				\
+  exchange(tmp2,tmp3, ref(0)(S)(C), tmp1, permute_type);		\
+  permute(tmp4, tmp3, permute_type);					\
+  if( (distance == 1 && !perm) || (distance == -1 && perm) ){		\
+    INTO = F == 0 ? tmp2 : tmp4;					\
+  }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){	\
+    INTO = F == 0 ? tmp4 : tmp2;					\
+  }
+
+#define LOAD_CHI_SETUP(DIR,F)						\
+  int g = F;								\
+  const int direction = st._directions[DIR];				\
+  const int distance = st._distances[DIR];				\
+  const int sl = st._grid->_simd_layout[direction];			\
+  int inplace_twist = 0;						\
+  if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
+    if(sl == 1){							\
+      g = (F+1) % 2;							\
+    }else{								\
+      inplace_twist = 1;						\
+    }									\
+  }  
+
+#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F)				\
+  { const SiteSpinor &ref(in._odata[offset]);				\
+    LOAD_CHI_SETUP(DIR,F);						\
+    if(!inplace_twist){							\
+      LOAD_CHIMU_BODY(g);						\
+    }else{								\
+      const int permute_type = st._grid->PermuteType(direction);	\
+      DO_TWIST(Chimu_00,0,0,F,  U_00,U_01,U_10,U_11);			\
+      DO_TWIST(Chimu_01,0,1,F,  U_20,U_21,U_00,U_01);			\
+      DO_TWIST(Chimu_02,0,2,F,  U_10,U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_10,1,0,F,  U_00,U_01,U_10,U_11);			\
+      DO_TWIST(Chimu_11,1,1,F,  U_20,U_21,U_00,U_01);			\
+      DO_TWIST(Chimu_12,1,2,F,  U_10,U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_20,2,0,F,  U_00,U_01,U_10,U_11);			\
+      DO_TWIST(Chimu_21,2,1,F,  U_20,U_21,U_00,U_01);			\
+      DO_TWIST(Chimu_22,2,2,F,  U_10,U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_30,3,0,F,  U_00,U_01,U_10,U_11);			\
+      DO_TWIST(Chimu_31,3,1,F,  U_20,U_21,U_00,U_01);			\
+      DO_TWIST(Chimu_32,3,2,F,  U_10,U_11,U_20,U_21);			\
+    } \
+  }
+
+
+#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F)				\
+  { const SiteHalfSpinor &ref(buf[offset]);				\
+    LOAD_CHI_SETUP(DIR,F);						\
+    if(!inplace_twist){							\
+      LOAD_CHI_BODY(g);							\
+    }else{								\
+      const int permute_type = st._grid->PermuteType(direction);	\
+      DO_TWIST(Chi_00,0,0,F,  U_00,U_01,U_10,U_11);			\
+      DO_TWIST(Chi_01,0,1,F,  U_20,U_21,UChi_00,UChi_01);		\
+      DO_TWIST(Chi_02,0,2,F,  UChi_02,UChi_10,UChi_11,UChi_12);		\
+      DO_TWIST(Chi_10,1,0,F,  U_00,U_01,U_10,U_11);			\
+      DO_TWIST(Chi_11,1,1,F,  U_20,U_21,UChi_00,UChi_01);		\
+      DO_TWIST(Chi_12,1,2,F,  UChi_02,UChi_10,UChi_11,UChi_12);		\
+    }									\
+  }
+
+//#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_IMPL(DIR,F)
+#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F)
+
+//#define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_IMPL(DIR,F)
+#define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F)
+
 // To splat or not to splat depends on the implementation
 #define MULT_2SPIN_BODY \
   Impl::loadLinkElement(U_00,ref()(0,0));	\

From 46f88e6d726c238b84a9b05894f7157cb6f5c2a3 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Wed, 23 Aug 2017 13:21:10 -0400
Subject: [PATCH 06/14] G-parity hand-unrolled intrinsics twist now uses one
 less permute and one less temporary

---
 lib/qcd/action/fermion/WilsonKernelsHand.cc | 66 +++++++++++++--------
 1 file changed, 42 insertions(+), 24 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index 045a2cda..6e03379e 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -78,16 +78,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 //else if( (distance == -1 && !perm) || (distance == 1 && perm) )
 //Pulled fermion through backwards face, GPBC on lower component
 //Need 0= 1l 0h   1= 0l 1h
-#define DO_TWIST(INTO,S,C,F, tmp1, tmp2, tmp3, tmp4)			\
+
+//1l 1h -> 1h 1l
+//0l 0h , 1h 1l -> 0l 1h 0h,1l
+#define DO_TWIST_0L_1H(INTO,S,C,F, tmp1, tmp2, tmp3) \
   permute(tmp1, ref(1)(S)(C), permute_type);				\
   exchange(tmp2,tmp3, ref(0)(S)(C), tmp1, permute_type);		\
-  permute(tmp4, tmp3, permute_type);					\
-  if( (distance == 1 && !perm) || (distance == -1 && perm) ){		\
-    INTO = F == 0 ? tmp2 : tmp4;					\
-  }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){	\
-    INTO = F == 0 ? tmp4 : tmp2;					\
+  INTO = tmp2;
+
+//0l 0h -> 0h 0l
+//1l 1h, 0h 0l -> 1l 0h, 1h 0l
+#define DO_TWIST_1L_0H(INTO,S,C,F, tmp1, tmp2, tmp3)			\
+  permute(tmp1, ref(0)(S)(C), permute_type);				\
+  exchange(tmp2,tmp3, ref(1)(S)(C), tmp1, permute_type);		\
+  INTO = tmp2;
+
+#define DO_TWIST(INTO,S,C,F, tmp1, tmp2, tmp3)				\
+  if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
+       ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
+    DO_TWIST_0L_1H(INTO,S,C,F,tmp1,tmp2,tmp3);				\
+  }else{								\
+    DO_TWIST_1L_0H(INTO,S,C,F,tmp1,tmp2,tmp3);				\
   }
 
+
 #define LOAD_CHI_SETUP(DIR,F)						\
   int g = F;								\
   const int direction = st._directions[DIR];				\
@@ -109,18 +123,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
       LOAD_CHIMU_BODY(g);						\
     }else{								\
       const int permute_type = st._grid->PermuteType(direction);	\
-      DO_TWIST(Chimu_00,0,0,F,  U_00,U_01,U_10,U_11);			\
-      DO_TWIST(Chimu_01,0,1,F,  U_20,U_21,U_00,U_01);			\
-      DO_TWIST(Chimu_02,0,2,F,  U_10,U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_10,1,0,F,  U_00,U_01,U_10,U_11);			\
-      DO_TWIST(Chimu_11,1,1,F,  U_20,U_21,U_00,U_01);			\
-      DO_TWIST(Chimu_12,1,2,F,  U_10,U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_20,2,0,F,  U_00,U_01,U_10,U_11);			\
-      DO_TWIST(Chimu_21,2,1,F,  U_20,U_21,U_00,U_01);			\
-      DO_TWIST(Chimu_22,2,2,F,  U_10,U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_30,3,0,F,  U_00,U_01,U_10,U_11);			\
-      DO_TWIST(Chimu_31,3,1,F,  U_20,U_21,U_00,U_01);			\
-      DO_TWIST(Chimu_32,3,2,F,  U_10,U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_00,0,0,F,  U_00,U_01,U_10);			\
+      DO_TWIST(Chimu_01,0,1,F,  U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_02,0,2,F,  U_00,U_01,U_10);			\
+      DO_TWIST(Chimu_10,1,0,F,  U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_11,1,1,F,  U_00,U_01,U_10);			\
+      DO_TWIST(Chimu_12,1,2,F,  U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_20,2,0,F,  U_00,U_01,U_10);			\
+      DO_TWIST(Chimu_21,2,1,F,  U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_22,2,2,F,  U_00,U_01,U_10);			\
+      DO_TWIST(Chimu_30,3,0,F,  U_11,U_20,U_21);			\
+      DO_TWIST(Chimu_31,3,1,F,  U_00,U_01,U_10);			\
+      DO_TWIST(Chimu_32,3,2,F,  U_11,U_20,U_21);			\
     } \
   }
 
@@ -132,15 +146,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
       LOAD_CHI_BODY(g);							\
     }else{								\
       const int permute_type = st._grid->PermuteType(direction);	\
-      DO_TWIST(Chi_00,0,0,F,  U_00,U_01,U_10,U_11);			\
-      DO_TWIST(Chi_01,0,1,F,  U_20,U_21,UChi_00,UChi_01);		\
-      DO_TWIST(Chi_02,0,2,F,  UChi_02,UChi_10,UChi_11,UChi_12);		\
-      DO_TWIST(Chi_10,1,0,F,  U_00,U_01,U_10,U_11);			\
-      DO_TWIST(Chi_11,1,1,F,  U_20,U_21,UChi_00,UChi_01);		\
-      DO_TWIST(Chi_12,1,2,F,  UChi_02,UChi_10,UChi_11,UChi_12);		\
+      DO_TWIST(Chi_00,0,0,F,  U_00,U_01,U_10);				\
+      DO_TWIST(Chi_01,0,1,F,  U_11,U_20,U_21);				\
+      DO_TWIST(Chi_02,0,2,F,  UChi_00,UChi_01,UChi_02);			\
+      DO_TWIST(Chi_10,1,0,F,  UChi_10,UChi_11,UChi_12);			\
+      DO_TWIST(Chi_11,1,1,F,  U_00,U_01,U_10);				\
+      DO_TWIST(Chi_12,1,2,F,  U_11,U_20,U_21);				\
     }									\
   }
 
+
+
+
+
 //#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_IMPL(DIR,F)
 #define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F)
 

From a0bb8e5b4660634db2b4a7cf994b2665926b24e9 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Wed, 23 Aug 2017 14:44:40 -0400
Subject: [PATCH 07/14] Added hand-unrolled kernel implementations of all the
 other dslash precision / comms precision combinations with G-parity

---
 lib/qcd/action/fermion/WilsonKernelsHand.cc | 213 ++++++++++----------
 1 file changed, 103 insertions(+), 110 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index 6e03379e..ea04845e 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -786,120 +786,113 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 				    const FermionField &in,		\
 				    FermionField &out){ assert(0); }	\
 
-  HAND_SPECIALISE_EMPTY(GparityWilsonImplF);
-  //HAND_SPECIALISE_EMPTY(GparityWilsonImplD);
-  HAND_SPECIALISE_EMPTY(GparityWilsonImplFH);
-  HAND_SPECIALISE_EMPTY(GparityWilsonImplDF);
 
 
+#define HAND_SPECIALISE_GPARITY(IMPL)					\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+				    int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype;					\
+    StencilEntry *SE;							\
+    HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+					    int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype;					\
+    HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+						     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype;					\
+    StencilEntry *SE;							\
+    HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+							     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype;					\
+    HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+									\
+  template<> void							\
+  WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \
+						     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    int offset,local,perm, ptype;					\
+    StencilEntry *SE;							\
+    int nmu=0;								\
+    HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    nmu = 0;								\
+    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }									\
+  template<>								\
+  void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
+							     int ss,int sU,const FermionField &in, FermionField &out) \
+  {									\
+    typedef IMPL Impl;							\
+    typedef typename Simd::scalar_type S;				\
+    typedef typename Simd::vector_type V;				\
+									\
+    HAND_DECLARATIONS(ignore);						\
+									\
+    StencilEntry *SE;							\
+    int offset,local,perm, ptype;					\
+    int nmu=0;								\
+    HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+    nmu = 0;								\
+    HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
+  }
 
 
-
-template<> void 
-WilsonKernels<GparityWilsonImplD>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef GparityWilsonImplD Impl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-  HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-}
-
-template<>
-void WilsonKernels<GparityWilsonImplD>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef GparityWilsonImplD Impl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-  HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-}
-
-template<> void 
-WilsonKernels<GparityWilsonImplD>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef GparityWilsonImplD Impl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-  HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-}
-
-template<>
-void WilsonKernels<GparityWilsonImplD>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef GparityWilsonImplD Impl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;  
-  HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-  HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-}
-
-template<> void 
-WilsonKernels<GparityWilsonImplD>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionField &in, FermionField &out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef GparityWilsonImplD Impl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  int nmu=0;
-  HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-  nmu = 0;
-  HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);  
-}
-
-template<>
-void WilsonKernels<GparityWilsonImplD>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionField &in, FermionField &out)
-{
-  typedef GparityWilsonImplD Impl;
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-
-  HAND_DECLARATIONS(ignore);
-
-  StencilEntry *SE;
-  int offset,local,perm, ptype;
-  int nmu=0;
-  HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-  nmu = 0;
-  HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY);
-}
-
-
-
+HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
+HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
 
 
 

From ce5df177eeccee214f3f394bbdec385f7c19caeb Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Wed, 23 Aug 2017 15:05:22 -0400
Subject: [PATCH 08/14] Removed superfluous implementation of G-parity twist
 for hand-unrolled kernel from GparityWilsonImpl

---
 lib/qcd/action/fermion/FermionOperatorImpl.h | 85 +-------------------
 lib/qcd/action/fermion/WilsonKernelsHand.cc  | 14 ----
 2 files changed, 1 insertion(+), 98 deletions(-)

diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index 5300063b..ffb82989 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -478,10 +478,7 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
       
  ImplParams Params;
 
- std::vector<SiteSpinor, alignedAllocator<SiteSpinor> > tmp_full;
- std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > tmp_half;
-
- GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p), tmp_full(GridThread::GetThreads()), tmp_half(GridThread::GetThreads()){};
+ GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){};
 
  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
 
@@ -563,86 +560,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
    reg = memory;
  }
 
- template<typename SiteSpinorType>
- void GparityTwistPermute(SiteSpinorType &into,  const SiteSpinorType &from, const int direction, const int distance, const int perm, GridBase* grid){
-#if 0
-   typedef typename SiteSpinorType::scalar_object sobj;
-   sobj stmp;
-   std::vector<sobj> vals(grid->Nsimd());
-   extract(from,vals);
-   std::vector<int> icoor;
-   for(int s=0;s<grid->Nsimd();s++){
-     grid->iCoorFromIindex(icoor,s);
-     assert((icoor[direction]==0)||(icoor[direction]==1));
-
-     int permute_lane;
-     if ( distance == 1) {
-       permute_lane = icoor[direction]?1:0;
-     } else {
-       permute_lane = icoor[direction]?0:1;
-     }
-     if(perm) permute_lane = !permute_lane;                                                                                                                                                                                                
-
-     if ( permute_lane ) {
-       stmp(0) = vals[s](1);
-       stmp(1) = vals[s](0);
-       vals[s] = stmp;
-     }
-   }
-   merge(into,vals);
-#else
-   int permute_type = grid->PermuteType(direction);
-   typedef typename SiteSpinorType::vector_type vtype;
-   vtype tmp1, tmp2, tmp3, tmp4;
-
-   for(int s=0;s<Ns;s++){
-     for(int c=0;c<Nrepresentation;c++){
-       permute(tmp1, from(1)(s)(c), permute_type); //1l 1h -> 1h 1l
-       exchange(tmp2,tmp3, from(0)(s)(c), tmp1, permute_type); // 0l 0h , 1h 1l -> 0l 1h 0h,1l
-       permute(tmp4, tmp3, permute_type); //0h,1l -> 1l,0h
-              
-       if( (distance == 1 && !perm) || (distance == -1 && perm) ){
-	 //Pulled fermion through forwards face, GPBC on upper component
-	 //Need 0= 0l 1h   1= 1l 0h
-	 into(0)(s)(c) = tmp2;
-	 into(1)(s)(c) = tmp4;
-       }else if( (distance == -1 && !perm) || (distance == 1 && perm) ){
-	 //Pulled fermion through backwards face, GPBC on lower component
-	 //Need 0= 1l 0h   1= 0l 1h	 
-	 into(0)(s)(c) = tmp4;
-	 into(1)(s)(c) = tmp2;
-       }else assert(0);
-     }
-   }
-#endif
- }
-
-
- template<typename SiteSpinorType>
- const SiteSpinorType & GparityGetChi(int &g, SiteSpinorType const* in, const int dir, const int f, StencilEntry *SE, StencilImpl &st){
-   const int mmu = dir % 4;
-   const int direction = st._directions[dir];
-   const int sl = st._grid->_simd_layout[direction];
-   const int perm = SE->_permute;
-   g = f;
-   
-   if(SE->_around_the_world && Params.twists[mmu]){
-     if(sl == 1){ //not SIMD vectorized in G-parity direction so just change the flavor index accessed to implement the twist
-       g = (f+1) % 2;
-       return in[SE->_offset];
-     }else{ //SIMD vectorized in Gparity direction
-       const int me = omp_get_thread_num();
-       const int distance = st._distances[dir];
-       assert(distance == -1 || distance == 1);
-       SiteSpinorType &tmp = GparityWilsonImpl_helper::getAB<SiteSpinorType, SiteSpinor, SiteHalfSpinor>::ref(tmp_full[me], tmp_half[me]);
-       GparityTwistPermute<SiteSpinorType>(tmp, in[SE->_offset], direction, distance, perm, st._grid);
-       return tmp;
-     }
-   }else return in[SE->_offset];
- }
-
-
- 
  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
  {
    conformable(Uds._grid,GaugeGrid);
diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index ea04845e..e1243304 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -59,14 +59,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
 
 
-//G-parity implementations using implementation method
-#define LOAD_CHIMU_GPARITY_IMPL(DIR,F)					\
-  { int g; const SiteSpinor & ref = GparityGetChi<SiteSpinor>(g,in._odata.data(),DIR,F,SE,st); LOAD_CHIMU_BODY(g); }
-
-#define LOAD_CHI_GPARITY_IMPL(DIR,F)					\
-  { int g; const SiteHalfSpinor &ref = GparityGetChi<SiteHalfSpinor>(g,buf,DIR,F,SE,st); LOAD_CHI_BODY(g); }
-
-
 //G-parity implementations using in-place intrinsic ops
 
 //1l 1h -> 1h 1l
@@ -156,13 +148,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   }
 
 
-
-
-
-//#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_IMPL(DIR,F)
 #define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F)
-
-//#define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_IMPL(DIR,F)
 #define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F)
 
 // To splat or not to splat depends on the implementation

From edabb3577ff13df048fccb8e9fb17080531e32df Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Wed, 23 Aug 2017 16:54:06 -0400
Subject: [PATCH 09/14] Imported Benchmark_gparity

---
 benchmarks/Benchmark_gparity.cc | 190 ++++++++++++++++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 benchmarks/Benchmark_gparity.cc

diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc
new file mode 100644
index 00000000..f6036aa8
--- /dev/null
+++ b/benchmarks/Benchmark_gparity.cc
@@ -0,0 +1,190 @@
+#include <Grid/Grid.h>
+#include <sstream>
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
+typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
+
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int Ls=16;
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "Ls = " << Ls << std::endl;
+
+  std::vector<int> latt4 = GridDefaultLatt();
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  GparityLatticeFermionF src   (FGrid); random(RNG5,src);
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+
+  GparityLatticeFermionF result(FGrid); result=zero;
+  GparityLatticeFermionF    ref(FGrid);    ref=zero;
+  GparityLatticeFermionF    tmp(FGrid);
+  GparityLatticeFermionF    err(FGrid);
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  LatticeGaugeFieldF Umu(UGrid); 
+  SU3::HotConfiguration(RNG4,Umu); 
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
+#ifdef GRID_OMP
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+#endif
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+
+
+  std::cout << GridLogMessage<< "* SINGLE/SINGLE"<<std::endl;
+  GparityDomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  int ncall =1000;
+  if (1) {
+    FGrid->Barrier();
+    Dw.ZeroCounters();
+    Dw.Dhop(src,result,0);
+    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      Dw.Dhop(src,result,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=2*1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    Dw.Report();
+  }
+
+  std::cout << GridLogMessage<< "* SINGLE/HALF"<<std::endl;
+  GparityDomainWallFermionFH DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  if (1) {
+    FGrid->Barrier();
+    DwH.ZeroCounters();
+    DwH.Dhop(src,result,0);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      DwH.Dhop(src,result,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=2*1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    DwH.Report();
+  }
+
+  GridCartesian         * UGrid_d   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
+  GridCartesian         * FGrid_d   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_d);
+  GridRedBlackCartesian * FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_d);
+
+  
+  std::cout << GridLogMessage<< "* DOUBLE/DOUBLE"<<std::endl;
+  GparityLatticeFermionD src_d(FGrid_d);
+  precisionChange(src_d,src);
+
+  LatticeGaugeFieldD Umu_d(UGrid_d); 
+  precisionChange(Umu_d,Umu);
+
+  GparityLatticeFermionD result_d(FGrid_d);
+
+  GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
+  if (1) {
+    FGrid_d->Barrier();
+    DwD.ZeroCounters();
+    DwD.Dhop(src_d,result_d,0);
+    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      DwD.Dhop(src_d,result_d,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    FGrid_d->Barrier();
+    
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=2*1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
+    DwD.Report();
+  }
+
+  Grid_finalize();
+}
+

From 34a9aeb331b06e4a3fc8562afbe518039af55008 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Thu, 24 Aug 2017 13:53:50 -0700
Subject: [PATCH 10/14] Reduced number of if-statement evaluations in G-parity
 unrolled kernel

---
 lib/qcd/action/fermion/WilsonKernelsHand.cc | 62 +++++++++++++++------
 1 file changed, 44 insertions(+), 18 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index e1243304..ca93916b 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -115,18 +115,34 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
       LOAD_CHIMU_BODY(g);						\
     }else{								\
       const int permute_type = st._grid->PermuteType(direction);	\
-      DO_TWIST(Chimu_00,0,0,F,  U_00,U_01,U_10);			\
-      DO_TWIST(Chimu_01,0,1,F,  U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_02,0,2,F,  U_00,U_01,U_10);			\
-      DO_TWIST(Chimu_10,1,0,F,  U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_11,1,1,F,  U_00,U_01,U_10);			\
-      DO_TWIST(Chimu_12,1,2,F,  U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_20,2,0,F,  U_00,U_01,U_10);			\
-      DO_TWIST(Chimu_21,2,1,F,  U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_22,2,2,F,  U_00,U_01,U_10);			\
-      DO_TWIST(Chimu_30,3,0,F,  U_11,U_20,U_21);			\
-      DO_TWIST(Chimu_31,3,1,F,  U_00,U_01,U_10);			\
-      DO_TWIST(Chimu_32,3,2,F,  U_11,U_20,U_21);			\
+      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
+	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
+	DO_TWIST_0L_1H(Chimu_00,0,0,F,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_01,0,1,F,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_02,0,2,F,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_10,1,0,F,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_11,1,1,F,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_12,1,2,F,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_20,2,0,F,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_21,2,1,F,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_22,2,2,F,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_30,3,0,F,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_31,3,1,F,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_32,3,2,F,  U_11,U_20,U_21);		\
+      }else{								\
+	DO_TWIST_1L_0H(Chimu_00,0,0,F,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_01,0,1,F,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_02,0,2,F,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_10,1,0,F,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_11,1,1,F,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_12,1,2,F,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_20,2,0,F,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_21,2,1,F,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_22,2,2,F,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_30,3,0,F,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_31,3,1,F,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_32,3,2,F,  U_11,U_20,U_21);		\
+      } \
     } \
   }
 
@@ -138,12 +154,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
       LOAD_CHI_BODY(g);							\
     }else{								\
       const int permute_type = st._grid->PermuteType(direction);	\
-      DO_TWIST(Chi_00,0,0,F,  U_00,U_01,U_10);				\
-      DO_TWIST(Chi_01,0,1,F,  U_11,U_20,U_21);				\
-      DO_TWIST(Chi_02,0,2,F,  UChi_00,UChi_01,UChi_02);			\
-      DO_TWIST(Chi_10,1,0,F,  UChi_10,UChi_11,UChi_12);			\
-      DO_TWIST(Chi_11,1,1,F,  U_00,U_01,U_10);				\
-      DO_TWIST(Chi_12,1,2,F,  U_11,U_20,U_21);				\
+      if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
+	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
+	DO_TWIST_0L_1H(Chi_00,0,0,F,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_01,0,1,F,  U_11,U_20,U_21);			\
+	DO_TWIST_0L_1H(Chi_02,0,2,F,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_0L_1H(Chi_10,1,0,F,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_0L_1H(Chi_11,1,1,F,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_12,1,2,F,  U_11,U_20,U_21);			\
+      }else{								\
+	DO_TWIST_1L_0H(Chi_00,0,0,F,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_01,0,1,F,  U_11,U_20,U_21);			\
+	DO_TWIST_1L_0H(Chi_02,0,2,F,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_1L_0H(Chi_10,1,0,F,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_1L_0H(Chi_11,1,1,F,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_12,1,2,F,  U_11,U_20,U_21);			\
+      }									\
     }									\
   }
 

From f365a83fae6279ca1c52d90d4d266fc39f46335f Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Fri, 25 Aug 2017 14:24:11 -0400
Subject: [PATCH 11/14] In G-parity unrolled kernel, replaced calls to permute
 and exchange with run-time-evaluated permute type with explicit calls to
 appropriate underlying functions

---
 lib/qcd/action/fermion/WilsonKernelsHand.cc | 138 +++++++++-----------
 lib/simd/Grid_vector_types.h                |  13 +-
 2 files changed, 77 insertions(+), 74 deletions(-)

diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index ca93916b..a0f5ffec 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   Chimu_31=ref(F)(3)(1);			\
   Chimu_32=ref(F)(3)(2)
 
-#define LOAD_CHIMU(DIR,F)						\
+#define LOAD_CHIMU(DIR,F,PERM)						\
   { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
 
 #define LOAD_CHI_BODY(F)				\
@@ -55,7 +55,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     Chi_11 = ref(F)(1)(1);\
     Chi_12 = ref(F)(1)(2)
 
-#define LOAD_CHI(DIR,F)						\
+#define LOAD_CHI(DIR,F,PERM)					\
   {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
 
 
@@ -73,33 +73,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 //1l 1h -> 1h 1l
 //0l 0h , 1h 1l -> 0l 1h 0h,1l
-#define DO_TWIST_0L_1H(INTO,S,C,F, tmp1, tmp2, tmp3) \
-  permute(tmp1, ref(1)(S)(C), permute_type);				\
-  exchange(tmp2,tmp3, ref(0)(S)(C), tmp1, permute_type);		\
+#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
+  permute##PERM(tmp1, ref(1)(S)(C));				\
+  exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\
   INTO = tmp2;
 
 //0l 0h -> 0h 0l
 //1l 1h, 0h 0l -> 1l 0h, 1h 0l
-#define DO_TWIST_1L_0H(INTO,S,C,F, tmp1, tmp2, tmp3)			\
-  permute(tmp1, ref(0)(S)(C), permute_type);				\
-  exchange(tmp2,tmp3, ref(1)(S)(C), tmp1, permute_type);		\
+#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\
+  permute##PERM(tmp1, ref(0)(S)(C));				\
+  exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\
   INTO = tmp2;
 
-#define DO_TWIST(INTO,S,C,F, tmp1, tmp2, tmp3)				\
-  if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
-       ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-    DO_TWIST_0L_1H(INTO,S,C,F,tmp1,tmp2,tmp3);				\
-  }else{								\
-    DO_TWIST_1L_0H(INTO,S,C,F,tmp1,tmp2,tmp3);				\
-  }
+
 
 
 #define LOAD_CHI_SETUP(DIR,F)						\
-  int g = F;								\
-  const int direction = st._directions[DIR];				\
-  const int distance = st._distances[DIR];				\
-  const int sl = st._grid->_simd_layout[direction];			\
-  int inplace_twist = 0;						\
+  g = F;								\
+  direction = st._directions[DIR];				\
+  distance = st._distances[DIR];				\
+  sl = st._grid->_simd_layout[direction];			\
+  inplace_twist = 0;						\
   if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\
     if(sl == 1){							\
       g = (F+1) % 2;							\
@@ -108,74 +102,72 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     }									\
   }  
 
-#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F)				\
+#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\
   { const SiteSpinor &ref(in._odata[offset]);				\
     LOAD_CHI_SETUP(DIR,F);						\
     if(!inplace_twist){							\
       LOAD_CHIMU_BODY(g);						\
     }else{								\
-      const int permute_type = st._grid->PermuteType(direction);	\
       if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chimu_00,0,0,F,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_01,0,1,F,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_02,0,2,F,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_10,1,0,F,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_11,1,1,F,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_12,1,2,F,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_20,2,0,F,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_21,2,1,F,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_22,2,2,F,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_30,3,0,F,  U_11,U_20,U_21);		\
-	DO_TWIST_0L_1H(Chimu_31,3,1,F,  U_00,U_01,U_10);		\
-	DO_TWIST_0L_1H(Chimu_32,3,2,F,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
       }else{								\
-	DO_TWIST_1L_0H(Chimu_00,0,0,F,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_01,0,1,F,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_02,0,2,F,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_10,1,0,F,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_11,1,1,F,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_12,1,2,F,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_20,2,0,F,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_21,2,1,F,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_22,2,2,F,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_30,3,0,F,  U_11,U_20,U_21);		\
-	DO_TWIST_1L_0H(Chimu_31,3,1,F,  U_00,U_01,U_10);		\
-	DO_TWIST_1L_0H(Chimu_32,3,2,F,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\
+	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\
+	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\
       } \
     } \
   }
 
 
-#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F)				\
+#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\
   { const SiteHalfSpinor &ref(buf[offset]);				\
     LOAD_CHI_SETUP(DIR,F);						\
     if(!inplace_twist){							\
       LOAD_CHI_BODY(g);							\
     }else{								\
-      const int permute_type = st._grid->PermuteType(direction);	\
       if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
 	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
-	DO_TWIST_0L_1H(Chi_00,0,0,F,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_01,0,1,F,  U_11,U_20,U_21);			\
-	DO_TWIST_0L_1H(Chi_02,0,2,F,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_0L_1H(Chi_10,1,0,F,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_0L_1H(Chi_11,1,1,F,  U_00,U_01,U_10);			\
-	DO_TWIST_0L_1H(Chi_12,1,2,F,  U_11,U_20,U_21);			\
+	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
+	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
       }else{								\
-	DO_TWIST_1L_0H(Chi_00,0,0,F,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_01,0,1,F,  U_11,U_20,U_21);			\
-	DO_TWIST_1L_0H(Chi_02,0,2,F,  UChi_00,UChi_01,UChi_02);		\
-	DO_TWIST_1L_0H(Chi_10,1,0,F,  UChi_10,UChi_11,UChi_12);		\
-	DO_TWIST_1L_0H(Chi_11,1,1,F,  U_00,U_01,U_10);			\
-	DO_TWIST_1L_0H(Chi_12,1,2,F,  U_11,U_20,U_21);			\
+	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\
+	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\
+	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\
+	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\
+	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\
       }									\
     }									\
   }
 
 
-#define LOAD_CHI_GPARITY(DIR,F) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F)
-#define LOAD_CHIMU_GPARITY(DIR,F) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F)
+#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
+#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
 
 // To splat or not to splat depends on the implementation
 #define MULT_2SPIN_BODY \
@@ -442,13 +434,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   local  = SE->_is_local;			\
   perm   = SE->_permute;			\
   if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F);			\
+    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
     PROJ;					\
     if ( perm) {				\
       PERMUTE_DIR(PERM);			\
     }						\
   } else {					\
-    LOAD_CHI_IMPL(DIR,F);			\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
   }						\
   MULT_2SPIN_IMPL(DIR,F);			\
   RECON;					
@@ -460,13 +452,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   local  = SE->_is_local;			\
   perm   = SE->_permute;			\
   if ( local ) {				\
-    LOAD_CHIMU_IMPL(DIR,F);			\
+    LOAD_CHIMU_IMPL(DIR,F,PERM);			\
     PROJ;					\
     if ( perm) {				\
       PERMUTE_DIR(PERM);			\
     }						\
   } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI_IMPL(DIR,F);			\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
   }						\
   if (local || st.same_node[DIR] ) {		\
     MULT_2SPIN_IMPL(DIR,F);			\
@@ -477,7 +469,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   SE=st.GetEntry(ptype,DIR,ss);			\
   offset = SE->_offset;				\
   if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI_IMPL(DIR,F);			\
+    LOAD_CHI_IMPL(DIR,F,PERM);			\
     MULT_2SPIN_IMPL(DIR,F);			\
     RECON;					\
     nmu++;					\
@@ -811,7 +803,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 									\
     HAND_DECLARATIONS(ignore);						\
 									\
-    int offset,local,perm, ptype;					\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
     StencilEntry *SE;							\
     HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
     HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
@@ -828,7 +820,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
     HAND_DECLARATIONS(ignore);						\
 									\
     StencilEntry *SE;							\
-    int offset,local,perm, ptype;					\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
     HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
     HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   }									\
@@ -843,7 +835,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 									\
     HAND_DECLARATIONS(ignore);						\
 									\
-    int offset,local,perm, ptype;					\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\
     StencilEntry *SE;							\
     HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
     HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
@@ -860,7 +852,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
     HAND_DECLARATIONS(ignore);						\
 									\
     StencilEntry *SE;							\
-    int offset,local,perm, ptype;					\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
     HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
     HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
   }									\
@@ -875,7 +867,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
 									\
     HAND_DECLARATIONS(ignore);						\
 									\
-    int offset,local,perm, ptype;					\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
     StencilEntry *SE;							\
     int nmu=0;								\
     HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
@@ -893,7 +885,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
     HAND_DECLARATIONS(ignore);						\
 									\
     StencilEntry *SE;							\
-    int offset,local,perm, ptype;					\
+    int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
     int nmu=0;								\
     HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
     nmu = 0;								\
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 27585547..c67e74cb 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -376,7 +376,18 @@ class Grid_simd {
       Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
     }
   }
-
+  friend inline void exchange0(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){    
+    Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
+  }
+  friend inline void exchange1(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){    
+    Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
+  }
+  friend inline void exchange2(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){    
+    Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
+  }
+  friend inline void exchange3(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2){    
+    Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
+  }
   ////////////////////////////////////////////////////////////////////
   // General permute; assumes vector length is same across
   // all subtypes; may not be a good assumption, but could

From d36d2fb40d1e2871db5f7be4d79e58811037ad5d Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Mon, 28 Aug 2017 06:53:56 -0700
Subject: [PATCH 12/14] Added ability to override default Ls in Benchmark_dwf

---
 benchmarks/Benchmark_dwf.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc
index 7814ec7d..1f9c7624 100644
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -51,7 +51,13 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  int Ls=16;
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);

From 74af885d4eda81453b1eb83b062a11810011d5f5 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Tue, 29 Aug 2017 09:50:37 -0400
Subject: [PATCH 13/14] Removed some no-longer-needed associated with G-parity
 hand unrolled kernel

---
 lib/qcd/action/fermion/FermionOperatorImpl.h | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h
index ffb82989..9d24deb2 100644
--- a/lib/qcd/action/fermion/FermionOperatorImpl.h
+++ b/lib/qcd/action/fermion/FermionOperatorImpl.h
@@ -425,22 +425,6 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
     ////////////////////////////////////////////////////////////////////////////////////////
     // Flavour doubled spinors; is Gparity the only? what about C*?
     ////////////////////////////////////////////////////////////////////////////////////////
-namespace GparityWilsonImpl_helper{
-  template<typename Get, typename A,typename B>
-  struct getAB;
-  
-  template<typename A,typename B>
-  struct getAB<A,A,B>{
-    static inline A & ref(A &a, B &b){ return a; }
-  };
-  template<typename A,typename B>
-  struct getAB<B,A,B>{
-    static inline B & ref(A &a, B &b){ return b; }
-  };
-};
-  
-
-  
 template <class S, int Nrepresentation, class Options=CoeffReal>
 class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
  public:

From 59bd1fe21b3fc81b8c50e9caa53f2adc65b3d7b5 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <ckelly@phys.columbia.edu>
Date: Tue, 29 Aug 2017 13:07:37 -0700
Subject: [PATCH 14/14] Fix for 'perm' and 'local' not being set for
 hand-unrolled external-site Dslash, which caused incorrect behavior of
 G-parity kernel

---
 lib/qcd/action/fermion/WilsonKernelsHand.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc
index a0f5ffec..80b81714 100644
--- a/lib/qcd/action/fermion/WilsonKernelsHand.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc
@@ -468,6 +468,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\
   SE=st.GetEntry(ptype,DIR,ss);			\
   offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
   if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
     LOAD_CHI_IMPL(DIR,F,PERM);			\
     MULT_2SPIN_IMPL(DIR,F);			\