Merge branch 'develop' of github.com:paboyle/Grid into feature/baryonSpeedup

2026-07-18 16:13:28 +01:00 · 2020-05-07 11:13:21 +01:00
parent 56e2f7d088 525418abfb
commit 591ebb6213
192 changed files with 2866 additions and 6632 deletions
@@ -22,8 +22,18 @@
 #undef __CUDACC__
 #undef __CUDA_ARCH__
 #define __NVCC__REDEFINE__
 #endif 
 /* SYCL save and restore compile environment.
  * While the Eigen headers are parsed the device-only compile mode is hidden
  * and Eigen vectorisation is switched off.  __SYCL__REDEFINE__ records that
  * the state was pushed here, so that the "SYCL restore" block after the
  * Eigen includes (which tests this macro) pops it again. */
 #ifdef __SYCL_DEVICE_ONLY__  
 #pragma push
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #undef EIGEN_USE_SYCL
 #define EIGEN_DONT_VECTORIZE
 #define __SYCL__REDEFINE__
 #endif
 #include <Grid/Eigen/Dense>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -35,7 +45,14 @@
 #pragma pop
 #endif
 /*SYCL restore*/
 #ifdef __SYCL__REDEFINE__
 #pragma pop_macro("__SYCL_DEVICE_ONLY__")
 #pragma pop
 #endif
 #if defined __GNUC__
 #pragma GCC diagnostic pop
 #endif
@@ -37,211 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
 NAMESPACE_BEGIN(Grid); 
  ////////////////////////////////////////////////////////
  // Move following 100 LOC to lattice/Lattice_basis.h
  ////////////////////////////////////////////////////////
 // Remove from w its components along basis[0] .. basis[k-1], one vector at a
 // time: for each j the overlap innerProduct(basis[j],w) is taken against the
 // current (already partially projected) w and then subtracted.
 //
 // Performance note kept from the original author: if the basis[j] can be
 // assumed orthonormal, all inner products could be taken in parallel
 // (saving 2x bandwidth), with a further 3x bandwidth saving on the update
 // line; perhaps 2.5x speed up, 2x overall in Multigrid Lanczos.
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
 {
  int j = 0;
  while (j < k) {
    Field &b_j = basis[j];
    auto overlap = innerProduct(b_j,w);
    w = w - overlap*b_j;
    ++j;
  }
 }
 // Rotate a range of basis vectors in place:
 //   basis[j] <- sum over k in [k0,k1) of Qt(j,k) * basis[k],   for j in [j0,j1)
 // basis : vectors to rotate; entries j0..j1-1 are overwritten
 // Qt    : coefficient matrix, read as Qt(j,k) for j,k < Nm
 // Nm    : row length used for the flattened copy of Qt
 // An empty (or inverted) row range j1 <= j0 leaves the basis untouched.
 template<class Field>
 void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
 {
  typedef decltype(basis[0].View()) View;
  // tmp_v is only the fill value for the view table; each slot is overwritten below.
  auto tmp_v = basis[0].View();
  Vector<View> basis_v(basis.size(),tmp_v);
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].View();
  }
 #if 0
  // Disabled variant using a per-thread scratch array; kept for reference.
  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
  thread_region
  {
    vobj* B = Bt.data() + Nm * thread_num();
    thread_for_in_region(ss, grid->oSites(),{
      for(int j=j0; j<j1; ++j) B[j]=0.;
      for(int j=j0; j<j1; ++j){
 	for(int k=k0; k<k1; ++k){
 	  B[j] +=Qt(j,k) * basis_v[k][ss];
 	}
      }
      for(int j=j0; j<j1; ++j){
 	basis_v[j][ss] = B[j];
      }
    });
  }
 #else
  int nrot = j1-j0;
  // Nothing to rotate for an empty row range.  Without this guard nrot==0
  // divides by zero in the siteBlock computation just below.
  if (nrot <= 0)
    return;
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
  //  printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
  // Scratch accumulators: one entry per (site in block, rotated row).
  Vector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];
  // GPU readable copy of Eigen matrix, flattened as Qt_p[j*Nm+k] = Qt(j,k)
  Vector<double> Qt_jv(Nm*Nm);
  double *Qt_p = & Qt_jv[0];
  for(int k=0;k<Nm;++k){
    for(int j=0;j<Nm;++j){
      Qt_p[j*Nm+k]=Qt(j,k);
    }
  }
  // Block the loop to keep storage footprint down
  vobj zz=Zero();
  for(uint64_t s=0;s<oSites;s+=siteBlock){
    // remaining work in this block
    int ssites=MIN(siteBlock,oSites-s);
    // zero out the accumulators
    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
 	auto z=coalescedRead(zz);
 	coalescedWrite(Bp[ss],z);
    });
    // accumulate every rotated row of this block from the not-yet-modified basis
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
      int j =sj%nrot;   // row offset inside [j0,j1)
      int jj  =j0+j;    // row of Qt / index of the output vector
      int ss =sj/nrot;  // site inside the block
      int sss=ss+s;     // site inside the lattice
      for(int k=k0; k<k1; ++k){
 	auto tmp = coalescedRead(Bp[ss*nrot+j]);
 	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
      }
    });
    // write the accumulated rows back over basis[j0..j1-1]
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
      int j =sj%nrot;
      int jj  =j0+j;
      int ss =sj/nrot;
      int sss=ss+s;
      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
    });
  }
 #endif
 }
 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
  typedef decltype(basis[0].View()) View;
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
  result.Checkerboard() = basis[0].Checkerboard();
  auto result_v=result.View();
  Vector<View> basis_v(basis.size(),result_v);
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].View();
  }
  vobj zz=Zero();
  Vector<double> Qt_jv(Nm);
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    auto B=coalescedRead(zz);
    for(int k=k0; k<k1; ++k){
      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
    }
    coalescedWrite(result_v[ss], B);
  });
 }
 template<class Field>
 void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
 {
  int vlen = idx.size();
  assert(vlen>=1);
  assert(vlen<=sort_vals.size());
  assert(vlen<=_v.size());
  for (size_t i=0;i<vlen;i++) {
    if (idx[i] != i) {
      //////////////////////////////////////
      // idx[i] is a table of desired sources giving a permutation.
      // Swap v[i] with v[idx[i]].
      // Find  j>i for which _vnew[j] = _vold[i],
      // track the move idx[j] => idx[i]
      // track the move idx[i] => i
      //////////////////////////////////////
      size_t j;
      for (j=i;j<idx.size();j++)
 	if (idx[j]==i)
 	  break;
      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
      swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
      std::swap(sort_vals[i],sort_vals[idx[i]]);
      idx[j] = idx[i];
      idx[i] = i;
    }
  }
 }
 // Return the indices 0..sort_vals.size()-1 ordered by increasing |sort_vals[i]|.
 // sort_vals itself is not modified.
 inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
 {
  std::vector<int> order(sort_vals.size());
  std::iota(order.begin(), order.end(), 0);

  // compare two positions by the magnitude of the values they refer to
  auto smallerMagnitude = [&sort_vals](int lhs, int rhs) {
    return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
  };
  std::sort(order.begin(), order.end(), smallerMagnitude);

  return order;
 }
 // Sort _v and sort_vals together by |sort_vals|: ascending order from
 // basisSortGetIndex, or that order reversed when `reverse` is set.
 template<class Field>
 void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
 {
  std::vector<int> order = basisSortGetIndex(sort_vals);
  if (reverse) {
    std::reverse(order.begin(), order.end());
  }
  basisReorderInPlace(_v,sort_vals,order);
 }
 // Accumulate into result the sum over i of
 //   ( innerProduct(_v[i],src_orig) / eval[i] ) * _v[i].
 // _v       : basis vectors (read only)
 // eval     : one divisor per basis vector; must have the same length as _v
 // src_orig : source field
 // result   : overwritten with the accumulated sum (zero when _v is empty)
 //
 // PAB: faster to compute the inner products first then fuse loops.
 // If performance critical can improve.
 template<class Field>
 void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
  result = Zero();
  assert(_v.size()==eval.size());
  int N = (int)_v.size();
  for (int i=0;i<N;i++) {
    // _v is a const vector, so its elements can only be bound by const reference
    const Field& tmp = _v[i];
    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
  }
 }
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
@@ -114,6 +114,7 @@ public:
  void GlobalSumVector(RealD *,int N);
  void GlobalSum(uint32_t &);
  void GlobalSum(uint64_t &);
  void GlobalSumVector(uint64_t*,int N);
  void GlobalSum(ComplexF &c);
  void GlobalSumVector(ComplexF *c,int N);
  void GlobalSum(ComplexD &c);
@@ -255,6 +255,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 // In-place MPI_SUM all-reduce of the N-entry uint64_t array u over `communicator`.
 void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
  const int status=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
  assert(status==0);
 }
 void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
@@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
 void CartesianCommunicator::GlobalSumVector(double *,int N){}
 void CartesianCommunicator::GlobalSum(uint32_t &){}
 void CartesianCommunicator::GlobalSum(uint64_t &){}
-void CartesianCommunicator::GlobalSumVector(double *,int N){}
+void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
@@ -31,11 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_ET.h>
 #include <Grid/lattice/Lattice_arith.h>
 #include <Grid/lattice/Lattice_trace.h>
-#include <Grid/lattice/Lattice_transpose.h>
+//#include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
-#include <Grid/lattice/Lattice_reality.h>
+//#include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_comparison_utils.h>
 #include <Grid/lattice/Lattice_comparison.h>
 #include <Grid/lattice/Lattice_coordinate.h>
@@ -43,4 +43,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_rng.h>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
-
+#include <Grid/lattice/Lattice_basis.h>
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: neo <cossu@post.kek.jp>
 Author: Christoph Lehner <christoph@lhnr.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -94,7 +95,7 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
 template <class lobj> accelerator_inline 
 const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
 {
-  auto view = arg.View();
+  auto view = arg.AcceleratorView(ViewRead);
  return view[ss];
 }
@@ -7,6 +7,7 @@
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <christoph@lhnr.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
 template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
-  auto rhs_v = rhs.View();
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  conformable(ret,rhs);
  conformable(lhs,rhs);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -55,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
-  auto rhs_v = rhs.View();
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
-  auto rhs_v = rhs.View();
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
-  auto rhs_v = rhs.View();
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    mult(&tmp,&lhs_v(ss),&rhs);
@@ -120,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto lhs_v = lhs.View();
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
@@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
 void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto rhs_v = lhs.View();
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -178,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
 void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto rhs_v = lhs.View();
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
 void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto rhs_v = lhs.View();
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
 void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto rhs_v = lhs.View();
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
@@ -220,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto x_v = x.View();
+  auto x_v = x.AcceleratorView(ViewRead);
-  auto y_v = y.View();
+  auto y_v = y.AcceleratorView(ViewRead);
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
    auto tmp = a*x_v(ss)+y_v(ss);
    coalescedWrite(ret_v[ss],tmp);
@@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
-  auto ret_v = ret.View();
+  auto ret_v = ret.AcceleratorView(ViewWrite);
-  auto x_v = x.View();
+  auto x_v = x.AcceleratorView(ViewRead);
-  auto y_v = y.View();
+  auto y_v = y.AcceleratorView(ViewRead);
  accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
    auto tmp = a*x_v(ss)+b*y_v(ss);
    coalescedWrite(ret_v[ss],tmp);
@@ -9,6 +9,7 @@ Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <christoph@lhnr.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -49,6 +50,26 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
  assert(lhs == rhs);
 }
 ////////////////////////////////////////////////////////////////////////////
 // Advise the LatticeAccelerator class
 ////////////////////////////////////////////////////////////////////////////
 // Bit flags accepted by Advise(); they may be OR-ed together.
 enum LatticeAcceleratorAdvise {
  // Advise that the data is used infrequently.  This can significantly
  // influence performance of bulk storage.
  AdviseInfrequentUse = 1 << 0,
  // Data will mostly be read.  On some architectures enables read-only
  // copies of memory to be kept on host and device.
  AdviseReadMostly    = 1 << 1,
 };
 ////////////////////////////////////////////////////////////////////////////
 // View Access Mode
 ////////////////////////////////////////////////////////////////////////////
 // Access mode requested when a view is opened: read bit, write bit, or both.
 enum ViewMode {
  ViewRead      = 1 << 0,
  ViewWrite     = 1 << 1,
  ViewReadWrite = ViewRead | ViewWrite
 };
 ////////////////////////////////////////////////////////////////////////////
 // Minimal base class containing only data valid to access from accelerator
 // _odata will be a managed pointer in CUDA
@@ -75,6 +96,37 @@ public:
    if (grid) conformable(grid, _grid);
    else      grid = _grid;
  };
  // Forward memory-usage hints for the _odata buffer to the CUDA runtime.
  // advise is a bitmask of LatticeAcceleratorAdvise flags.  The body is empty
  // unless GRID_NVCC is defined and this is the host compilation pass.
  accelerator_inline void Advise(int advise) {
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
    const bool infrequentUse = (advise & AdviseInfrequentUse) != 0;
    const bool readMostly    = (advise & AdviseReadMostly) != 0;
    if (infrequentUse) {
      // preferred location: the CPU
      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
    }
    if (readMostly) {
      cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
    }
 #endif
 #endif
  };
  // Ask the CUDA runtime to prefetch the whole _odata buffer to the device
  // reported by cudaGetDevice.  Only active when GRID_NVCC is defined and on
  // the host compilation pass; otherwise the body is empty.
  accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
    int device;
    cudaGetDevice(&device);
    cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),device);
 #endif
 #endif
  };
  // Ask the CUDA runtime to prefetch the whole _odata buffer to the CPU
  // (cudaCpuDeviceId).  Only active when GRID_NVCC is defined and on the host
  // compilation pass; otherwise the body is empty.
  accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
 #ifdef GRID_NVCC
 #ifndef __CUDA_ARCH__ // only on host
    cudaMemPrefetchAsync(
        _odata,
        _odata_size*sizeof(vobj),
        cudaCpuDeviceId);
 #endif
 #endif
  };
 };
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -206,9 +258,23 @@ public:
  // The view is trivially copy constructible and may be copied to an accelerator device
  // in device lambdas
  /////////////////////////////////////////////////////////////////////////////////
-  LatticeView<vobj> View (void) const 
+  LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
  {                                   //                     and HostView        for thread_for
    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
    return accessor;
  }
  // Return a view of this lattice after calling AcceleratorPrefetch(mode) on it.
  // mode is a ViewMode value and defaults to ViewReadWrite.
  LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const 
  {
    // same cast as View(): the view is built from the accelerator base part of *this
    LatticeAccelerator<vobj> *self = (LatticeAccelerator<vobj> *) this;
    LatticeView<vobj> view(*self);
    view.AcceleratorPrefetch(mode);
    return view;
  }
  // Return a view of this lattice after calling HostPrefetch(mode) on it.
  // mode is a ViewMode value and defaults to ViewReadWrite.
  LatticeView<vobj> HostView(int mode = ViewReadWrite) const 
  {
    // same cast as View(): the view is built from the accelerator base part of *this
    LatticeAccelerator<vobj> *self = (LatticeAccelerator<vobj> *) this;
    LatticeView<vobj> view(*self);
    view.HostPrefetch(mode);
    return view;
  }
@@ -232,7 +298,7 @@ public:
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,expr);
      vstream(me[ss],tmp);
@@ -251,7 +317,7 @@ public:
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,expr);
      vstream(me[ss],tmp);
@@ -269,7 +335,7 @@ public:
    CBFromExpression(cb,expr);
    assert( (cb==Odd) || (cb==Even));
    this->checkerboard=cb;
-    auto me  = View();
+    auto me  = AcceleratorView(ViewWrite);
    accelerator_for(ss,me.size(),1,{
      auto tmp = eval(ss,expr);
      vstream(me[ss],tmp);
@@ -357,7 +423,6 @@ public:
  // copy constructor
  ///////////////////////////////////////////
  Lattice(const Lattice& r){ 
    //    std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl; 
    this->_grid = r.Grid();
    resize(this->_grid->oSites());
    *this = r;
@@ -380,8 +445,8 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
-    auto me =   View();
+    auto me =   AcceleratorView(ViewWrite);
-    auto him= r.View();
+    auto him= r.AcceleratorView(ViewRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -394,8 +459,8 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
-    auto me =   View();
+    auto me =   AcceleratorView(ViewWrite);
-    auto him= r.View();
+    auto him= r.AcceleratorView(ViewRead);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@@ -0,0 +1,236 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/lattice/Lattice_basis.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <christoph@lhnr.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Remove from w its components along basis[0] .. basis[k-1], one vector at a
 // time: for each j the overlap innerProduct(basis[j],w) is taken against the
 // current (already partially projected) w and then subtracted.
 //
 // Performance note kept from the original author: if the basis[j] can be
 // assumed orthonormal, all inner products could be taken in parallel
 // (saving 2x bandwidth), with a further 3x bandwidth saving on the update
 // line; perhaps 2.5x speed up, 2x overall in Multigrid Lanczos.
 template<class Field>
 void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
 {
  int j = 0;
  while (j < k) {
    Field &b_j = basis[j];
    auto overlap = innerProduct(b_j,w);
    w = w - overlap*b_j;
    ++j;
  }
 }
 // Rotate a range of basis vectors in place:
 //   basis[j] <- sum over k in [k0,k1) of Qt(j,k) * basis[k],   for j in [j0,j1)
 // basis : indexable container of lattice fields; entries j0..j1-1 are overwritten
 // Qt    : coefficient matrix, read as Qt(j,k)
 // Nm    : length of the per-site scratch vector on the non-GRID_NVCC path
 //         (indexed by j, so j1 must not exceed Nm there) and row length of
 //         the flattened copy of Qt on the GRID_NVCC path
 template<class VField, class Matrix>
 void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) 
 {
  typedef decltype(basis[0]) Field; // NOTE(review): not referenced again in this function
  typedef decltype(basis[0].View()) View;
  // tmp_v is only the fill value for the view table; each slot is overwritten below.
  auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
  Vector<View> basis_v(basis.size(),tmp_v);
  // per-site object type, deduced from what indexing a view yields
  typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
  GridBase* grid = basis[0].Grid();
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
  }
 #ifndef GRID_NVCC
  // Path without GRID_NVCC: for every site, build all rotated rows in the
  // scratch vector B first, then write them back over basis[j0..j1-1].
  thread_region
  {
    std::vector < vobj > B(Nm); // Thread private
    thread_for_in_region(ss, grid->oSites(),{
 	for(int j=j0; j<j1; ++j) B[j]=0.;
 	for(int j=j0; j<j1; ++j){
 	  for(int k=k0; k<k1; ++k){
 	    B[j] +=Qt(j,k) * basis_v[k][ss];
 	  }
 	}
 	for(int j=j0; j<j1; ++j){
 	  basis_v[j][ss] = B[j];
 	}
      });
  }
 #else
  // GRID_NVCC path: work through the sites in blocks, using a scratch buffer
  // of siteBlock*nrot accumulators.
  int nrot = j1-j0;
  if (!nrot) // edge case not handled gracefully by Cuda
    return;
  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
  Vector <vobj> Bt(siteBlock * nrot); 
  auto Bp=&Bt[0];
  // GPU readable copy of matrix, flattened as Qt_p[j*Nm+k] = Qt(j,k)
  Vector<double> Qt_jv(Nm*Nm);
  double *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
      Qt_p[i]=Qt(j,k);
    });
  // Block the loop to keep storage footprint down
  for(uint64_t s=0;s<oSites;s+=siteBlock){
    // remaining work in this block
    int ssites=MIN(siteBlock,oSites-s);
    // zero out the accumulators
    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
 	decltype(coalescedRead(Bp[ss])) z;
 	z=Zero();
 	coalescedWrite(Bp[ss],z);
      });
    // accumulate every rotated row of this block; sj enumerates (site, row) pairs
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
 	int j =sj%nrot;   // row offset inside [j0,j1)
 	int jj  =j0+j;    // row of Qt / index of the output vector
 	int ss =sj/nrot;  // site inside the block
 	int sss=ss+s;     // site inside the lattice
 	for(int k=k0; k<k1; ++k){
 	  auto tmp = coalescedRead(Bp[ss*nrot+j]);
 	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
 	}
      });
    // write the accumulated rows back over basis[j0..j1-1]
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
 	int j =sj%nrot;
 	int jj  =j0+j;
 	int ss =sj/nrot;
 	int sss=ss+s;
 	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
      });
  }
 #endif
 }
 // Extract a single rotated vector
 template<class Field>
 void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
 {
  typedef decltype(basis[0].AcceleratorView()) View;
  typedef typename Field::vector_object vobj;
  GridBase* grid = basis[0].Grid();
  result.Checkerboard() = basis[0].Checkerboard();
  auto result_v=result.AcceleratorView(ViewWrite);
  Vector<View> basis_v(basis.size(),result_v);
  for(int k=0;k<basis.size();k++){
    basis_v[k] = basis[k].AcceleratorView(ViewRead);
  }
  vobj zz=Zero();
  Vector<double> Qt_jv(Nm);
  double * Qt_j = & Qt_jv[0];
  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
    auto B=coalescedRead(zz);
    for(int k=k0; k<k1; ++k){
      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
    }
    coalescedWrite(result_v[ss], B);
  });
 }
 template<class Field>
 void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
 {
  int vlen = idx.size();
  assert(vlen>=1);
  assert(vlen<=sort_vals.size());
  assert(vlen<=_v.size());
  for (size_t i=0;i<vlen;i++) {
    if (idx[i] != i) {
      //////////////////////////////////////
      // idx[i] is a table of desired sources giving a permutation.
      // Swap v[i] with v[idx[i]].
      // Find  j>i for which _vnew[j] = _vold[i],
      // track the move idx[j] => idx[i]
      // track the move idx[i] => i
      //////////////////////////////////////
      size_t j;
      for (j=i;j<idx.size();j++)
 	if (idx[j]==i)
 	  break;
      assert(idx[i] > i);     assert(j!=idx.size());      assert(idx[j]==i);
      swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
      std::swap(sort_vals[i],sort_vals[idx[i]]);
      idx[j] = idx[i];
      idx[i] = i;
    }
  }
 }
 // Return the indices 0..sort_vals.size()-1 ordered by increasing |sort_vals[i]|.
 // sort_vals itself is not modified.
 inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
 {
  std::vector<int> order(sort_vals.size());
  std::iota(order.begin(), order.end(), 0);

  // compare two positions by the magnitude of the values they refer to
  auto smallerMagnitude = [&sort_vals](int lhs, int rhs) {
    return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
  };
  std::sort(order.begin(), order.end(), smallerMagnitude);

  return order;
 }
 // Sort _v and sort_vals together by |sort_vals|: ascending order from
 // basisSortGetIndex, or that order reversed when `reverse` is set.
 template<class Field>
 void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
 {
  std::vector<int> order = basisSortGetIndex(sort_vals);
  if (reverse) {
    std::reverse(order.begin(), order.end());
  }
  basisReorderInPlace(_v,sort_vals,order);
 }
 // Accumulate into result the sum over i of
 //   ( innerProduct(_v[i],src_orig) / eval[i] ) * _v[i].
 // _v       : basis vectors (read only)
 // eval     : one divisor per basis vector; must have the same length as _v
 // src_orig : source field
 // result   : overwritten with the accumulated sum (zero when _v is empty)
 //
 // PAB: faster to compute the inner products first then fuse loops.
 // If performance critical can improve.
 template<class Field>
 void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
  result = Zero();
  assert(_v.size()==eval.size());
  int N = (int)_v.size();
  for (int i=0;i<N;i++) {
    // _v is a const vector, so its elements can only be bound by const reference
    const Field& tmp = _v[i];
    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
  }
 }
 NAMESPACE_END(Grid);
@@ -156,7 +156,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj,class sobj>
-accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
+inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
  GridBase *grid = l.Grid();
@@ -185,7 +185,7 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
 };
 template<class vobj,class sobj>
-accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
+inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
  GridBase *grid=l.Grid();
@@ -5,6 +5,7 @@
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <christoph@lhnr.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
@@ -93,7 +94,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
 // Double inner product
 template<class vobj>
-inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
@@ -102,8 +103,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
  GridBase *grid = left.Grid();
  // Might make all code paths go this way.
-  auto left_v = left.View();
+  auto left_v = left.AcceleratorView(ViewRead);
-  auto right_v=right.View();
+  auto right_v=right.AcceleratorView(ViewRead);
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
@@ -137,11 +138,18 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
  })
  nrm = TensorRemove(sum(inner_tmp_v,sites));
 #endif
  grid->GlobalSum(nrm);
  return nrm;
 }
 // Inner product of two lattices: takes the value returned by
 // rankInnerProduct(left,right) and applies GlobalSum on left's grid to it.
 // NOTE(review): the visible tail of rankInnerProduct also shows a GlobalSum
 // call -- confirm the reduction is not applied twice.
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
  GridBase * const reduce_grid = left.Grid();
  ComplexD ip = rankInnerProduct(left,right);
  reduce_grid->GlobalSum(ip);
  return ip;
 }
 /////////////////////////
 // Fast axpby_norm
 // z = a x + b y
@@ -167,9 +175,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  GridBase *grid = x.Grid();
-  auto x_v=x.View();
+  auto x_v=x.AcceleratorView(ViewRead);
-  auto y_v=y.View();
+  auto y_v=y.AcceleratorView(ViewRead);
-  auto z_v=z.View();
+  auto z_v=z.AcceleratorView(ViewWrite);
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
@@ -204,8 +212,64 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  grid->GlobalSum(nrm);
  return nrm; 
 }
 // Fused inner product and norm.
 //   ip  <- sum over sites of innerProduct(left,right)
 //   nrm <- real part of the sum over sites of innerProduct(left,left)
 // Both per-site quantities are produced in a single pass over the sites and
 // both results are reduced with one GlobalSumVector call at the end.
 // `left` and `right` must be conformable (checked on entry).
 template<class vobj> strong_inline void
 innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
  conformable(left,right);
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  // tmp[0] holds the inner product, tmp[1] the norm (kept complex, see below)
  Vector<ComplexD> tmp(2);
  GridBase *grid = left.Grid();
  auto left_v=left.AcceleratorView(ViewRead);
  auto right_v=right.AcceleratorView(ViewRead);
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
 #ifdef GRID_NVCC
  // GPU
  // Per-site results go into temporary arrays that are then reduced with sumD_gpu.
  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
  typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
  Vector<inner_t> inner_tmp(sites);
  Vector<norm_t> norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      // read the left site once and reuse it for both products
      auto left_tmp = left_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
      coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
  });
  tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
  tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
 #else
  // CPU
  // Per-site results are computed with innerProductD and reduced with sum().
  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
  typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
  Vector<inner_t> inner_tmp(sites);
  Vector<norm_t> norm_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  auto norm_tmp_v = &norm_tmp[0];
  accelerator_for( ss, sites, nsimd,{
      // read the left site once and reuse it for both products
      auto left_tmp = left_v(ss);
      inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
      norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
  });
  // Already promoted to double
  tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
  tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
 #endif
  grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
  ip = tmp[0];
  nrm = real(tmp[1]);
 }
 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
@@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Trace
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 /*
 template<class vobj>
 inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
 {
@@ -48,6 +49,7 @@ inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
  });
  return ret;
 };
 */
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Trace Index level dependent operation
@@ -6,6 +6,7 @@
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Christoph Lehner <christoph@lhnr.de>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -63,6 +64,7 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
    }
  });
 }
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
  int cb = half.Checkerboard();
  auto half_v = half.View();
@@ -81,25 +83,130 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
    }
  });
 }
-  
+
-template<class vobj,class CComplex,int nbasis>
+////////////////////////////////////////////////////////////////////////////////////////////
 // Flexible Type Conversion for internal promotion to double as well as graceful
 // treatment of scalar-compatible types
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Scalar overload: assigns a std::complex<double> to a ComplexD.
 accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in)
 {
  out = in;
 }
 // Scalar overload: assigns a std::complex<float> to a ComplexF.
 accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in)
 {
  out = in;
 }
 // Scalar -> SIMD-vector overloads, compiled only when __CUDA_ARCH__ is defined.
 // Each one views the vector object as an array of scalars and stores `in`
 // into the single element selected by SIMTlane(); the other lanes of `out`
 // are not touched by the call.
 #ifdef __CUDA_ARCH__
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
  ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
 }
 accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
 }
 // vComplexD2 is addressed with twice the lane count of vComplexD
 accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
  ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
 }
 #endif
 // vComplexD2 -> vComplexF: the two internal vectors of `in` are combined
 // into the single vector of `out` by PrecisionChange::DtoS.
 accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
  out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
 }
 // vComplexF -> vComplexD2: PrecisionChange::StoD expands the single vector
 // of `in` into the two internal vectors of `out`.
 accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
 }
 template<typename T1,typename T2,int N>
  accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
 template<typename T1,typename T2,int N>
  accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
 // Unwrap overload: strips the iScalar layer from the source and converts its
 // content into `out`. Only enabled when T1 is not itself a Grid scalar.
 template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
 accelerator_inline void convertType(T1 & out, const iScalar<T2> & in)
 {
  convertType(out,in._internal);
 }
 // Wrap overload: converts `in` into the content held by the iScalar `out`.
 template<typename T1,typename T2>
 accelerator_inline void convertType(iScalar<T1> & out, const T2 & in)
 {
  convertType(out._internal,in);
 }
 // Matrix overload: converts every one of the N x N entries individually,
 // visiting them row by row.
 template<typename T1,typename T2,int N>
 accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in)
 {
  for (int row=0; row<N; ++row) {
    for (int col=0; col<N; ++col) {
      convertType(out._internal[row][col],in._internal[row][col]);
    }
  }
 }
 // Vector overload: converts each of the N components individually.
 template<typename T1,typename T2,int N>
 accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in)
 {
  for (int comp=0; comp<N; ++comp) {
    convertType(out._internal[comp],in._internal[comp]);
  }
 }
 // Identity overload for Grid fundamental types: source and destination have
 // the same type, so the value is simply assigned.
 template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
 accelerator_inline void convertType(T & out, const T & in)
 {
  out = in;
 }
 // Lattice overload: converts `in` into `out` site by site, calling the
 // element-level convertType for each of the out_v.size() entries.
 // NOTE(review): no conformable()/grid check between `in` and `out` is made
 // here - presumably callers guarantee both lattices share a layout; confirm.
 template<typename T1,typename T2>
 accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
  auto out_v = out.AcceleratorView(ViewWrite);
  auto in_v  = in.AcceleratorView(ViewRead);
  accelerator_for(ss,out_v.size(),T1::Nsimd(),{
      convertType(out_v[ss],in_v(ss));
    });
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // precision-promoted local inner product
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Site-local inner product evaluated through innerProductD2.
 // Returns a lattice on lhs.Grid() whose site type is an iScalar wrapping the
 // tensor-reduced result type of innerProductD2; each site holds
 // innerProductD2(lhs(site),rhs(site)) stored via convertType.
 // The loop runs over rhs_v.size() sites.
 // NOTE(review): lhs and rhs are not checked for conformability here - confirm
 // that callers guarantee it.
 template<class vobj>
 inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
 -> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
 {
  auto lhs_v = lhs.AcceleratorView(ViewRead);
  auto rhs_v = rhs.AcceleratorView(ViewRead);
  // element type of the result, deduced from the per-site product
  typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
  Lattice<iScalar<t_inner>> ret(lhs.Grid());
  auto ret_v = ret.AcceleratorView(ViewWrite);
  accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
      convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
    });
  return ret;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // block routines
 ////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
-			  const             Lattice<vobj>   &fineData,
+			   const             Lattice<vobj>   &fineData,
-			  const std::vector<Lattice<vobj> > &Basis)
+			   const VLattice &Basis)
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
-  Lattice<CComplex> ip(coarse); 
+  Lattice<iScalar<CComplex>> ip(coarse);
  Lattice<vobj>     fineDataRed = fineData;
  //  auto fineData_   = fineData.View();
-  auto coarseData_ = coarseData.View();
+  auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
-  auto ip_         = ip.View();
+  auto ip_         = ip.AcceleratorView(ViewReadWrite);
  for(int v=0;v<nbasis;v++) {
-    blockInnerProduct(ip,Basis[v],fineData);
+    blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
-	coalescedWrite(coarseData_[sc](v),ip_(sc));
+	convertType(coarseData_[sc](v),ip_[sc]);
      });
    // improve numerical stability of projection
    // |fine> = |fine> - <basis|fine> |basis>
    ip=-ip;
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); 
  }
 }
@@ -166,11 +273,11 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
  return;
 }
-template<class vobj,class CComplex>
+template<class vobj,class vobj2,class CComplex>
-inline void blockZAXPY(Lattice<vobj> &fineZ,
+  inline void blockZAXPY(Lattice<vobj> &fineZ,
-		       const Lattice<CComplex> &coarseA,
+			 const Lattice<CComplex> &coarseA,
-		       const Lattice<vobj> &fineX,
+			 const Lattice<vobj2> &fineX,
-		       const Lattice<vobj> &fineY)
+			 const Lattice<vobj> &fineY)
 {
  GridBase * fine  = fineZ.Grid();
  GridBase * coarse= coarseA.Grid();
@@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
  conformable(fineX,fineZ);
  int _ndimension = coarse->_ndimension;
-  
+
  Coordinate  block_r      (_ndimension);
  // FIXME merge with subdivide checking routine as this is redundant
@@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
  }
-  auto fineZ_  = fineZ.View();
+  auto fineZ_  = fineZ.AcceleratorView(ViewWrite);
-  auto fineX_  = fineX.View();
+  auto fineX_  = fineX.AcceleratorView(ViewRead);
-  auto fineY_  = fineY.View();
+  auto fineY_  = fineY.AcceleratorView(ViewRead);
-  auto coarseA_= coarseA.View();
+  auto coarseA_= coarseA.AcceleratorView(ViewRead);
  accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
    int sc;
    Coordinate coor_c(_ndimension);
    Coordinate coor_f(_ndimension);
-    Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+      int sc;
-    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
+      Coordinate coor_c(_ndimension);
-    Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+      Coordinate coor_f(_ndimension);
-    // z = A x + y
+      Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
-    coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
+      for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
      Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
-  });
+      // z = A x + y
 #ifdef __CUDA_ARCH__
      typename vobj2::tensor_reduced::scalar_object cA;
      typename vobj::scalar_object cAx;
 #else
      typename vobj2::tensor_reduced cA;
      vobj cAx;
 #endif
      convertType(cA,TensorRemove(coarseA_(sc)));
      auto prod = cA*fineX_(sf);
      convertType(cAx,prod);
      coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
    });
  return;
 }
 // Block-wise inner product with internal precision promotion.
 //   CoarseInner : output, one value per coarse site
 //   fineX,fineY : fine-grid operands
 // Steps: (1) site-local products via localInnerProductD, (2) blockSum of
 // those onto the coarse grid, (3) conversion of the accumulated values into
 // the caller's coarse element type with convertType.
 template<class vobj,class CComplex>
  inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
 				 const Lattice<vobj> &fineX,
 				 const Lattice<vobj> &fineY)
 {
  typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
  GridBase *coarse(CoarseInner.Grid());
  GridBase *fine  (fineX.Grid());
  Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
  Lattice<dotp> coarse_inner(coarse);
  // Precision promotion
  fine_inner = localInnerProductD(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
  // Open the views only after blockSum has filled coarse_inner, so no view of
  // coarse_inner is held while another routine writes it. The kernel below
  // only reads coarse_inner, hence ViewRead.
  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewRead);
  accelerator_for(ss, coarse->oSites(), 1, {
      convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
    });
 }
 template<class vobj,class CComplex> // deprecate
 inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
 			      const Lattice<vobj> &fineX,
 			      const Lattice<vobj> &fineY)
@@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
  Lattice<dotp> coarse_inner(coarse);
  // Precision promotion?
-  auto CoarseInner_  = CoarseInner.View();
+  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
-  auto coarse_inner_ = coarse_inner.View();
+  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
  fine_inner = localInnerProduct(fineX,fineY);
  blockSum(coarse_inner,fine_inner);
@@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
    CoarseInner_[ss] = coarse_inner_[ss];
  });
 }
 template<class vobj,class CComplex>
 inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 {
@@ -248,7 +392,7 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
 // useful in multigrid project;
 // Generic name : Coarsen?
 template<class vobj>
-inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
+inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) 
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
@@ -256,42 +400,41 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  subdivides(coarse,fine); // require they map
  int _ndimension = coarse->_ndimension;
-  
+
  Coordinate  block_r      (_ndimension);
-  
+
  for(int d=0 ; d<_ndimension;d++){
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
  }
  int blockVol = fine->oSites()/coarse->oSites();
-  // Turn this around to loop threaded over sc and interior loop 
+  auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
-  // over sf would thread better
+  auto fineData_   = fineData.AcceleratorView(ViewRead);
  auto coarseData_ = coarseData.View();
  auto fineData_   = fineData.View();
  accelerator_for(sc,coarse->oSites(),1,{
-    // One thread per sub block
+      // One thread per sub block
-    Coordinate coor_c(_ndimension);
+      Coordinate coor_c(_ndimension);
-    Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
+      Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions);  // Block coordinate
-    coarseData_[sc]=Zero();
+      coarseData_[sc]=Zero();
-    for(int sb=0;sb<blockVol;sb++){
+      for(int sb=0;sb<blockVol;sb++){
      int sf;
      Coordinate coor_b(_ndimension);
      Coordinate coor_f(_ndimension);
      Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
      for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
      Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
-      coarseData_[sc]=coarseData_[sc]+fineData_[sf];
+	int sf;
-    }
+	Coordinate coor_b(_ndimension);
 	Coordinate coor_f(_ndimension);
 	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
 	Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
-  });
+	coarseData_[sc]=coarseData_[sc]+fineData_[sf];
      }
    });
  return;
 }
 template<class vobj>
 inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
 {
@@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
  }
 }
-template<class vobj,class CComplex>
+template<class CComplex,class VLattice>
-inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
+inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
 {
  GridBase *coarse = ip.Grid();
  GridBase *fine   = Basis[0].Grid();
@@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
  int       nbasis = Basis.size() ;
  // checks
-  subdivides(coarse,fine); 
+  subdivides(coarse,fine);
  for(int i=0;i<nbasis;i++){
    conformable(Basis[i].Grid(),fine);
  }
  for(int v=0;v<nbasis;v++) {
    for(int u=0;u<v;u++) {
-      //Inner product & remove component 
+      //Inner product & remove component
-      blockInnerProduct(ip,Basis[u],Basis[v]);
+      blockInnerProductD(ip,Basis[u],Basis[v]);
      ip = -ip;
-      blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]);
+      blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
    }
    blockNormalise(ip,Basis[v]);
  }
 }
 // Deprecated entry point (the name is inaccurate); retained for existing
 // callers and simply forwards to blockOrthonormalize.
 template<class vobj,class CComplex>
 inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
 {
  blockOrthonormalize(ip,Basis);
 }
 #if 0
 // TODO: CPU optimized version here
 template<class vobj,class CComplex,int nbasis>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
@@ -383,24 +533,18 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #else
-template<class vobj,class CComplex,int nbasis>
+template<class vobj,class CComplex,int nbasis,class VLattice>
 inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 			 Lattice<vobj>   &fineData,
-			 const std::vector<Lattice<vobj> > &Basis)
+			 const VLattice &Basis)
 {
  GridBase * fine  = fineData.Grid();
  GridBase * coarse= coarseData.Grid();
  fineData=Zero();
  for(int i=0;i<nbasis;i++) {
    Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
-    Lattice<CComplex> cip(coarse);
+    auto  ip_ =  ip.AcceleratorView(ViewRead);
-    auto cip_ = cip.View();
+    blockZAXPY(fineData,ip,Basis[i],fineData);
    auto  ip_ =  ip.View();
    accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
 	coalescedWrite(cip_[sc], ip_(sc)());
    });
    blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
  }
 }
 #endif
@@ -470,8 +614,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  Coordinate rdt = Tg->_rdimensions;
  Coordinate ist = Tg->_istride;
  Coordinate ost = Tg->_ostride;
-  auto t_v = To.View();
+  auto t_v = To.AcceleratorView(ViewWrite);
-  auto f_v = From.View();
+  auto f_v = From.AcceleratorView(ViewRead);
  accelerator_for(idx,Fg->lSites(),1,{
    sobj s;
    Coordinate Fcoor(nd);
@@ -341,7 +341,7 @@ class BinaryIO {
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
    assert(ieee64||ieee32|ieee64big||ieee32big);
    assert((ieee64+ieee32+ieee64big+ieee32big)==1);
    //////////////////////////////////////////////////////////////////////////////
@@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
  };
 };
 // Copies a double-stored gauge object from file layout (fobj) to in-memory
 // scalar layout (sobj), one colour-matrix entry at a time for each of the
 // Nds link slots.
 template<class fobj,class sobj>
 struct GaugeDoubleStoredMunger{
  void operator()(fobj &in, sobj &out)
  {
    for (int link = 0; link < Nds; ++link) {
      for (int row = 0; row < Nc; ++row) {
        for (int col = 0; col < Nc; ++col) {
          out(link)()(row, col) = in(link)()(row, col);
        }
      }
    }
  }
 };
 // Inverse direction of GaugeDoubleStoredMunger: copies a double-stored gauge
 // object from in-memory scalar layout (sobj) to file layout (fobj), entry by
 // entry for each of the Nds link slots.
 template <class fobj, class sobj>
 struct GaugeDoubleStoredUnmunger {
  void operator()(sobj &in, fobj &out)
  {
    for (int link = 0; link < Nds; ++link) {
      for (int row = 0; row < Nc; ++row) {
        for (int col = 0; col < Nc; ++col) {
          out(link)()(row, col) = in(link)()(row, col);
        }
      }
    }
  }
 };
 template<class fobj,class sobj>
 struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
@@ -146,7 +146,7 @@ public:
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
-    int ieee64    = (format == std::string("IEEE64"));
+    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;
@@ -0,0 +1,224 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/parallelIO/OpenQcdIO.h
 Copyright (C) 2015 - 2020
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 NAMESPACE_BEGIN(Grid);
 // Header record of an openQCD configuration file: four integer lattice
 // extents (Nt, Nx, Ny, Nz) followed by a double holding the plaquette.
 // The members are declared through GRID_SERIALIZABLE_CLASS_MEMBERS.
 // NOTE(review): OpenQcdIO::readHeader fills this struct with a raw byte read
 // of sizeof(OpenQcdHeader), so the member order and the struct size must
 // match the on-disk header - confirm the macro adds no extra data members.
 struct OpenQcdHeader : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
                                  int,    Nt,
                                  int,    Nx,
                                  int,    Ny,
                                  int,    Nz,
                                  double, plaq);
 };
 // Reader for gauge configurations stored in the openQCD file layout.
 //  - readHeader         : reads and sanity-checks the file header
 //  - readConfiguration  : reads the links, reorders them into Grid's layout
 //                         and verifies the plaquette against the header
 //  - writeConfiguration : not implemented, terminates the program
 class OpenQcdIO : public BinaryIO {
 public:
  static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
  // Reads the OpenQcdHeader at the start of `file`, stores dimensions,
  // plaquette (divided by normalisationFactor) and data offset in `field`,
  // and asserts that the file dimensions match `grid`.
  // Returns the byte offset at which the link data starts.
  static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
    OpenQcdHeader header;
    {
      std::ifstream fin(file, std::ios::in | std::ios::binary);
      fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
      assert(!fin.fail());
      field.data_start = fin.tellg();
      fin.close();
    }
    header.plaq /= normalisationFactor;
    // sanity check (should trigger on endian issues)
    assert(0 < header.Nt && header.Nt <= 1024);
    assert(0 < header.Nx && header.Nx <= 1024);
    assert(0 < header.Ny && header.Ny <= 1024);
    assert(0 < header.Nz && header.Nz <= 1024);
    field.dimension[0] = header.Nx;
    field.dimension[1] = header.Ny;
    field.dimension[2] = header.Nz;
    field.dimension[3] = header.Nt;
    std::cout << GridLogDebug << "header: " << header << std::endl;
    std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
    std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
    assert(grid->_ndimension == Nd);
    for(int d = 0; d < Nd; d++)
      assert(grid->_fdimensions[d] == field.dimension[d]);
    field.plaquette = header.plaq;
    return field.data_start;
  }
  // Reads the configuration in `file` into Umu and fills `header` with the
  // file metadata. Asserts that the plaquette measured on the loaded field
  // agrees with the header value within a precision-dependent tolerance.
  template<class vsimd>
  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
                                       FieldMetaData&                        header,
                                       std::string                           file) {
    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
    assert(Ns == 4 and Nd == 4 and Nc == 3);
    auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
    assert(grid != nullptr); assert(grid->_ndimension == Nd);
    uint64_t offset = readHeader(file, Umu.Grid(), header);
    FieldMetaData clone(header);
    std::string format("IEEE64"); // they always store little endian double precision
    uint32_t    nersc_csum, scidac_csuma, scidac_csumb;
    // Both helper grids are heap-allocated here and released below once the
    // munge loop, their last user, has finished.
    GridCartesian*         grid_openqcd = createOpenQcdGrid(grid);
    GridRedBlackCartesian* grid_rb      = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
    typedef DoubleStoredColourMatrixD                                              fobj;
    typedef typename DoubleStoredGaugeField::vector_object::scalar_object          sobj;
    typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
    word w = 0;
    std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
    std::vector<sobj> scalardata(grid->lSites());
    IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
             nersc_csum, scidac_csuma, scidac_csumb);
    GridStopWatch timer;
    timer.Start();
    DoubleStoredGaugeField Umu_ds(grid);
    // Instantiate the munger with the element types of iodata / scalardata so
    // it matches the buffers for whichever precision vsimd selects.
    auto munge = GaugeDoubleStoredMunger<fobj, sobj>();
    Coordinate ldim = grid->LocalDimensions();
    thread_for(idx_g, grid->lSites(), {
        Coordinate coor;
        grid->LocalIndexToLocalCoor(idx_g, coor);
        // only odd sites carry data in the file buffer; even sites are skipped
        bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
        if(!isOdd) continue;
        // file order is T X Y Z (slowest to fastest); halved for the odd-only storage
        int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
                  +  coor[Xdir] * ldim[Ydir] * ldim[Zdir]
                  +  coor[Ydir] * ldim[Zdir]
                  +  coor[Zdir])/2;
        munge(iodata[idx_o], scalardata[idx_g]);
    });
    grid->Barrier(); timer.Stop();
    // The helper grids are not referenced past this point; free them so that
    // repeated reads do not accumulate grids.
    delete grid_rb;
    delete grid_openqcd;
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
    timer.Reset(); timer.Start();
    vectorizeFromLexOrdArray(scalardata, Umu_ds);
    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
    timer.Reset(); timer.Start();
    undoDoubleStore(Umu, Umu_ds);
    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
    GaugeStatistics(Umu, clone);
    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
    // clang-format off
    std::cout << GridLogMessage << "OpenQcd Configuration " << file
              << " plaquette " << clone.plaquette
              << " header " << header.plaquette
              << " difference " << plaq_diff
              << std::endl;
    // clang-format on
    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
    if(plaq_diff >= tol)
      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
    assert(plaq_diff < tol);
    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
  }
  // Writing is not supported: logs an error and exits with EXIT_FAILURE.
  template<class vsimd>
  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
                                        std::string                           file) {
    std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
    exit(EXIT_FAILURE);
  }
 private:
  // Builds a GridCartesian describing the file layout (Z extent halved, X and
  // Z swapped) so that IOobject can be reused for the read.
  // The returned grid is allocated by makeFourDimGrid; the caller owns it.
  static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
    // exploit GridCartesian to be able to still use IOobject
    Coordinate gdim  = grid->GlobalDimensions();
    Coordinate ldim  = grid->LocalDimensions();
    Coordinate pcoor = grid->ThisProcessorCoor();
    // openqcd does rb on the z direction
    gdim[Zdir] /= 2;
    ldim[Zdir] /= 2;
    // and has the order T X Y Z (from slowest to fastest)
    std::swap(gdim[Xdir], gdim[Zdir]);
    std::swap(ldim[Xdir], ldim[Zdir]);
    std::swap(pcoor[Xdir], pcoor[Zdir]);
    GridCartesian* ret   = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
    // overwrite the derived local layout with the swapped one computed above
    ret->_ldimensions    = ldim;
    ret->_processor_coor = pcoor;
    return ret;
  }
  // Converts the double-stored field (two link slots per direction) into the
  // single-stored Umu: for each direction the first slot is added to the
  // Cshift by +1 of the second slot.
  template<class vsimd>
  static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
                                     Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
    conformable(Umu.Grid(), Umu_ds.Grid());
    Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
    // they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
    for(int mu_g = 0; mu_g < Nd; ++mu_g) {
      // file direction index: rotated by one relative to Grid's direction index
      int mu_o = (mu_g + 1) % Nd;
      U        = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o)
               + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1);
      PokeIndex<LorentzIndex>(Umu, U, mu_g);
    }
  }
 };
 NAMESPACE_END(Grid);
@@ -0,0 +1,281 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
 Copyright (C) 2015 - 2020
 Author: Daniel Richtmann <daniel.richtmann@ur.de>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #pragma once
 #include <ios>
 #include <iostream>
 #include <limits>
 #include <iomanip>
 #include <mpi.h>
 #include <ostream>
 #include <string>
 #define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;}
 #define CHECK_VAR(a)   { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; }
 // #undef CHECK
 // #define CHECK
 NAMESPACE_BEGIN(Grid);
 class ParRdr {
 private:
  bool const swap;
  MPI_Status status;
  MPI_File   fp;
  int err;
  MPI_Datatype oddSiteType;
  MPI_Datatype fileViewType;
  GridBase* grid;
 public:
  ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
    : swap(false)
    , grid(gridPtr) {
    err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
    assert(err == MPI_SUCCESS);
  }
  virtual ~ParRdr() { MPI_File_close(&fp); }
  inline void errInfo(int const err, std::string const& func) {
    static char estring[MPI_MAX_ERROR_STRING];
    int         eclass = -1, len = 0;
    MPI_Error_class(err, &eclass);
    MPI_Error_string(err, estring, &len);
    std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
  }
  int readHeader(FieldMetaData& field) {
    assert((grid->_ndimension == Nd) && (Nd == 4));
    assert(Nc == 3);
    OpenQcdHeader header;
    readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
    header.plaq /= 3.; // TODO change this into normalizationfactor
    // sanity check (should trigger on endian issues) TODO remove?
    assert(0 < header.Nt && header.Nt <= 1024);
    assert(0 < header.Nx && header.Nx <= 1024);
    assert(0 < header.Ny && header.Ny <= 1024);
    assert(0 < header.Nz && header.Nz <= 1024);
    field.dimension[0] = header.Nx;
    field.dimension[1] = header.Ny;
    field.dimension[2] = header.Nz;
    field.dimension[3] = header.Nt;
    for(int d = 0; d < Nd; d++)
      assert(grid->FullDimensions()[d] == field.dimension[d]);
    field.plaquette = header.plaq;
    field.data_start = sizeof(OpenQcdHeader);
    return field.data_start;
  }
  void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
    err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
    errInfo(err, "MPI_File_read_at_all");
    // CHECK_VAR(err)
    int read = -1;
    MPI_Get_count(&status, datatype, &read);
    // CHECK_VAR(read)
    assert(nbytes == (uint64_t)read);
    assert(err == MPI_SUCCESS);
  }
  void createTypes() {
    constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
    err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
    err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
    Coordinate const L = grid->GlobalDimensions();
    Coordinate const l = grid->LocalDimensions();
    Coordinate const i = grid->ThisProcessorCoor();
    Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
    Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
    Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
    err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
    err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
  }
  void freeTypes() {
    err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
    err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
  }
  bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
    auto hdr_offset = readHeader(meta);
    CHECK
    createTypes();
    err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
    CHECK
    int const domainSites = grid->lSites();
    domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
    // the actual READ
    constexpr uint64_t cm_size   = 2 * Nc * Nc * sizeof(double);    // 2_complex
    constexpr uint64_t os_size   = Nd * 2 * cm_size;                // 2_fwdbwd
    constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int adressable elems: floor is fine
    uint64_t const     n_os      = domainSites / 2;
    for(uint64_t os_idx = 0; os_idx < n_os;) {
      uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
      uint64_t const cm      = os_idx * Nd * 2;
      readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
      os_idx += read_os;
    }
    CHECK
    err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
  errInfo(err, "MPI_File_set_view1");
    assert(err == MPI_SUCCESS);
    freeTypes();
    std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
    return true;
  }
 };
 class OpenQcdIOChromaReference : public BinaryIO {
 public:
  template<class vsimd>
  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
                                       Grid::FieldMetaData&                  header,
                                       std::string                           file) {
    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
    assert(Ns == 4 and Nd == 4 and Nc == 3);
    auto grid = Umu.Grid();
    typedef ColourMatrixD fobj;
    std::vector<fobj> iodata(
      Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
    {
      ParRdr rdr(MPI_COMM_WORLD, file, grid);
      rdr.readGauge(iodata, header);
    } // equivalent to using binaryio
    std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
    copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
    DoubledGaugeField Umu_ds(grid);
    vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
    redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
    FieldMetaData clone(header);
    GaugeStatistics(Umu, clone);
    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
    // clang-format off
    std::cout << GridLogMessage << "OpenQcd Configuration " << file
              << " plaquette " << clone.plaquette
              << " header " << header.plaquette
              << " difference " << plaq_diff
              << std::endl;
    // clang-format on
    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
    if(plaq_diff >= tol)
      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
    assert(plaq_diff < tol);
    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
  }
 private:
  template<class vsimd>
  static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
                                  Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
    Grid::conformable(Umu.Grid(), Umu_ds.Grid());
    Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
    U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
    U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
    U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
    U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
  }
  static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
                                         std::vector<ColourMatrixD> const&      node_buff,
                                         GridBase*                              grid) {
    assert(node_buff.size() == Nd * grid->lSites());
    Coordinate const& l = grid->LocalDimensions();
    Coordinate coord(Nd);
    int&       x = coord[0];
    int&       y = coord[1];
    int&       z = coord[2];
    int&       t = coord[3];
    int buff_idx = 0;
    for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
      for(x = 0; x < l[0]; ++x)
        for(y = 0; y < l[1]; ++y)
          for(z = 0; z < l[2]; ++z) {
            if((t + z + y + x) % 2 == 0) continue;
            int local_idx;
            Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
            for(int mu = 0; mu < 2 * Nd; ++mu)
              for(int c1 = 0; c1 < Nc; ++c1) {
                for(int c2 = 0; c2 < Nc; ++c2) {
                  u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
                }
              }
            buff_idx += 2 * Nd;
          }
    assert(node_buff.size() == buff_idx);
  }
 };
 NAMESPACE_END(Grid);
@@ -95,7 +95,8 @@ inline uint64_t cyclecount(void){
 }
 #elif defined __x86_64__
 inline uint64_t cyclecount(void){ 
-  return __rdtsc();
+  uint64_t ret = __rdtsc();
  return (uint64_t)ret;
 }
 #else
@@ -133,23 +133,23 @@ typedef iSpinColourMatrix<vComplex >    vSpinColourMatrix;
 typedef iSpinColourMatrix<vComplexF>    vSpinColourMatrixF;
 typedef iSpinColourMatrix<vComplexD>    vSpinColourMatrixD;
-    // SpinColourSpinColour matrix
+// SpinColourSpinColour matrix
-    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
-    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
-    // SpinColourSpinColour matrix
+// SpinColourSpinColour matrix
-    typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<Complex  >    SpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<ComplexF >    SpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
+typedef iSpinColourSpinColourMatrix<ComplexD >    SpinColourSpinColourMatrixD;
-    typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
+typedef iSpinColourSpinColourMatrix<vComplex >    vSpinColourSpinColourMatrix;
-    typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
+typedef iSpinColourSpinColourMatrix<vComplexF>    vSpinColourSpinColourMatrixF;
-    typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
+typedef iSpinColourSpinColourMatrix<vComplexD>    vSpinColourSpinColourMatrixD;
 // LorentzColour
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
@@ -443,16 +443,16 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 //////////////////////////////////////////////
 // Fermion <-> propagator assignements
 //////////////////////////////////////////////
-    //template <class Prop, class Ferm>
+//template <class Prop, class Ferm>
-    template <class Fimpl>
+template <class Fimpl>
-      void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
+void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
      auto fj  = peekSpin(f, j);
-            for(int i = 0; i < Fimpl::Dimension; ++i)
+      for(int i = 0; i < Fimpl::Dimension; ++i)
 	{
 	  pokeColour(pjs, peekColour(fj, i), i, c);
 	}
@@ -460,16 +460,16 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
    }
 }
-    //template <class Prop, class Ferm>
+//template <class Prop, class Ferm>
-    template <class Fimpl>
+template <class Fimpl>
-      void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
+void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
  for(int j = 0; j < Ns; ++j)
    {
      auto pjs = peekSpin(p, j, s);
      auto fj  = peekSpin(f, j);
-            for(int i = 0; i < Fimpl::Dimension; ++i)
+      for(int i = 0; i < Fimpl::Dimension; ++i)
 	{
 	  pokeColour(fj, peekColour(pjs, i, c), i);
 	}
@@ -141,7 +141,33 @@ public:
  Vector<iSinglet<Simd> >  MatpInvDag;
  Vector<iSinglet<Simd> >  MatmInvDag;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
  // Virtual can't template
  void ContractConservedCurrent(PropagatorField &q_in_1,
 				PropagatorField &q_in_2,
 				PropagatorField &q_out,
 				PropagatorField &phys_src,
 				Current curr_type, 
 				unsigned int mu);
   void SeqConservedCurrent(PropagatorField &q_in,
 			   PropagatorField &q_out,
 			   PropagatorField &phys_src,
 			   Current curr_type,
 			   unsigned int mu,
 			   unsigned int tmin,
 			   unsigned int tmax,
 			   ComplexField &lattice_cmplx);
  void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
  void ContractJ5q(FermionField &q_in,ComplexField &J5q);
  ///////////////////////////////////////////////////////////////
  // Constructors
  ///////////////////////////////////////////////////////////////
  CayleyFermion5D(GaugeField &_Umu,
 		  GridCartesian         &FiveDimGrid,
 		  GridRedBlackCartesian &FiveDimRedBlackGrid,
@@ -148,15 +148,19 @@ public:
  virtual void ContractConservedCurrent(PropagatorField &q_in_1,
 					PropagatorField &q_in_2,
 					PropagatorField &q_out,
 					PropagatorField &phys_src,
 					Current curr_type,
-					unsigned int mu)=0;
+					unsigned int mu)
  {assert(0);};
  virtual void SeqConservedCurrent(PropagatorField &q_in, 
 				   PropagatorField &q_out,
 				   PropagatorField &phys_src,
 				   Current curr_type,
 				   unsigned int mu,
 				   unsigned int tmin, 
 				   unsigned int tmax,
-				   ComplexField &lattice_cmplx)=0;
+				   ComplexField &lattice_cmplx)
  {assert(0);};
      // Only reimplemented in Wilson5D 
      // Default to just a zero correlation function
@@ -38,6 +38,7 @@ public:
 static const bool isFundamental = Representation::isFundamental;
 static const int Nhcs = Options::Nhcs;
 static const bool LsVectorised=false;
 static const bool isGparity=true;
 typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
 INHERIT_GIMPL_TYPES(Gimpl);
@@ -46,7 +47,7 @@ public:
 typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
 template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Dimension>, Ns>,   Ngp>;
- template <typename vtype> using iImplPropagator        = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
+ template <typename vtype> using iImplPropagator        = iMatrix<iMatrix<iMatrix<vtype, Dimension>, Ns>,   Ngp>;
 template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Dimension>, Nhs>,  Ngp>;
 template <typename vtype> using iImplHalfCommSpinor    = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
 template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
@@ -80,6 +81,7 @@ public:
  {
    assert(0);
  } 
  template<class _Spinor>
  static accelerator_inline void multLink(_Spinor &phi, 
 					  const SiteDoubledGaugeField &U,
@@ -191,6 +193,16 @@ public:
 #endif   
  }
  // Field-level link multiplication is not provided by this implementation
  // class: the body consists solely of assert(0), so any call trips the
  // assertion and `out` is never written.
  template<class _SpinorField>
  inline void multLinkField(_SpinorField & out,
 			    const DoubledGaugeField &Umu,
 			    const _SpinorField & phi,
 			    int mu)
  {
    assert(0);
  }
  template <class ref>
  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
  {
@@ -185,10 +185,12 @@ public:
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
                                PropagatorField &src,
                                Current curr_type,
                                unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
                           PropagatorField &srct,
                           Current curr_type,
                           unsigned int mu, 
                           unsigned int tmin,
@@ -217,15 +217,17 @@ public:
  void ContractConservedCurrent(PropagatorField &q_in_1,
 				PropagatorField &q_in_2,
 				PropagatorField &q_out,
 				PropagatorField &src,
 				Current curr_type,
 				unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
 			   PropagatorField &q_out,
 			   PropagatorField &src,
 			   Current curr_type,
 			   unsigned int mu, 
 			   unsigned int tmin,
-                             unsigned int tmax,
+			   unsigned int tmax,
-                 	     ComplexField &lattice_cmplx);
+			   ComplexField &lattice_cmplx);
 };
 NAMESPACE_END(Grid);
@@ -179,15 +179,17 @@ public:
  void ContractConservedCurrent(PropagatorField &q_in_1,
                                PropagatorField &q_in_2,
                                PropagatorField &q_out,
                                PropagatorField &phys_src,
                                Current curr_type,
                                unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
                           PropagatorField &q_out,
                           PropagatorField &phys_src,
                           Current curr_type,
                           unsigned int mu, 
                           unsigned int tmin,
-                             unsigned int tmax,
+			   unsigned int tmax,
-			     ComplexField &lattice_cmplx);
+			   ComplexField &lattice_cmplx);
 };
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@@ -217,25 +217,7 @@ public:
  // Comms buffer
  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
  ///////////////////////////////////////////////////////////////
  // Conserved current utilities
  ///////////////////////////////////////////////////////////////
  void ContractConservedCurrent(PropagatorField &q_in_1,
 				PropagatorField &q_in_2,
 				PropagatorField &q_out,
 				Current curr_type, 
 				unsigned int mu);
  void SeqConservedCurrent(PropagatorField &q_in,
 			   PropagatorField &q_out,
 			   Current curr_type,
 			   unsigned int mu,
 			   unsigned int tmin,
 			   unsigned int tmax,
 			   ComplexField &lattice_cmplx);
  void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
  void ContractJ5q(FermionField &q_in,ComplexField &J5q);
 };
@@ -41,6 +41,7 @@ public:
  static const int Dimension = Representation::Dimension;
  static const bool isFundamental = Representation::isFundamental;
  static const bool LsVectorised=false;
  static const bool isGparity=false;
  static const int Nhcs = Options::Nhcs;
  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@@ -98,8 +99,21 @@ public:
  {
    multLink(phi,U,chi,mu);
  }
-    
+
-      
+  template<class _SpinorField> 
  inline void multLinkField(_SpinorField & out,
 			    const DoubledGaugeField &Umu,
 			    const _SpinorField & phi,
 			    int mu)
  {
    // Applies multLink site by site over a whole field:
    // out[s] is produced by multLink from Umu[s] and phi[s] for direction mu.
    // Views of all three fields are taken once, before the loop.
    auto out_v= out.View();
    auto phi_v= phi.View();
    auto Umu_v= Umu.View();
    // the loop runs over the oSites() of out's grid; phi and Umu are indexed
    // with the same site index — presumably they share that grid, TODO confirm
    thread_for(sss,out.Grid()->oSites(),{
 	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
    });
  }
  template <class ref>
  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
  {
@@ -66,41 +66,6 @@ public:
  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
  //////////////////////////////////////////////////////////////////////////////
  // Utilities for inserting Wilson conserved current.
  //////////////////////////////////////////////////////////////////////////////
  static void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
                                       const SitePropagator &q_in_2,
                                       SitePropagator &q_out,
                                       DoubledGaugeFieldView &U,
                                       unsigned int sU,
                                       unsigned int mu,
                                       bool switch_sign = false);
  static void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
                                       const SitePropagator &q_in_2,
                                       SitePropagator &q_out,
                                       DoubledGaugeFieldView &U,
                                       unsigned int sU,
                                       unsigned int mu,
                                       bool switch_sign = false);
  static void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
                                  SitePropagator &q_out,
                                  DoubledGaugeFieldView &U,
                                  unsigned int sU,
                                  unsigned int mu,
                                  vPredicate t_mask,
                                  bool switch_sign = false);
  static void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
                                  SitePropagator &q_out,
                                  DoubledGaugeFieldView &U,
                                  unsigned int sU,
                                  unsigned int mu,
                                  vPredicate t_mask,
                                  bool switch_sign = false);
 private:
  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
@@ -588,6 +588,355 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
  //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
 }
 // Builds the 4d field J5q from the two middle s-slices of the 5d fermion
 // field q_in: slice Ls/2-1 is combined as (1+G5), slice Ls/2 as (1-G5), the
 // two are averaged with a factor 0.5, and J5q is the local inner product of
 // the result with itself.
 // q_in must live on the fermion grid and J5q on the gauge grid.
 template <class Impl>
 void CayleyFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
 {
  conformable(this->GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), this->FermionGrid());
  Gamma G5(Gamma::Algebra::Gamma5);
  // 4d field
  int Ls = this->Ls;
  FermionField p_plus (this->GaugeGrid());
  FermionField p_minus(this->GaugeGrid());
  FermionField p(this->GaugeGrid());
  // the two slices adjacent to the middle of the fifth dimension
  ExtractSlice(p_plus , q_in, Ls/2-1 , 0);
  ExtractSlice(p_minus, q_in, Ls/2   , 0);
  p_plus = p_plus + G5*p_plus;
  p_minus= p_minus - G5*p_minus;
  p=0.5*(p_plus+p_minus);
  J5q = localInnerProduct(p,p);
 }
 // Propagator version of ContractJ5q: builds the 4d field J5q from the two
 // middle s-slices of the 5d propagator q_in. Slice Ls/2-1 is combined as
 // (1+G5), slice Ls/2 as (1-G5), the two are averaged with a factor 0.5, and
 // J5q is the local inner product of the result with itself.
 // q_in must live on the fermion grid and J5q on the gauge grid.
 template <class Impl>
 void CayleyFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
 {
  conformable(this->GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), this->FermionGrid());
  Gamma G5(Gamma::Algebra::Gamma5);
  // 4d field
  int Ls = this->Ls;
  PropagatorField p_plus (this->GaugeGrid());
  PropagatorField p_minus(this->GaugeGrid());
  PropagatorField p(this->GaugeGrid());
  // the two slices adjacent to the middle of the fifth dimension
  ExtractSlice(p_plus , q_in, Ls/2-1 , 0);
  ExtractSlice(p_minus, q_in, Ls/2   , 0);
  p_plus = p_plus + G5*p_plus;
  p_minus= p_minus - G5*p_minus;
  p=0.5*(p_plus+p_minus);
  J5q = localInnerProduct(p,p);
 }
 #define Pp(Q) (0.5*(Q+g5*Q))
 #define Pm(Q) (0.5*(Q-g5*Q))
 #define Q_4d(Q) (Pm((Q)[0]) + Pp((Q)[Ls-1]))
 #define TopRowWithSource(Q) (phys_src + (1.0-mass)*Q_4d(Q))
 // Contracts the two 5d propagators q_in_1 and q_in_2 into the 4d field q_out
 // for direction mu, summing a contribution from every s-slice.
 //   q_in_1, q_in_2 : 5d inputs; their s-slices are extracted onto the gauge grid
 //   q_out          : 4d result, zeroed and then accumulated over s
 //   phys_src       : enters through the TopRowWithSource macro
 //   curr_type      : Current::Axial flips the sign of the s < Ls/2 contributions
 //   mu             : index into the local Gmu table (five entries, no range check here)
 // The whole body is compiled only when GRID_NVCC is not defined; otherwise the
 // function does nothing and q_out is left untouched.
 // The Pp, Pm, Q_4d and TopRowWithSource macros expand against the local names
 // g5, Ls and phys_src, and against `mass`, which is not declared in this
 // function — presumably a class member, TODO confirm.
 template <class Impl> 
 void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
 						      PropagatorField &q_in_2,
 						      PropagatorField &q_out,
 						      PropagatorField &phys_src,
 						      Current curr_type,
 						      unsigned int mu)
 {
 #ifndef GRID_NVCC
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
    Gamma::Algebra::Gamma5
  };
  auto UGrid= this->GaugeGrid();
  // NOTE(review): FGrid is not used below
  auto FGrid= this->FermionGrid();
  RealD sgn=1.0;
  if ( curr_type == Current::Axial ) sgn = -1.0;
  int Ls = this->Ls;
  // 4d slices of the two inputs, one per s
  std::vector<PropagatorField> L_Q(Ls,UGrid); 
  std::vector<PropagatorField> R_Q(Ls,UGrid); 
  for(int s=0;s<Ls;s++){
    ExtractSlice(L_Q[s], q_in_1, s , 0);
    ExtractSlice(R_Q[s], q_in_2, s , 0);
  }
  Gamma g5(Gamma::Algebra::Gamma5);
  PropagatorField C(UGrid); 
  PropagatorField p5d(UGrid); 
  PropagatorField us_p5d(UGrid); 
  PropagatorField gp5d(UGrid); 
  PropagatorField gus_p5d(UGrid); 
  PropagatorField L_TmLsGq0(UGrid); 
  PropagatorField L_TmLsTmp(UGrid);
  PropagatorField R_TmLsGq0(UGrid); 
  PropagatorField R_TmLsTmp(UGrid);
  {
    // boundary combinations that involve the source term, built once for the
    // left (L_) and once for the right (R_) input
    PropagatorField TermA(UGrid);
    PropagatorField TermB(UGrid);
    PropagatorField TermC(UGrid);
    PropagatorField TermD(UGrid);
    TermA = (Pp(Q_4d(L_Q)));
    TermB = (Pm(Q_4d(L_Q)));
    TermC = (Pm(TopRowWithSource(L_Q)));
    TermD = (Pp(TopRowWithSource(L_Q)));
    L_TmLsGq0 = (TermD - TermA + TermB);
    L_TmLsTmp = (TermC - TermB + TermA);
    TermA = (Pp(Q_4d(R_Q)));
    TermB = (Pm(Q_4d(R_Q)));
    TermC = (Pm(TopRowWithSource(R_Q)));
    TermD = (Pp(TopRowWithSource(R_Q)));
    R_TmLsGq0 = (TermD - TermA + TermB);
    R_TmLsTmp = (TermC - TermB + TermA);
  }
  // per-slice combination of slice s (Pm part) with slice s-1, cyclic in s (Pp part)
  std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
  std::vector<PropagatorField> L_TmLsGq(Ls,UGrid);
  for(int s=0;s<Ls;s++){
    R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
    L_TmLsGq[s] = (Pm((L_Q)[(s)]) + Pp((L_Q)[((s)-1+Ls)%Ls]));
  }
  Gamma gmu=Gamma(Gmu[mu]);
  q_out = Zero();
  PropagatorField tmp(UGrid); 
  for(int s=0;s<Ls;s++){
    // sp: next slice (cyclic); sr: slice mirrored about the middle; srp: its successor
    int sp = (s+1)%Ls;
    int sr = Ls-1-s;
    int srp= (sr+1)%Ls;
    // Mobius parameters
    auto b=this->bs[s];
    auto c=this->cs[s];
    auto bpc = 1.0/(b+c);  // -0.5 factor in gauge links
    // first term: p5d from the left input, tmp from the right input;
    // s == 0 and s == Ls-1 use the source-dependent boundary fields
    if (s == 0) {
      p5d    =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp)   + c*Pm(L_TmLsTmp     ));
      tmp    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0     ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
    } else if (s == Ls-1) {
      p5d    =(b*Pm(L_TmLsGq0)     + c*Pp(L_TmLsGq0     ) + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
      tmp    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp   ));
    } else {
      p5d    =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr])+ b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
      tmp    =(b*Pm(R_TmLsGq[s])  + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
    }
    // shift tmp by one site along mu, then multiply by the gauge link
    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
    gp5d=g5*p5d*g5;
    gus_p5d=gmu*us_p5d;
    C = bpc*(adj(gp5d)*us_p5d);
    C-= bpc*(adj(gp5d)*gus_p5d);
    // second term: roles exchanged, p5d from the right input and tmp from the left
    if (s == 0) {
      p5d    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0  )    + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
      tmp    =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp)   + c*Pm(L_TmLsTmp  ));
    } else if (s == Ls-1) {
      p5d    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp  ));
      tmp    =(b*Pm(L_TmLsGq0)     + c*Pp(L_TmLsGq0  )    + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
    } else {
      p5d    =(b*Pm(R_TmLsGq[s])  + c*Pp(R_TmLsGq[s])  + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
      tmp    =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr]) + b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
    }
    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
    gp5d=gmu*p5d;
    gus_p5d=g5*us_p5d*g5;
    C-= bpc*(adj(gus_p5d)*gp5d);
    C-= bpc*(adj(gus_p5d)*p5d);
    // only the first half of the s range carries the axial sign
    if (s < Ls/2) q_out += sgn*C;
    else          q_out +=     C;
  }
 #endif
 }
 // Builds the 5d field q_out from the 5d propagator q_in, one s-slice at a
 // time: each slice is the sum of a forward-hop and a backward-hop term in
 // direction mu, masked to a window in the last (Nd-1) coordinate.
 //   q_in      : 5d input; its s-slices are extracted onto the gauge grid
 //   q_out     : 5d output; slice s is overwritten via InsertSlice
 //   phys_src  : enters through the TopRowWithSource macro
 //   curr_type : Current::Axial gives the slices s < Ls/2 a factor -1
 //   mu        : direction; asserted to be below Nd
 //   tmin,tmax : inclusive window applied with where(); the backward term
 //               uses the window shifted by tshift
 //   ph        : complex field multiplied into both terms
 // The active code is compiled only when GRID_NVCC is not defined; otherwise
 // only the two asserts and the tshift computation remain.
 // The Pp, Pm, Q_4d and TopRowWithSource macros expand against the local names
 // g5, Ls and phys_src, and against `mass`, which is not declared in this
 // function — presumably a class member, TODO confirm.
 template <class Impl>
 void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                                 PropagatorField &q_out,
                                                 PropagatorField &phys_src,
                                                 Current curr_type, 
                                                 unsigned int mu,
                                                 unsigned int tmin, 
                                                 unsigned int tmax,
 						ComplexField &ph)// Complex phase factor
 {
  // NOTE(review): mu is unsigned, so this first assert can never fail
  assert(mu>=0);
  assert(mu<Nd);
  // the backward-hop window is moved by one site when mu is the last direction
  int tshift = (mu == Nd-1) ? 1 : 0;
 #if 0
  // disabled code path, kept for reference only
  ////////////////////////////////////////////////
  // SHAMIR CASE 
  ////////////////////////////////////////////////
  int Ls = this->Ls;
  auto UGrid= this->GaugeGrid();
  auto FGrid= this->FermionGrid();
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
  };
  Gamma gmu=Gamma(Gmu[mu]);
  PropagatorField L_Q(UGrid); 
  PropagatorField R_Q(UGrid); 
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  LatticeInteger zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for (int s=0;s<Ls;s++) {
    RealD G_s = (curr_type == Current::Axial  ) ? ((s < Ls/2) ? -1 : 1) : 1;
    ExtractSlice(R_Q, q_in, s , 0);
    tmp    = Cshift(R_Q,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
    tmp    = G_s*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time
    tmp    = where((lcoor<=tmax),tmp,zz);
    L_Q = tmp;
    tmp    = R_Q*ph;
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd);// Adjoint link
    tmp    = -G_s*( Utmp + gmu*Utmp ); 
    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
    tmp    = where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
    L_Q= L_Q+tmp;
    InsertSlice(L_Q, q_out, s , 0);
  }
 #endif
 #ifndef GRID_NVCC
  ////////////////////////////////////////////////
  // GENERAL CAYLEY CASE
  ////////////////////////////////////////////////
  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT,
    Gamma::Algebra::Gamma5
  };
  Gamma gmu=Gamma(Gmu[mu]);
  Gamma g5(Gamma::Algebra::Gamma5);
  int Ls = this->Ls;
  auto UGrid= this->GaugeGrid();
  // NOTE(review): FGrid is not used below
  auto FGrid= this->FermionGrid();
  std::vector<PropagatorField> R_Q(Ls,UGrid); 
  PropagatorField L_Q(UGrid); 
  PropagatorField tmp(UGrid);
  PropagatorField Utmp(UGrid);
  // zz: zero field used as the "else" value of the masks;
  // lcoor: coordinate of each site in direction Nd-1
  LatticeInteger zz (UGrid);   zz=0.0;
  LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
  for(int s=0;s<Ls;s++){
    ExtractSlice(R_Q[s], q_in, s , 0);
  }
  PropagatorField R_TmLsGq0(UGrid); 
  PropagatorField R_TmLsTmp(UGrid);
  {
    // boundary combinations that involve the source term
    PropagatorField TermA(UGrid);
    PropagatorField TermB(UGrid);
    PropagatorField TermC(UGrid);
    PropagatorField TermD(UGrid);
    TermA = (Pp(Q_4d(R_Q)));
    TermB = (Pm(Q_4d(R_Q)));
    TermC = (Pm(TopRowWithSource(R_Q)));
    TermD = (Pp(TopRowWithSource(R_Q)));
    R_TmLsGq0 = (TermD - TermA + TermB);
    R_TmLsTmp = (TermC - TermB + TermA);
  }
  // per-slice combination of slice s (Pm part) with slice s-1, cyclic in s (Pp part)
  std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
  for(int s=0;s<Ls;s++){
    R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
  }
  // per-slice sign: -1 on the first half of the s range for the axial current
  std::vector<RealD> G_s(Ls,1.0);
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
    }
  }
  for(int s=0;s<Ls;s++){
    int sp = (s+1)%Ls;
    // NOTE(review): sr and srp are computed but not used in this loop
    int sr = Ls-1-s;
    int srp= (sr+1)%Ls;
    // Mobius parameters
    auto b=this->bs[s];
    auto c=this->cs[s];
    //    auto bpc = G_s[s]*1.0/(b+c);  // -0.5 factor in gauge links
    // s == 0 and s == Ls-1 use the source-dependent boundary fields
    if (s == 0) {
      tmp    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0     ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
    } else if (s == Ls-1) {
      tmp    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp   ));
    } else {
      tmp    =(b*Pm(R_TmLsGq[s])  + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
    }
    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated
    // NOTE(review): the three branches below rebuild the same expressions as
    // the chain above; tmp has to be recomputed because it was overwritten
    if (s == 0) {
      tmp    =(b*Pm(R_TmLsGq0)     + c*Pp(R_TmLsGq0  )    + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
    } else if (s == Ls-1) {
      tmp    =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp)   + c*Pm(R_TmLsTmp  ));
    } else {
      tmp    =(b*Pm(R_TmLsGq[s])   + c*Pp(R_TmLsGq[s])    + b*Pp(R_TmLsGq[sp])+ c*Pm(R_TmLsGq[sp]));
    }
    // backward hop: the phase is applied before the shift
    tmp    = tmp *ph;
    tmp    = Cshift(tmp,mu,-1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
    tmp = -G_s[s]*( Utmp + gmu*Utmp );
    tmp    = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time 
    L_Q   += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
    InsertSlice(L_Q, q_out, s , 0);
  }
 #endif
 }
 #undef Pp
 #undef Pm
 #undef Q_4d
 #undef TopRowWithSource
 #if 0
 template<class Impl>
 void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
@@ -611,6 +611,7 @@ template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 								PropagatorField &q_in_2,
 								PropagatorField &q_out,
 								PropagatorField &src,
 								Current curr_type,
 								unsigned int mu)
 {
@@ -620,11 +621,12 @@ void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField
 template <class Impl>
 void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 							   PropagatorField &q_out,
 							   PropagatorField &src,
 							   Current curr_type,
 							   unsigned int mu, 
 							   unsigned int tmin,
-                                              unsigned int tmax,
+							   unsigned int tmax,
-					      ComplexField &lattice_cmplx)
+							   ComplexField &lattice_cmplx)
 {
  assert(0);
@@ -600,6 +600,7 @@ template <class Impl>
 void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 							      PropagatorField &q_in_2,
 							      PropagatorField &q_out,
 							      PropagatorField &src,
 							      Current curr_type,
 							      unsigned int mu)
 {
@@ -609,6 +610,7 @@ void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                         PropagatorField &q_out,
                                                         PropagatorField &src,
                                                         Current curr_type,
                                                         unsigned int mu, 
                                                         unsigned int tmin,
@@ -132,14 +132,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
-  pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
+  pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
-  pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
+  pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
  pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
-  pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
+  pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
-  pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
+  pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
 }
 template <class Impl>
@@ -861,7 +861,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
 * Conserved current utilities for Wilson fermions, for contracting propagators
 * to make a conserved current sink or inserting the conserved current 
 * sequentially.
 ******************************************************************************/
 // Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
 #define REVERSE_LS(qSite, qSiteRev, Nsimd) \
@@ -877,220 +876,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
    merge(qSiteRev, qSiteVec); \
 }
-//          psi = chiralProjectPlus(Result_s[Ls/2-1]);
+ ******************************************************************************/
 //          psi+= chiralProjectMinus(Result_s[Ls/2]);
 //         PJ5q+=localInnerProduct(psi,psi);
 // Site-wise (1 + Gamma5) applied to a lattice field.
 // Returns a new lattice on the same grid as `in` holding in + Gamma5*in at
 // every outer site.  No factor of 1/2 is applied here.
 template<class vobj>
 Lattice<vobj> spProj5p(const Lattice<vobj> & in)
 {
  Gamma gamma5(Gamma::Algebra::Gamma5);
  GridBase *layout = in.Grid();
  Lattice<vobj> projected(layout);
  auto projected_v = projected.View();
  auto source_v    = in.View();
  thread_for(site, layout->oSites(), {
    projected_v[site] = source_v[site] + gamma5*source_v[site];
  });
  return projected;
 }
 // Site-wise (1 - Gamma5) applied to a lattice field.
 // Returns a new lattice on the same grid as `in` holding in - Gamma5*in at
 // every outer site.  No factor of 1/2 is applied here.
 template<class vobj>
 Lattice<vobj> spProj5m(const Lattice<vobj> & in)
 {
  GridBase *layout = in.Grid();
  Gamma gamma5(Gamma::Algebra::Gamma5);
  Lattice<vobj> projected(layout);
  auto projected_v = projected.View();
  auto source_v    = in.View();
  thread_for(site, layout->oSites(), {
    projected_v[site] = source_v[site] - gamma5*source_v[site];
  });
  return projected;
 }
 // Builds J5q from a 5d fermion field:
 //   p   = spProj5p( slice Ls/2 of q_in ) + spProj5m( slice Ls/2-1 of q_in )
 //   J5q = localInnerProduct(p,p)
 // q_in must be conformable with FermionGrid() and J5q with GaugeGrid().
 // NOTE(review): the plus projection is taken from slice Ls/2 and the minus
 // projection from slice Ls/2-1; the opposite pairing (plus at Ls/2-1, minus
 // at Ls/2) is also in use elsewhere -- confirm which convention is intended.
 template <class Impl>
 void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
 {
  conformable(GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), FermionGrid());
  // 4d work fields (the previously declared but never used 'psi' field has
  // been removed; it only cost an allocation).
  int Ls = this->Ls;
  FermionField p_plus (GaugeGrid());
  FermionField p_minus(GaugeGrid());
  FermionField p(GaugeGrid());
  // Pull the two middle slices out of the 5d field (last argument selects
  // the sliced dimension, presumably the fifth one -- TODO confirm).
  ExtractSlice(p_plus , q_in, Ls/2   , 0);
  ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
  p_plus = spProj5p(p_plus );
  p_minus= spProj5m(p_minus);
  p=p_plus+p_minus;
  J5q = localInnerProduct(p,p);
 }
 // Propagator overload: builds J5q from a 5d propagator field:
 //   p   = spProj5p( slice Ls/2 of q_in ) + spProj5m( slice Ls/2-1 of q_in )
 //   J5q = localInnerProduct(p,p)
 // q_in must be conformable with FermionGrid() and J5q with GaugeGrid().
 // NOTE(review): the plus projection is taken from slice Ls/2 and the minus
 // projection from slice Ls/2-1; the opposite pairing (plus at Ls/2-1, minus
 // at Ls/2) is also in use elsewhere -- confirm which convention is intended.
 template <class Impl>
 void WilsonFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
 {
  conformable(GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), FermionGrid());
  // 4d work fields (the previously declared but never used 'psi' field has
  // been removed; it only cost an allocation).
  int Ls = this->Ls;
  PropagatorField p_plus (GaugeGrid());
  PropagatorField p_minus(GaugeGrid());
  PropagatorField p(GaugeGrid());
  // Pull the two middle slices out of the 5d field (last argument selects
  // the sliced dimension, presumably the fifth one -- TODO confirm).
  ExtractSlice(p_plus , q_in, Ls/2   , 0);
  ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
  p_plus = spProj5p(p_plus );
  p_minus= spProj5m(p_minus);
  p=p_plus+p_minus;
  J5q = localInnerProduct(p,p);
 }
 // Contracts two 5d propagators into a 4d conserved-current field.
 //   q_in_1, q_in_2 : 5d propagators, both conformable with FermionGrid().
 //   q_out          : 4d result on _FourDimGrid; zeroed, then accumulated into.
 //   curr_type      : only Current::Axial is treated specially here (it sets
 //                    the sign flag for the first half of the s range).
 //   mu             : 4d direction; the 5d fields are shifted in mu + 1.
 template <class Impl>
 void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                     PropagatorField &q_in_2,
                                                     PropagatorField &q_out,
                                                     Current curr_type,
                                                     unsigned int mu)
 {
    conformable(q_in_1.Grid(), FermionGrid());
    conformable(q_in_1.Grid(), q_in_2.Grid());
    conformable(_FourDimGrid, q_out.Grid());
    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
    // Extent of dimension 0 of the 5d grid as stored in _rdimensions
    // (presumably the per-SIMD-lane fifth-dimension length -- TODO confirm).
    unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
    q_out = Zero();
    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
    tmp1 = Cshift(q_in_1, mu + 1, 1);
    tmp2 = Cshift(q_in_2, mu + 1, 1);
    auto q_in_1_v = q_in_1.View();
    auto q_in_2_v = q_in_2.View();
    auto tmp1_v   = tmp1.View();
    auto tmp2_v   = tmp2.View();
    auto q_out_v  = q_out.View();
    auto Umu_v    = Umu.View();
    thread_for(sU, Umu.Grid()->oSites(),{
        // sF1 walks the s index of 4d site sU upwards from its first entry,
        // sF2 walks it downwards from its last entry, so each iteration
        // pairs index s of q1 with index LLs - 1 - s of q2.
        unsigned int sF1 = sU * LLs;
        unsigned int sF2 = (sU + 1) * LLs - 1;
        for (unsigned int s = 0; s < LLs; ++s)
        {
            // True only for the axial current in the first half of the s
            // range; handed to the site kernels as their switch_sign flag.
            bool axial_sign = ((curr_type == Current::Axial) && \
                               (s < (LLs / 2)));
            SitePropagator qSite2, qmuSite2;
            // If vectorised in 5th dimension, reverse q2 vector to match up
            // sites correctly.
            if (Impl::LsVectorised)
            {
                REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
                REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
            }
            else
            {
                qSite2   = q_in_2_v[sF2];
                qmuSite2 = tmp2_v[sF2];
            }
            // Forward term: shifted q1 against unshifted q2.
            Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1], 
                                                     qSite2, 
                                                     q_out_v[sU],
                                                     Umu_v, sU, mu, axial_sign);
            // Backward term: unshifted q1 against shifted q2.
            Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
                                                     qmuSite2,
                                                     q_out_v[sU],
                                                     Umu_v, sU, mu, axial_sign);
            sF1++;
            sF2--;
        }
    });
 }
 // Sequential conserved-current insertion on a 5d propagator.
 //   q_in          : input 5d propagator, conformable with FermionGrid().
 //   q_out         : output 5d propagator on the same grid; zeroed, then
 //                   accumulated into slice by slice.
 //   curr_type     : Axial and Tadpole influence the sign flags below.
 //   mu            : direction of the current; when mu == Tp the backward
 //                   time window is shifted by one (tshift).
 //   tmin, tmax    : inclusive window on the Tp coordinate.
 //   lattice_cmplx : 4d complex field multiplied into each extracted slice.
 template <class Impl>
 void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                                PropagatorField &q_out,
                                                Current curr_type, 
                                                unsigned int mu,
                                                unsigned int tmin, 
                                                unsigned int tmax,
 						ComplexField &lattice_cmplx)
 {
    conformable(q_in.Grid(), FermionGrid());
    conformable(q_in.Grid(), q_out.Grid());
    PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
    unsigned int tshift = (mu == Tp) ? 1 : 0;
    // Extent of dimension 0 of the 5d grid as stored in _rdimensions
    // (presumably the local fifth-dimension length -- TODO confirm).
    unsigned int LLs = q_in.Grid()->_rdimensions[0];
    unsigned int LLt    = GridDefaultLatt()[Tp];
    q_out = Zero();
    // Lattice of Tp coordinates, used to build the per-site time masks.
    LatticeInteger coords(_FourDimGrid);
    LatticeCoordinate(coords, Tp);
    auto q_out_v = q_out.View();
    auto tmp2_v  = tmp2.View();
    auto coords_v= coords.View();
    auto Umu_v   = Umu.View();
    for (unsigned int s = 0; s < LLs; ++s)
    {
        // axial_sign: axial current, first half of the s range.
        // switch_sgn: additionally set for the tadpole current; it is used
        // for the forward kernel only, the backward kernel gets axial_sign.
        bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
 	bool tadpole_sign = (curr_type == Current::Tadpole);
 	bool switch_sgn = tadpole_sign || axial_sign;
        //forward direction: Need q(x + mu, s)*A(x)
        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s) 
        tmp = Cshift(tmp2, mu, 1);	 //q(x+mu,s)
        tmp2 = tmp*lattice_cmplx;	 //q(x+mu,s)*A(x)	
    	thread_for(sU, Umu.Grid()->oSites(),{
            // Compute the sequential conserved current insertion only if our simd
            // object contains a timeslice we need.
            vPredicate t_mask;
 	    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
            Integer timeSlices = Reduce(t_mask());
            if (timeSlices > 0)
            {
                // 5d site index of slice s above 4d site sU.
 		unsigned int sF = sU * LLs + s;
                Kernels::SeqConservedCurrentSiteFwd(tmp2_v[sU], 
 						    q_out_v[sF], Umu_v, sU,
 						    mu, t_mask, switch_sgn);
            }
        });
        //backward direction: Need q(x - mu, s)*A(x-mu)
        ExtractSlice(tmp2, q_in, s, 0);  //q(x,s)
        tmp = lattice_cmplx*tmp2;	 //q(x,s)*A(x)
        tmp2 = Cshift(tmp, mu, -1);	 //q(x-mu,s)*A(x-mu,s)
    	thread_for(sU, Umu.Grid()->oSites(),
    	{
 	  vPredicate t_mask;
 	  // Same window as the forward pass, moved by tshift.
 	  t_mask()= ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift)));
 	  //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
 	  unsigned int t0 = 0;
 	  if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
 	  Integer timeSlices = Reduce(t_mask());
 	  if (timeSlices > 0) {
 	    // 5d site index of slice s above 4d site sU.
 	    unsigned int sF = sU * LLs + s; 
 	    Kernels::SeqConservedCurrentSiteBwd(tmp2_v[sU], 
 						q_out_v[sF], Umu_v, sU,
 						mu, t_mask, axial_sign);
 	  }
 	});
    }
 }
 NAMESPACE_END(Grid);
@@ -459,6 +459,7 @@ template <class Impl>
 void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                   PropagatorField &q_in_2,
                                                   PropagatorField &q_out,
                                                   PropagatorField &src,
                                                   Current curr_type,
                                                   unsigned int mu)
 {
@@ -466,6 +467,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
 #if 0
  PropagatorField tmp1(_grid), tmp2(_grid);
  q_out = Zero();
@@ -489,12 +491,15 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
 					       q_out_v[sU],
 					       Umu_v, sU, mu);
  });
 #else
 #endif
 }
 template <class Impl>
 void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
                                              PropagatorField &q_out,
                                              PropagatorField &src,
                                              Current curr_type,
                                              unsigned int mu,
                                              unsigned int tmin, 
@@ -503,6 +508,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 {
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
 #if 0
  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
  Complex i(0.0,1.0);
@@ -556,6 +562,8 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 					  Umu_v, sU, mu, t_mask);
    }
  });
 #else
 #endif
 }
 NAMESPACE_END(Grid);
@@ -444,19 +444,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite); printf(".");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt); printf("-");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) { 
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt); printf("+");    return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
   }
   assert(0 && " Kernel optimisation case not covered ");
@@ -493,131 +493,5 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   assert(0 && " Kernel optimisation case not covered ");
  }
 /*******************************************************************************
 * Conserved current utilities for Wilson fermions, for contracting propagators
 * to make a conserved current sink or inserting the conserved current 
 * sequentially. Common to both 4D and 5D.
 ******************************************************************************/
 // N.B. Functions below assume a -1/2 factor within U.
 // Spin structure of the hopping terms:
 //   WilsonCurrentFwd(expr, mu) = expr - gmu[mu]*expr
 //   WilsonCurrentBwd(expr, mu) = expr + gmu[mu]*expr
 // expr is parenthesised so that a compound argument (e.g. a+b) expands with
 // the intended precedence.  Note that expr is evaluated twice.
 #define WilsonCurrentFwd(expr, mu) (((expr) - Gamma::gmu[mu]*(expr)))
 #define WilsonCurrentBwd(expr, mu) (((expr) + Gamma::gmu[mu]*(expr)))
 /*******************************************************************************
 * Name: ContractConservedCurrentSiteFwd
 * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 *        - Pass in q_in_1 shifted in +ve mu direction.
 ******************************************************************************/
 template<class Impl>
 void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
 							  const SitePropagator &q_in_2,
 							  SitePropagator &q_out,
 							  DoubledGaugeFieldView &U,
 							  unsigned int sU,
 							  unsigned int mu,
 							  bool switch_sign)
 {
  // Link in direction mu at site sU applied to q_in_1.
  SitePropagator linked;
  Impl::multLink(linked, U[sU], q_in_1, mu);

  // g5 * adj(q_in_2) * g5 times the forward spin structure of the link term.
  Gamma gamma5(Gamma::Algebra::Gamma5);
  SitePropagator contribution;
  contribution = gamma5 * adj(q_in_2) * gamma5 * WilsonCurrentFwd(linked, mu);

  // Forward term is added by default and subtracted when the sign is switched.
  if (!switch_sign) {
    q_out += contribution;
    return;
  }
  q_out -= contribution;
 }
 /*******************************************************************************
 * Name: ContractConservedCurrentSiteBwd
 * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 *        - Pass in q_in_2 shifted in +ve mu direction.
 ******************************************************************************/
 template<class Impl>
 void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
 							  const SitePropagator &q_in_2,
 							  SitePropagator &q_out,
 							  DoubledGaugeFieldView &U,
 							  unsigned int sU,
 							  unsigned int mu,
 							  bool switch_sign)
 {
  // Link with index mu + Nd at site sU applied to q_in_1.
  SitePropagator linked;
  Impl::multLink(linked, U[sU], q_in_1, mu + Nd);

  // g5 * adj(q_in_2) * g5 times the backward spin structure of the link term.
  Gamma gamma5(Gamma::Algebra::Gamma5);
  SitePropagator contribution;
  contribution = gamma5 * adj(q_in_2) * gamma5 * WilsonCurrentBwd(linked, mu);

  // Backward term is subtracted by default and added when the sign is switched.
  if (!switch_sign) {
    q_out -= contribution;
    return;
  }
  q_out += contribution;
 }
 /*******************************************************************************
 * Name: SeqConservedCurrentSiteFwd
 * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 *        - Pass in q_in shifted in +ve mu direction.
 ******************************************************************************/
 template<class Impl>
 void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
                                                     SitePropagator &q_out,
                                                     DoubledGaugeFieldView &U,
                                                     unsigned int sU,
                                                     unsigned int mu,
                                                     vPredicate t_mask,
                                                     bool switch_sign)
 {
  // Link in direction mu at site sU applied to q_in, then the forward
  // spin structure.
  SitePropagator hop;
  Impl::multLink(hop, U[sU], q_in, mu);
  hop = WilsonCurrentFwd(hop, mu);

  // Keep only the entries selected by t_mask; the rest become 0.*hop.
  hop = predicatedWhere(t_mask, hop, 0.*hop);

  // Forward term is added by default and subtracted when the sign is switched.
  if (!switch_sign) {
    q_out += hop;
    return;
  }
  q_out -= hop;
 }
 /*******************************************************************************
 * Name: SeqConservedCurrentSiteBwd
 * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
 * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
 *        - Pass in q_in shifted in -ve mu direction.
 ******************************************************************************/
 template<class Impl>
 void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
                                                     SitePropagator &q_out,
                                                     DoubledGaugeFieldView &U,
                                                     unsigned int sU,
                                                     unsigned int mu,
                                                     vPredicate t_mask,
                                                     bool switch_sign)
 {
  // Link with index mu + Nd at site sU applied to q_in, then the backward
  // spin structure.
  SitePropagator hop;
  Impl::multLink(hop, U[sU], q_in, mu + Nd);
  hop = WilsonCurrentBwd(hop, mu);

  // Keep only the entries selected by t_mask; the rest become 0.*hop.
  hop = predicatedWhere(t_mask, hop, 0.*hop);

  // Backward term is subtracted by default and added when the sign is switched.
  if (!switch_sign) {
    q_out -= hop;
    return;
  }
  q_out += hop;
 }
 NAMESPACE_END(Grid);
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of CayleyFermion5D for that implementation.
 template class CayleyFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../CayleyFermion5DInstantiation.cc.master
@@ -1,38 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
 #include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of ContinuedFractionFermion5D for that implementation.
 template class ContinuedFractionFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
 Copyright (C) 2017
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 #include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of DomainWallEOFAFermion for that implementation.
 template class DomainWallEOFAFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../DomainWallEOFAFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
 Copyright (C) 2017
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 #include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of MobiusEOFAFermion for that implementation.
 template class MobiusEOFAFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../MobiusEOFAFermionInstantiation.cc.master
@@ -1,39 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
 #include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of PartialFractionFermion5D for that implementation.
 template class PartialFractionFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../PartialFractionFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
    Copyright (C) 2017
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of WilsonCloverFermion for that implementation.
 template class WilsonCloverFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonCloverFermionInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of WilsonFermion5D for that implementation.
 template class WilsonFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the namespace; presumably it defines
 // the IMPLEMENTATION macro used below -- confirm against the build setup.
 #include "impl.h"
 // Explicit instantiation of WilsonFermion for that implementation.
 template class WilsonFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonFermionInstantiation.cc.master
@@ -1,74 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // Original author's note: "Move these". The note does not say where to;
 // presumably it refers to the specialisations below -- TODO confirm.
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // G-parity requires a more specialised implementation of the conserved
 // current contractions. Until one exists, both explicit specialisations
 // below are stubs whose only statement is assert(0).
 // Caution: when NDEBUG is defined assert() expands to nothing, so the stubs
 // then return normally without writing q_out.
 template <>
 void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1, 
 							  const SitePropagator &q_in_2, 
 							  SitePropagator &q_out, 
 							  DoubledGaugeFieldView &U,
 							  unsigned int sU, 
 							  unsigned int mu, 
 							  bool switch_sign)
 {			
  // Not implemented for this IMPLEMENTATION: no argument is read or written.
  assert(0);		
 }			
 // NOTE(review): the two unsigned int parameters are named (mu, sU) here but
 // (sU, mu) in ContractConservedCurrentSiteFwd above. Both are unsigned int,
 // so the signature is the same either way and the names are unused in this
 // stub; confirm the intended order against the class declaration.
 template <>		
 void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SitePropagator &q_in_1,
 							   const SitePropagator &q_in_2,
 							   SitePropagator &q_out,
 							   DoubledGaugeFieldView &U,
 							   unsigned int mu,
 							   unsigned int sU,
 							   bool switch_sign)
 {			
  // Not implemented for this IMPLEMENTATION: no argument is read or written.
  assert(0);		
 }
 // HAND_SPECIALISE_GPARITY is a macro defined outside this file (presumably in
 // WilsonKernelsHandGparityImplementation.h -- TODO confirm); it is invoked
 // before the explicit instantiation of the class.
 HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
 // Explicit instantiation definition of WilsonKernels for IMPLEMENTATION.
 template class WilsonKernels<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiationGparity.cc.master
@@ -1,37 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of WilsonTMFermion for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class WilsonTMFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonTMFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of CayleyFermion5D for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class CayleyFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../CayleyFermion5DInstantiation.cc.master
@@ -1,38 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
 #include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of ContinuedFractionFermion5D for
 // whatever type IMPLEMENTATION expands to in this translation unit.
 template class ContinuedFractionFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
 Copyright (C) 2017
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 #include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of DomainWallEOFAFermion for whatever
 // type IMPLEMENTATION expands to in this translation unit.
 template class DomainWallEOFAFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../DomainWallEOFAFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
 Copyright (C) 2017
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 #include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of MobiusEOFAFermion for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class MobiusEOFAFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../MobiusEOFAFermionInstantiation.cc.master
@@ -1,39 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
 #include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of PartialFractionFermion5D for whatever
 // type IMPLEMENTATION expands to in this translation unit.
 template class PartialFractionFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../PartialFractionFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
    Copyright (C) 2017
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of WilsonCloverFermion for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class WilsonCloverFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonCloverFermionInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of WilsonFermion5D for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class WilsonFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of WilsonFermion for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class WilsonFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonFermionInstantiation.cc.master
@@ -1,74 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // Original author's note: "Move these". The note does not say where to;
 // presumably it refers to the specialisations below -- TODO confirm.
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // G-parity requires a more specialised implementation of the conserved
 // current contractions. Until one exists, both explicit specialisations
 // below are stubs whose only statement is assert(0).
 // Caution: when NDEBUG is defined assert() expands to nothing, so the stubs
 // then return normally without writing q_out.
 template <>
 void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1, 
 							  const SitePropagator &q_in_2, 
 							  SitePropagator &q_out, 
 							  DoubledGaugeFieldView &U,
 							  unsigned int sU, 
 							  unsigned int mu, 
 							  bool switch_sign)
 {			
  // Not implemented for this IMPLEMENTATION: no argument is read or written.
  assert(0);		
 }			
 // NOTE(review): the two unsigned int parameters are named (mu, sU) here but
 // (sU, mu) in ContractConservedCurrentSiteFwd above. Both are unsigned int,
 // so the signature is the same either way and the names are unused in this
 // stub; confirm the intended order against the class declaration.
 template <>		
 void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SitePropagator &q_in_1,
 							   const SitePropagator &q_in_2,
 							   SitePropagator &q_out,
 							   DoubledGaugeFieldView &U,
 							   unsigned int mu,
 							   unsigned int sU,
 							   bool switch_sign)
 {			
  // Not implemented for this IMPLEMENTATION: no argument is read or written.
  assert(0);		
 }
 // HAND_SPECIALISE_GPARITY is a macro defined outside this file (presumably in
 // WilsonKernelsHandGparityImplementation.h -- TODO confirm); it is invoked
 // before the explicit instantiation of the class.
 HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
 // Explicit instantiation definition of WilsonKernels for IMPLEMENTATION.
 template class WilsonKernels<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiationGparity.cc.master
@@ -1,37 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of WilsonTMFermion for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class WilsonTMFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonTMFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of CayleyFermion5D for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class CayleyFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../CayleyFermion5DInstantiation.cc.master
@@ -1,38 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
 #include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of ContinuedFractionFermion5D for
 // whatever type IMPLEMENTATION expands to in this translation unit.
 template class ContinuedFractionFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
 Copyright (C) 2017
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 #include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of DomainWallEOFAFermion for whatever
 // type IMPLEMENTATION expands to in this translation unit.
 template class DomainWallEOFAFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../DomainWallEOFAFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
 Copyright (C) 2017
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #include <Grid/Grid_Eigen_Dense.h>
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 #include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of MobiusEOFAFermion for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class MobiusEOFAFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../MobiusEOFAFermionInstantiation.cc.master
@@ -1,39 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
 #include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of PartialFractionFermion5D for whatever
 // type IMPLEMENTATION expands to in this translation unit.
 template class PartialFractionFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../PartialFractionFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid
    Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
    Copyright (C) 2017
    Author: paboyle <paboyle@ph.ed.ac.uk>
    Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/qcd/spin/Dirac.h>
 #include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // impl.h is textually included inside the Grid namespace. IMPLEMENTATION is
 // not defined in this file; presumably impl.h supplies it -- TODO confirm.
 #include "impl.h"
 // Explicit instantiation definition of WilsonCloverFermion for whatever type
 // IMPLEMENTATION expands to in this translation unit.
 template class WilsonCloverFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonCloverFermionInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h>
 // Translation unit whose only job is to explicitly instantiate
 // WilsonFermion5D for a single implementation type.
 NAMESPACE_BEGIN(Grid);
 // impl.h is included inside the namespace; presumably it supplies the
 // IMPLEMENTATION macro used on the next line -- TODO confirm against impl.h.
 #include "impl.h"
 // Explicit instantiation definition: emits all members of the class template here.
 template class WilsonFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonFermion5DInstantiation.cc.master
@@ -1,40 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h>
 // Translation unit whose only job is to explicitly instantiate
 // WilsonFermion for a single implementation type.
 NAMESPACE_BEGIN(Grid);
 // impl.h is included inside the namespace; presumably it supplies the
 // IMPLEMENTATION macro used on the next line -- TODO confirm against impl.h.
 #include "impl.h"
 // Explicit instantiation definition: emits all members of the class template here.
 template class WilsonFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonFermionInstantiation.cc.master
@@ -1,74 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
 NAMESPACE_BEGIN(Grid);
 // Move these
 // impl.h is included inside the namespace; presumably it supplies the
 // IMPLEMENTATION macro used throughout this file -- TODO confirm against impl.h.
 #include "impl.h"
 // G-parity requires more specialised implementation.
 // Explicit specialisation of ContractConservedCurrentSiteFwd for IMPLEMENTATION.
 // The body is only assert(0): no argument is read and q_out is never written.
 // With assertions enabled a call aborts; when assert is compiled out (NDEBUG)
 // the call returns silently, leaving q_out untouched.
 template <>
 void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1, 
 							  const SitePropagator &q_in_2, 
 							  SitePropagator &q_out, 
 							  DoubledGaugeFieldView &U,
 							  unsigned int sU, 
 							  unsigned int mu, 
 							  bool switch_sign)
 {			
  // Placeholder: no implementation is provided for this IMPLEMENTATION.
  assert(0);		
 }			
 // Explicit specialisation of ContractConservedCurrentSiteBwd for IMPLEMENTATION.
 // Same placeholder body as the Fwd specialisation: assert(0) only, q_out is
 // never written, and the call is a silent no-op when assert is compiled out.
 // NOTE(review): the index parameters are named (mu, sU) here but (sU, mu) in
 // the Fwd specialisation; both are unsigned int, so the compiler will not
 // catch a swap -- confirm the order against the class declaration.
 template <>		
 void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SitePropagator &q_in_1,
 							   const SitePropagator &q_in_2,
 							   SitePropagator &q_out,
 							   DoubledGaugeFieldView &U,
 							   unsigned int mu,
 							   unsigned int sU,
 							   bool switch_sign)
 {			
  // Placeholder: no implementation is provided for this IMPLEMENTATION.
  assert(0);		
 }
 // HAND_SPECIALISE_GPARITY is a macro defined outside this file; presumably it
 // emits the hand-written kernel specialisations for IMPLEMENTATION -- TODO confirm.
 HAND_SPECIALISE_GPARITY(IMPLEMENTATION);
 // Explicit instantiation definition: emits the remaining WilsonKernels members
 // for IMPLEMENTATION. It follows the specialisations above, as the language
 // requires specialisations to be declared before they are instantiated.
 template class WilsonKernels<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonKernelsInstantiationGparity.cc.master
@@ -1,37 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
    Copyright (C) 2015
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/WilsonTMFermion.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h>
 // Translation unit whose only job is to explicitly instantiate
 // WilsonTMFermion for a single implementation type.
 NAMESPACE_BEGIN(Grid);
 // impl.h is included inside the namespace; presumably it supplies the
 // IMPLEMENTATION macro used on the next line -- TODO confirm against impl.h.
 #include "impl.h"
 // Explicit instantiation definition: emits all members of the class template here.
 template class WilsonTMFermion<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../WilsonTMFermionInstantiation.cc.master
@@ -1,44 +0,0 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
 			   //#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
 // Translation unit whose only job is to explicitly instantiate
 // CayleyFermion5D for a single implementation type.
 NAMESPACE_BEGIN(Grid);
 // impl.h is included inside the namespace; presumably it supplies the
 // IMPLEMENTATION macro used on the next line -- TODO confirm against impl.h.
 #include "impl.h"
 // Explicit instantiation definition: emits all members of the class template here.
 template class CayleyFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
@@ -0,0 +1 @@
 ../CayleyFermion5DInstantiation.cc.master
@@ -1,38 +0,0 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
    Copyright (C) 2015
 Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
 #include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
 // Translation unit whose only job is to explicitly instantiate
 // ContinuedFractionFermion5D for a single implementation type.
 NAMESPACE_BEGIN(Grid);
 // impl.h is included inside the namespace; presumably it supplies the
 // IMPLEMENTATION macro used on the next line -- TODO confirm against impl.h.
 #include "impl.h"
 // Explicit instantiation definition: emits all members of the class template here.
 template class ContinuedFractionFermion5D<IMPLEMENTATION>; 
 NAMESPACE_END(Grid);
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1 @@`
							`../ContinuedFractionFermion5DInstantiation.cc.master`
		`@@ -0,0 +1 @@`
							`../DomainWallEOFAFermionInstantiation.cc.master`
		`@@ -0,0 +1 @@`
							`../MobiusEOFAFermionInstantiation.cc.master`
		`@@ -0,0 +1 @@`
							`../PartialFractionFermion5DInstantiation.cc.master`
		`@@ -0,0 +1 @@`
							`../WilsonCloverFermionInstantiation.cc.master`
		`@@ -0,0 +1 @@`
							`../WilsonKernelsInstantiationGparity.cc.master`