finer timers in Benchmark_IO

Update README.md
removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore)
2025-10-31 03:54:33 +00:00 · 2021-06-17 11:57:02 +01:00 · 2021-06-06 04:52:05 -04:00 · 2021-06-04 11:12:22 +01:00 · 2021-06-03 04:24:19 +00:00 · 2021-05-05 14:17:18 -07:00
98 changed files with 3414 additions and 2299 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,56 +0,0 @@
-language: cpp
-
-cache:
-  directories:
-    - clang
-
-matrix:
-  include:
-    - os:        osx
-      osx_image: xcode8.3
-      compiler: clang
-      
-before_install:
-    - export GRIDDIR=`pwd`
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
-    
-install:
-    - export CWD=`pwd`
-    - echo $CWD
-    - export CC=$CC$VERSION
-    - export CXX=$CXX$VERSION
-    - echo $PATH
-    - which autoconf
-    - autoconf  --version
-    - which automake
-    - automake  --version
-    - which $CC
-    - $CC  --version
-    - which $CXX
-    - $CXX --version
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
-    
-script:
-    - ./bootstrap.sh
-    - mkdir build
-    - cd build
-    - mkdir lime
-    - cd lime
-    - mkdir build
-    - cd build
-    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
-    - tar xf lime-1.3.2.tar.gz
-    - cd lime-1.3.2
-    - ./configure --prefix=$CWD/build/lime/install
-    - make -j4
-    - make install
-    - cd $CWD/build
-    - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
-    - make -j4 
-    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
-    - make check
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -54,9 +54,11 @@ Version.h: version-cache
 include Make.inc
 include Eigen.inc

-#extra_sources+=$(ZWILS_FERMION_FILES)
 extra_sources+=$(WILS_FERMION_FILES)
 extra_sources+=$(STAG_FERMION_FILES)
+if BUILD_ZMOBIUS
+  extra_sources+=$(ZWILS_FERMION_FILES)
+endif
 if BUILD_GPARITY
  extra_sources+=$(GP_FERMION_FILES)
 endif
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -7,6 +7,7 @@
    Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -169,6 +170,23 @@ static inline int divides(int a,int b)
 }
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
+  ////////////////////////////////////////////////////////////////
+  // Allow user to configure through environment variable
+  ////////////////////////////////////////////////////////////////
+  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
+  if ( str ) {
+    std::vector<int> IntShmDims;
+    GridCmdOptionIntVector(std::string(str),IntShmDims);
+    assert(IntShmDims.size() == WorldDims.size());
+    long ShmSize = 1;
+    for (int dim=0;dim<WorldDims.size();dim++) {
+      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
+      assert(divides(ShmDims[dim],WorldDims[dim]));
+    }
+    assert(ShmSize == WorldShmSize);
+    return;
+  }
+  
  ////////////////////////////////////////////////////////////////
  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
--- a/Grid/cshift/Cshift_common.h
+++ b/Grid/cshift/Cshift_common.h
@@ -112,7 +112,9 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  if ( cbmask ==0x3){
 #ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;
 	int o      =   n*n1;
 	int offset = b+n*e2;
 	
@@ -135,7 +137,9 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;

 	Coordinate coor;

@@ -257,7 +261,9 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
    int _slice_block = rhs.Grid()->_slice_block[dimension];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v , rhs, AcceleratorWrite);
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;
 	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
@@ -274,7 +280,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA

    // Case of SIMD split AND checker dim cannot currently be hit, except in 
    // Test_cshift_red_black code.
-    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
    assert(0); // This will fail if hit on GPU
    autoView( rhs_v, rhs, CpuWrite);
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -122,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf(buffer_size);
-  cshiftVector<vobj> recv_buf(buffer_size);
+  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
    
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -198,8 +198,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
 
@@ -294,8 +294,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf_v(buffer_size);
-  cshiftVector<vobj> recv_buf_v(buffer_size);
+  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
  vobj *send_buf;
  vobj *recv_buf;
  {
@@ -381,8 +381,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -97,6 +97,20 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
  out = in;
 }

+template<typename T>
+accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
+  out = in;
+}
+
+// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
+/*template<typename T1, typename T2>
+accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
+// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
+//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
+convertType(T1 & out, const T2 & in) {
+  out = in;
+}*/
+
 #ifdef GRID_SIMT
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
  ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
@@ -117,23 +131,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
 }

-template<typename T1,typename T2,int N>
-  accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
-template<typename T1,typename T2,int N>
-  accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
-
-template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
-accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
-  convertType(out,in._internal);
+template<typename T1,typename T2>
+accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) {
+  convertType(out._internal,in._internal);
 }

-template<typename T1, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
-accelerator_inline void convertType(T1 & out, const iScalar<T1> & in) {
+template<typename T1,typename T2>
+accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) {
  convertType(out,in._internal);
 }

 template<typename T1,typename T2>
-accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
+accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) {
  convertType(out._internal,in);
 }

@@ -150,11 +159,6 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
    convertType(out._internal[i],in._internal[i]);
 }

-template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
-accelerator_inline void convertType(T & out, const T & in) {
-  out = in;
-}
-
 template<typename T1,typename T2>
 accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
  autoView( out_v , out,AcceleratorWrite);
--- a/Grid/lattice/Lattice_view.h
+++ b/Grid/lattice/Lattice_view.h
@@ -71,8 +71,8 @@ public:
  //  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
  accelerator_inline vobj       & operator[](size_t i) const { return this->_odata[i]; };
 #else
-  //  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
-  //  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
+  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
 #endif
  
  accelerator_inline uint64_t begin(void) const { return 0;};
--- a/Grid/lattice/Lattice_where.h
+++ b/Grid/lattice/Lattice_where.h
@@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  conformable(iftrue,predicate);
  conformable(iftrue,ret);

-  GridBase *grid=iftrue._grid;
+  GridBase *grid=iftrue.Grid();

  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
@@ -52,22 +52,23 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<

  const int Nsimd = grid->Nsimd();

-  std::vector<Integer> mask(Nsimd);
-  std::vector<scalar_object> truevals (Nsimd);
-  std::vector<scalar_object> falsevals(Nsimd);
-
-  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
-
-    extract(iftrue._odata[ss]   ,truevals);
-    extract(iffalse._odata[ss]  ,falsevals);
-    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
-
-    for(int s=0;s<Nsimd;s++){
-      if (mask[s]) falsevals[s]=truevals[s];
-    }
-
-    merge(ret._odata[ss],falsevals);
+  autoView(iftrue_v,iftrue,CpuRead);
+  autoView(iffalse_v,iffalse,CpuRead);
+  autoView(predicate_v,predicate,CpuRead);
+  autoView(ret_v,ret,CpuWrite);
+  Integer NN= grid->oSites();
+  thread_for(ss,NN,{
+    Integer mask;
+    scalar_object trueval;
+    scalar_object falseval;
+    for(int l=0;l<Nsimd;l++){
+      trueval =extractLane(l,iftrue_v[ss]);
+      falseval=extractLane(l,iffalse_v[ss]);
+      mask    =extractLane(l,predicate_v[ss]);
+      if (mask) falseval=trueval;
+      insertLane(l,ret_v[ss],falseval);
    }
+  });
 }

 template<class vobj,class iobj>
@@ -76,9 +77,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);

-  Lattice<vobj> ret(iftrue._grid);
+  Lattice<vobj> ret(iftrue.Grid());

-  where(ret,predicate,iftrue,iffalse);
+  whereWolf(ret,predicate,iftrue,iffalse);

  return ret;
 }
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@@ -271,7 +271,7 @@ class BinaryIO {
 			      uint32_t &scidac_csumb)
  {
    grid->Barrier();
-    GridStopWatch timer; 
+    GridStopWatch timer, insideTimer; 
    GridStopWatch bstimer;
    
    nersc_csum=0;
@@ -363,7 +363,10 @@ class BinaryIO {
 	std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
 	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
 	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
-	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
+  insideTimer.Start();
+	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);
+  insideTimer.Stop();
+  assert(ierr==0);
 	MPI_File_close(&fh);
 	MPI_Type_free(&fileArray);
 	MPI_Type_free(&localArray);
@@ -438,7 +441,9 @@ class BinaryIO {
        assert(ierr == 0);

        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
+        insideTimer.Start();
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
+        insideTimer.Stop();
        assert(ierr == 0);

        MPI_Offset os;
@@ -516,8 +521,13 @@ class BinaryIO {
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
+    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" " // leading space keeps the size and its unit apart
 	     << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
+    std::cout << GridLogMessage << "IOobject: pure MPI IO call " 
+              << lastPerf.size <<" bytes in " 
+              << insideTimer.Elapsed() << " "
+              << lastPerf.size/1024./1024./(insideTimer.useconds()/1.0e6)
+              <<" MB/s "<<std::endl;

    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;

--- a/Grid/parallelIO/MetaData.h
+++ b/Grid/parallelIO/MetaData.h
@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
  std::time_t t = std::time(nullptr);
  std::tm tm_ = *std::localtime(&t);
  std::ostringstream oss; 
-  //      oss << std::put_time(&tm_, "%c %Z");
+  oss << std::put_time(&tm_, "%c %Z");
  header.creation_date = oss.str();
  header.archive_date  = header.creation_date;

--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -205,11 +205,20 @@ public:
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }

+  // Preferred interface: passes two_row=0, bits32=1 and forwards the caller's GaugeStats to the general overload
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
+					std::string file, 
+					std::string ens_label = std::string("DWF"))
+  {
+    writeConfiguration<GaugeStats>(Umu,file,0,1,ens_label);
+  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
-					int bits32)
+					int bits32,
+					std::string ens_label = std::string("DWF"))
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
@@ -219,8 +228,8 @@ public:
    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
-    header.ensemble_id     = "UKQCD";
-    header.ensemble_label  = "DWF";
+    header.ensemble_id     = std::string("UKQCD");
+    header.ensemble_label  = ens_label;

    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
--- a/Grid/qcd/action/ActionBase.h
+++ b/Grid/qcd/action/ActionBase.h
@@ -41,7 +41,7 @@ class Action
 public:
  bool is_smeared = false;
  // Heatbath?
-  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
+  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
  virtual std::string action_name()    = 0;                             // return the action name
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

-#ifndef GRID_CUDA
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
-#endif
-
 NAMESPACE_END(Grid);

 ////////////////////
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@@ -153,8 +153,8 @@ public:
  typedef typename Impl::StencilImpl             StencilImpl;		\
  typedef typename Impl::ImplParams               ImplParams;	        \
  typedef typename Impl::StencilImpl::View_type  StencilView;		\
-  typedef typename ViewMap<FermionField>::Type      FermionFieldView;	\
-  typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
+  typedef const typename ViewMap<FermionField>::Type      FermionFieldView;	\
+  typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;

 #define INHERIT_IMPL_TYPES(Base)		\
  INHERIT_GIMPL_TYPES(Base)			\
@@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
-NAMESPACE_CHECK(ImplStaggered5dVec);  
+// Deprecate Vec5d
+//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
+//NAMESPACE_CHECK(ImplStaggered5dVec);  


--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -88,7 +88,7 @@ public:
 					  const _Spinor &chi, 
 					  int mu, 
 					  StencilEntry *SE,
-					  const StencilView &St) 
+					  StencilView &St) 
  {
    int direction = St._directions[mu];
    int distance  = St._distances[mu];
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -72,19 +72,23 @@ public:
    
  StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
      
-  static accelerator_inline void multLink(SiteSpinor &phi,
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
 		       const SiteDoubledGaugeField &U,
-		       const SiteSpinor &chi,
+		       const _Spinor &chi,
 		       int mu)
  {
-    mult(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &UU, &chi());
  }
-  static accelerator_inline void multLinkAdd(SiteSpinor &phi,
+  template<class _Spinor>
+  static accelerator_inline void multLinkAdd(_Spinor &phi,
 			  const SiteDoubledGaugeField &U,
-			  const SiteSpinor &chi,
+			  const _Spinor &chi,
 			  int mu)
  {
-    mac(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
+    mac(&phi(), &UU, &chi());
  }
      
  template <class ref>
--- a/Grid/qcd/action/fermion/StaggeredKernels.h
+++ b/Grid/qcd/action/fermion/StaggeredKernels.h
@@ -56,12 +56,8 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
 		 DoubledGaugeField &U,
 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior);
  
-  void DhopDirKernel(StencilImpl &st,
-		     const DoubledGaugeFieldView &U,
-		     const DoubledGaugeFieldView &UUU, SiteSpinor * buf,
-		     int sF, int sU,
-		     const FermionFieldView &in,
-		     const FermionFieldView &out, int dir,int disp);
+  void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
+		     int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
 protected:    

   ///////////////////////////////////////////////////////////////////////////////////////
@@ -69,67 +65,53 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
   ///////////////////////////////////////////////////////////////////////////////////////
   template<int Naik> 
   static accelerator_inline
-   void DhopSiteGeneric(const StencilView &st, 
-			const DoubledGaugeFieldView &U,
-			const DoubledGaugeFieldView &UUU, 
+   void DhopSiteGeneric(StencilView &st, 
+			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionFieldView &in,
-			const FermionFieldView &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);
   
   template<int Naik> static accelerator_inline
-   void DhopSiteGenericInt(const StencilView &st, 
-			   const DoubledGaugeFieldView &U,
-			   const DoubledGaugeFieldView &UUU, 
+   void DhopSiteGenericInt(StencilView &st, 
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, 
 			   SiteSpinor * buf, int LLs, int sU, 
-			   const FermionFieldView &in,
-			   const FermionFieldView &out,int dag);
+			   const FermionFieldView &in, FermionFieldView &out,int dag);
   
   template<int Naik> static accelerator_inline
-   void DhopSiteGenericExt(const StencilView &st, 
-			   const DoubledGaugeFieldView &U,
-			   const DoubledGaugeFieldView &UUU,
+   void DhopSiteGenericExt(StencilView &st, 
+			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 			   SiteSpinor * buf, int LLs, int sU, 
-			   const FermionFieldView &in,
-			   const FermionFieldView &out,int dag);
+			   const FermionFieldView &in, FermionFieldView &out,int dag);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   
   template<int Naik> static accelerator_inline
-   void DhopSiteHand(const StencilView &st, 
-		     const DoubledGaugeFieldView &U,
-		     const DoubledGaugeFieldView &UUU, 
+   void DhopSiteHand(StencilView &st, 
+		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		     SiteSpinor * buf, int LLs, int sU, 
-		     const FermionFieldView &in,
-		     const FermionFieldView &out,int dag);
+		     const FermionFieldView &in, FermionFieldView &out,int dag);
   
   template<int Naik> static accelerator_inline
-   void DhopSiteHandInt(const StencilView &st, 
-			const DoubledGaugeFieldView &U,
-			const DoubledGaugeFieldView &UUU, 
+   void DhopSiteHandInt(StencilView &st, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionFieldView &in,
-			const FermionFieldView &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);
   
   template<int Naik> static accelerator_inline
-   void DhopSiteHandExt(const StencilView &st, 
-			const DoubledGaugeFieldView &U,
-			const DoubledGaugeFieldView &UUU, 
+   void DhopSiteHandExt(StencilView &st, 
+			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 			SiteSpinor * buf, int LLs, int sU, 
-			const FermionFieldView &in,
-			const FermionFieldView &out,int dag);
+			const FermionFieldView &in, FermionFieldView &out,int dag);

   ///////////////////////////////////////////////////////////////////////////////////////
   // Asm Nc=3 specific kernels
   ///////////////////////////////////////////////////////////////////////////////////////
   
-   void DhopSiteAsm(const StencilView &st, 
-		    const DoubledGaugeFieldView &U,
-		    const DoubledGaugeFieldView &UUU, 
+   void DhopSiteAsm(StencilView &st, 
+		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, 
 		    SiteSpinor * buf, int LLs, int sU, 
-		    const FermionFieldView &in,
-		    const FermionFieldView &out,int dag);
+		    const FermionFieldView &in, FermionFieldView &out,int dag);
  
 public:

--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -245,7 +245,7 @@ public:
    return out;
  }

-private:
+protected:
  // here fixing the 4 dimensions, make it more general?

  RealD csw_r;                                               // Clover coefficient - spatial
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -72,7 +72,7 @@ public:
  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
  typedef WilsonImplParams ImplParams;
  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
-  typedef typename StencilImpl::View_type StencilView;
+  typedef const typename StencilImpl::View_type StencilView;
    
  ImplParams Params;

@@ -95,7 +95,7 @@ public:
 					  const _Spinor &chi,
 					  int mu,
 					  StencilEntry *SE,
-					  const StencilView &St) 
+					  StencilView &St) 
  {
    multLink(phi,U,chi,mu);
  }
@@ -184,18 +184,22 @@ public:
      mat = TraceIndex<SpinIndex>(P); 
    }
      
-    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
+    {
      for (int mu = 0; mu < Nd; mu++)
      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
    }

-
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
-      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
+  {
+#undef USE_OLD_INSERT_FORCE    
    int Ls=Btilde.Grid()->_fdimensions[0];
+    autoView( mat_v , mat, AcceleratorWrite);
+#ifdef USE_OLD_INSERT_FORCE    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
+      const int Nsimd = SiteSpinor::Nsimd();
      autoView( tmp_v , tmp, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
@@ -208,6 +212,29 @@ public:
 	});
    }
    PokeIndex<LorentzIndex>(mat,tmp,mu);
+#else
+    {
+      const int Nsimd = SiteSpinor::Nsimd();
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
+	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+	  zeroit(sum);  
+	  for(int s=0;s<Ls;s++){
+	    int sF = s+Ls*sU;
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+  	      auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF]()(spn) );
+	      auto op = outerProduct(bb,aa);
+  	      sum = sum + op;
+	    }
+	  }
+  	  coalescedWrite(mat_v[sU](mu)(), sum);
+      });
+    }
+#endif    
  }
 };

--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -50,9 +50,16 @@ public:
  INHERIT_IMPL_TYPES(Impl);
  typedef FermionOperator<Impl> Base;
  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
-   
 public:

+#ifdef GRID_SYCL
+#define SYCL_HACK
+#endif  
+#ifdef SYCL_HACK
+  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
+			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
+#endif
+  
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;
@@ -69,87 +76,73 @@ public:

 private:

-  static accelerator_inline void DhopDirK(const StencilView &st, const DoubledGaugeFieldView &U,
-					  SiteHalfSpinor * buf, int sF, int sU,
-					  const FermionFieldView &in,const FermionFieldView &out, int dirdisp, int gamma);
+  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
+				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);

-  static accelerator_inline void DhopDirXp(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirYp(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirZp(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirTp(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirXm(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirYm(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirZm(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirTm(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,
-					   const FermionFieldView &in, const FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirTp(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirTm(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
      
  // Specialised variants
-  static accelerator void GenericDhopSite(const StencilView &st,
-					  const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					  int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void GenericDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  static accelerator void GenericDhopSiteDag(const StencilView &st, const  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					     int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void GenericDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  
-  static accelerator void GenericDhopSiteInt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					     int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						    int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  static accelerator void GenericDhopSiteDagInt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-						int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  
-  static accelerator void GenericDhopSiteExt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					     int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
      
-  static accelerator void GenericDhopSiteDagExt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-						int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+						       int sF, int sU, const FermionFieldView &in, FermionFieldView &out);

-// Keep Hand unrolled 
-  static accelerator void HandDhopSiteSycl(StencilVector st_perm, StencilEntry *st_p,  SiteDoubledGaugeField *U, SiteHalfSpinor * buf,
-					   int sF, int sU, const SiteSpinor *in, SiteSpinor *out);
+  static void AsmDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			  int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
  
-  static accelerator void HandDhopSite(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-				       int sF, int sU, const FermionFieldView &in,const FermionFieldView &out);
+  static void AsmDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
  
-  static accelerator void HandDhopSiteDag(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					  int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static void AsmDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
  
-  static accelerator void HandDhopSiteInt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					  int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static void AsmDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);
  
-  static accelerator void HandDhopSiteDagInt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					     int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static void AsmDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,FermionFieldView &out);
  
-  static accelerator void HandDhopSiteExt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					  int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
+  static void AsmDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out);

-  static accelerator void HandDhopSiteDagExt(const StencilView &st, const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-					     int sF, int sU, const FermionFieldView &in, const FermionFieldView &out);
-  //AVX 512 ASM
-  static void AsmDhopSite(const StencilView &st,  const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-			  int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,const FermionFieldView &out);
+// Keep Hand unrolled temporarily  
+  static accelerator void HandDhopSite(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+				       int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  
-  static void AsmDhopSiteDag(const StencilView &st,  const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void HandDhopSiteDag(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  
-  static void AsmDhopSiteInt(const StencilView &st,  const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,const FermionFieldView &out);
+  static accelerator void HandDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  
-  static void AsmDhopSiteDagInt(const StencilView &st,  const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void HandDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  
-  static void AsmDhopSiteExt(const StencilView &st,  const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-			     int sF, int sU, int Ls, int Nsite, const FermionFieldView &in,const FermionFieldView &out);
-  
-  static void AsmDhopSiteDagExt(const StencilView &st,  const DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
-				int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, const FermionFieldView &out);
+  static accelerator void HandDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					  int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
  
+  static accelerator void HandDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
+					     int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
 public:
 WilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
 };
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -880,11 +880,23 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
+  Integer sign = 1; // sign flip for vector/tadpole
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
    }
  }
+  else if ( curr_type == Current::Tadpole ) {
+    auto b=this->_b;
+    auto c=this->_c;
+    if ( b == 1 && c == 0 ) {
+      sign = -1;    
+    }
+    else {
+      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
+      assert(b==1 && c==0);
+    }
+  }

  for(int s=0;s<Ls;s++){

@@ -907,7 +919,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
-    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated

--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
@@ -618,13 +618,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);

 template <class Impl>
-void StaggeredKernels<Impl>::DhopSiteAsm(const StencilView &st,
-					 const DoubledGaugeFieldView &U,
-					 const DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
+					 DoubledGaugeFieldView &U,
+					 DoubledGaugeFieldView &UUU,
 					 SiteSpinor *buf, int sF,
-					 int sU,
-					 const FermionFieldView &in,
-					 const FermionFieldView &out,int dag) 
+					 int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  assert(0); // generic template: the assertion always fails; explicit specialisations of DhopSiteAsm appear later in this file
 };
@@ -682,16 +680,15 @@ void StaggeredKernels<Impl>::DhopSiteAsm(const StencilView &st,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
  
-
+#undef STAG_VEC5D
+#ifdef STAG_VEC5D
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(const StencilView &st,
-								    const DoubledGaugeFieldView &U,
-								    const DoubledGaugeFieldView &UUU,
+template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
+								    DoubledGaugeFieldView &U,
+								    DoubledGaugeFieldView &UUU,
 								    SiteSpinor *buf, int sF,
-								    int sU,
-								    const FermionFieldView &in,
-								    const FermionFieldView &out,int dag) 
+								    int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -742,13 +739,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(const Stenci
 }

 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(const StencilView &st, 
-								    const DoubledGaugeFieldView &U,
-								    const DoubledGaugeFieldView &UUU,
+template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st, 
+								    DoubledGaugeFieldView &U,
+								    DoubledGaugeFieldView &UUU,
 								    SiteSpinor *buf, int sF,
-								    int sU,
-								    const FermionFieldView &in,
-								    const FermionFieldView &out, int dag) 
+								    int sU, const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -796,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(const Stenci
 #endif
 }
   
-   
+#endif   


 #define PERMUTE_DIR3 __asm__ (	\
@@ -830,13 +825,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(const Stenci
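  // This is the single precision kernel for StaggeredImplF (NOTE(review): the original "5th direction vectorised" wording matches the StaggeredVec5dImplF comment earlier in this file and looks copy-pasted -- confirm)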

 #include <Grid/simd/Intel512single.h>
-template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(const StencilView &st, 
-							       const DoubledGaugeFieldView &U,
-							       const DoubledGaugeFieldView &UUU,
+template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st, 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
 							       SiteSpinor *buf, int sF,
-							       int sU,
-							       const FermionFieldView &in,
-							       const FermionFieldView &out,int dag) 
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
@@ -901,13 +894,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(const StencilView
 }

 #include <Grid/simd/Intel512double.h>
-template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(const StencilView &st, 
-							       const DoubledGaugeFieldView &U,
-							       const DoubledGaugeFieldView &UUU,
+template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st, 
+							       DoubledGaugeFieldView &U,
+							       DoubledGaugeFieldView &UUU,
 							       SiteSpinor *buf, int sF,
-							       int sU,
-							       const FermionFieldView &in,
-							       const FermionFieldView &out,int dag) 
+							       int sU, const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
 #ifdef AVX512
  uint64_t gauge0,gauge1,gauge2,gauge3;
--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h
@@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-#define LOAD_CHI(b)		\
+#ifdef GRID_SIMT // SIMT variants: every read goes through coalescedRead / coalescedReadPermute and takes the local variable lane
+
+#define LOAD_CHI(ptype,b)			\
  const SiteSpinor & ref (b[offset]);				\
-    Chi_0=ref()()(0);\
-    Chi_1=ref()()(1);\
+  Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane);	\
+  Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane);	\
+  Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
+
+#define LOAD_CHI_COMMS(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+  Chi_0=coalescedRead(ref()()(0),lane);	\
+  Chi_1=coalescedRead(ref()()(1),lane);	\
+  Chi_2=coalescedRead(ref()()(2),lane);
+
+#define PERMUTE_DIR(dir)	; // expands to an empty statement here; LOAD_CHI above already passes perm to coalescedReadPermute (presumably that applies the permutation -- confirm)
+#else // GRID_SIMT not defined: plain component reads, with PERMUTE_DIR applying permute##dir to Chi_0..Chi_2 in place
+#define LOAD_CHI(ptype,b)      LOAD_CHI_COMMS(b) // ptype is ignored in this variant
+
+#define LOAD_CHI_COMMS(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+  Chi_0=ref()()(0);			\
+  Chi_1=ref()()(1);			\
  Chi_2=ref()()(2);
 
+#define PERMUTE_DIR(dir)			\
+  permute##dir(Chi_0,Chi_0);			\
+  permute##dir(Chi_1,Chi_1);			\
+  permute##dir(Chi_2,Chi_2);
+
+#endif // GRID_SIMT
+

 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0  = U_00*Chi_0;	       \
    UChi ## _1  = U_10*Chi_0;\
    UChi ## _2  = U_20*Chi_0;\
@@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);

 #define MULT_ADD(U,A,UChi)			\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0 += U_00*Chi_0;	       \
    UChi ## _1 += U_10*Chi_0;\
    UChi ## _2 += U_20*Chi_0;\
@@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
    UChi ## _2 += U_22*Chi_2;


-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-
 #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);	\
  offset = SE->_offset;			\
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(in);					\
+    LOAD_CHI(Perm,in);						\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
  } else {							\
-    LOAD_CHI(buf);						\
+    LOAD_CHI_COMMS(buf);					\
  }								

 #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
@@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
  }


-
 #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(in);				\
+    LOAD_CHI(Perm,in);					\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
+    LOAD_CHI_COMMS(buf);				\
  }							\
  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
@@ -140,45 +158,51 @@ NAMESPACE_BEGIN(Grid);
  local  = SE->_is_local;				\
  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
+    { LOAD_CHI_COMMS(buf);	  }				\
    { MULT_ADD(U,Dir,even); }					\
  }								

+#define HAND_DECLARATIONS(Simd) /* declares the even_*, odd_*, Chi_* and U_* temporaries of the hand-unrolled kernels, all of type Simd */ \
+  Simd even_0;			\
+  Simd even_1;			\
+  Simd even_2;			\
+  Simd odd_0;			\
+  Simd odd_1;			\
+  Simd odd_2;		        \
+		      		\
+  Simd Chi_0;			\
+  Simd Chi_1;			\
+  Simd Chi_2;			\
+				\
+  Simd U_00;			\
+  Simd U_10;			\
+  Simd U_20;			\
+  Simd U_01;			\
+  Simd U_11;			\
+  Simd U_21;			\
+  Simd U_02;			\
+  Simd U_12;			\
+  Simd U_22;			// U_rc is filled from ref()(r,c) by the MULT / MULT_ADD macros
+  

 template <class Impl>
 template <int Naik> accelerator_inline
-void StaggeredKernels<Impl>::DhopSiteHand(const StencilView &st,
-					  const DoubledGaugeFieldView &U,
-					  const DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
+					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
 					  SiteSpinor *buf, int sF, int sU, 
-					  const FermionFieldView &in,
-					  const FermionFieldView &out,int dag) 
+					  const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset,local,perm, ptype;

  StencilEntry *SE;
@@ -217,45 +241,28 @@ void StaggeredKernels<Impl>::DhopSiteHand(const StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }


 template <class Impl>
 template <int Naik> accelerator_inline
-void StaggeredKernels<Impl>::DhopSiteHandInt(const StencilView &st, 
-					     const DoubledGaugeFieldView &U,
-					     const DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, 
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
-					     const FermionFieldView &in,
-					     const FermionFieldView &out,int dag) 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local, perm;

  StencilEntry *SE;
@@ -265,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(const StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);

    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@@ -298,45 +305,28 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(const StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }


 template <class Impl>
 template <int Naik> accelerator_inline
-void StaggeredKernels<Impl>::DhopSiteHandExt(const StencilView &st,
-					     const DoubledGaugeFieldView &U,
-					     const DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
-					     const FermionFieldView &in,
-					     const FermionFieldView &out,int dag) 
+					     const FermionFieldView &in, FermionFieldView &out,int dag) 
 {
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local;

  StencilEntry *SE;
@@ -346,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(const StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@@ -380,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(const StencilView &st,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      out[sF] = out[sF] + result;
+      coalescedWrite(out[sF] , out(sF)+ result);
    }
  }
 }
@@ -403,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(const StencilView &st,
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 */
 #undef LOAD_CHI
+#undef HAND_DECLARATIONS

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid);
 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);			\
+    int perm= SE->_permute;						\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
  } else {							\
-      chi_p = &in[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
  }								\
-  } else {							\
-    chi_p = &buf[SE->_offset];					\
-  }								\
-  multLink(Uchi, U[sU], *chi_p, Dir);			
+  acceleratorSynchronise();					\
+  multLink(Uchi, U[sU], chi, Dir);			

 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);			\
-    } else {							\
-      chi_p = &in[SE->_offset];					\
-    }								\
+    int perm= SE->_permute;						\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
  } else if ( st.same_node[Dir] ) {				\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane);                 \
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }

 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }

 template <class Impl>
@@ -79,17 +72,19 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl>
 template <int Naik> accelerator_inline
-void StaggeredKernels<Impl>::DhopSiteGeneric(const StencilView &st, 
-					     const DoubledGaugeFieldView &U, const DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, 
+					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 					     SiteSpinor *buf, int sF, int sU, 
-					     const FermionFieldView &in, const FermionFieldView &out, int dag) 
+					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //
@@ -118,7 +113,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(const StencilView &st,
    if ( dag ) { 
      Uchi = - Uchi;
    } 
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };

@@ -127,17 +122,19 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(const StencilView &st,
  ///////////////////////////////////////////////////
 template <class Impl>
 template <int Naik> accelerator_inline
-void StaggeredKernels<Impl>::DhopSiteGenericInt(const StencilView &st, 
-						const DoubledGaugeFieldView &U, const DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
-						const FermionFieldView &in, const FermionFieldView &out,int dag)
+						const FermionFieldView &in, FermionFieldView &out,int dag)
 {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew ;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -166,7 +163,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(const StencilView &st,
    if ( dag ) {
      Uchi = - Uchi;
    }
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };

@@ -176,20 +173,20 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(const StencilView &st,
  ///////////////////////////////////////////////////
 template <class Impl>
 template <int Naik> accelerator_inline
-void StaggeredKernels<Impl>::DhopSiteGenericExt(const StencilView &st, 
-						const DoubledGaugeFieldView &U,
-						const DoubledGaugeFieldView &UUU,
+void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
+						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
-						const FermionFieldView &in,
-						const FermionFieldView &out,int dag)
+						const FermionFieldView &in, FermionFieldView &out,int dag)
 {
-  const SiteSpinor *chi_p;
-  //  SiteSpinor chi;
-  SiteSpinor Uchi;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int nmu=0;
  int skew ;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -216,10 +213,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(const StencilView &st,
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
    }
    if ( nmu ) {
+      auto _out = coalescedRead(out[sF],lane);
      if ( dag ) {
-	out[sF] = out[sF] - Uchi;
+	coalescedWrite(out[sF], _out-Uchi,lane);
      } else { 
-	out[sF] = out[sF] + Uchi;
+	coalescedWrite(out[sF], _out+Uchi,lane);
      }
    }
  }
@@ -229,13 +227,8 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(const StencilView &st,
 // Driving / wrapping routine to select right kernel
 ////////////////////////////////////////////////////////////////////////////////////
 template <class Impl> 
-void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st,
-					   const DoubledGaugeFieldView &U,
-					   const DoubledGaugeFieldView &UUU,
-					   SiteSpinor * buf,
-					   int sF, int sU,
-					   const FermionFieldView &in,
-					   const FermionFieldView &out, int dir,int disp)
+void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
+					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
 {
  // Disp should be either +1,-1,+3,-3
  // What about "dag" ?
@@ -263,14 +256,15 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st,
  });

 template <class Impl> 
-void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
-					  LebesgueOrder &lo, 
+void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, 
 					  DoubledGaugeField &U, DoubledGaugeField &UUU, 
 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v , UUU, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
@@ -311,6 +305,8 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v ,   U, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -397,6 +397,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@@ -408,6 +409,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -420,6 +422,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h
@@ -38,46 +38,46 @@ NAMESPACE_BEGIN(Grid);

 ///////////////////////////////////////////////////////////
 // Default to no assembler implementation
-// Will specialise to AVX512 if available
+// Will specialise to an architecture-specific assembler implementation if available
 ///////////////////////////////////////////////////////////
 template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSite(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-				  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl >::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				  int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
  assert(0);
 }

 template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDag(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl >::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
  assert(0);
 }

 template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteInt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl >::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
  assert(0);
 }

 template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagInt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
  assert(0);
 }

 template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteExt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl >::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+				     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
  assert(0);
 }

 template<class Impl> void 
-WilsonKernels<Impl >::AsmDhopSiteDagExt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 {
  assert(0);
 }
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
@@ -647,13 +647,8 @@ NAMESPACE_BEGIN(Grid);

 #define HAND_SPECIALISE_GPARITY(IMPL)					\
  template<> accelerator_inline void						\
-  WilsonKernels<IMPL>::HandDhopSiteSycl(StencilVector st_perm, StencilEntry *st_p, \
-					SiteDoubledGaugeField *U, SiteHalfSpinor * buf, \
-					int sF, int sU, const SiteSpinor *in, SiteSpinor *out) {} \
-  									\
-  template<> accelerator_inline void						\
-  WilsonKernels<IMPL>::HandDhopSite(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-				    int ss,int sU,const FermionFieldView &in, const FermionFieldView &out) \
+  WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -668,8 +663,8 @@ NAMESPACE_BEGIN(Grid);
  }									\
 									\
  template<> accelerator_inline void						\
-  WilsonKernels<IMPL>::HandDhopSiteDag(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-				       int ss,int sU,const FermionFieldView &in, const FermionFieldView &out) \
+  WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -684,8 +679,8 @@ NAMESPACE_BEGIN(Grid);
  }									\
 									\
  template<> accelerator_inline void						\
-  WilsonKernels<IMPL>::HandDhopSiteInt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-				       int ss,int sU,const FermionFieldView &in, const FermionFieldView &out) \
+  WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -700,8 +695,8 @@ NAMESPACE_BEGIN(Grid);
  }									\
 									\
  template<> accelerator_inline void						\
-  WilsonKernels<IMPL>::HandDhopSiteDagInt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-					  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out) \
+  WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -716,8 +711,8 @@ NAMESPACE_BEGIN(Grid);
  }									\
 									\
  template<> accelerator_inline void							\
-  WilsonKernels<IMPL>::HandDhopSiteExt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
-				       int ss,int sU,const FermionFieldView &in, const FermionFieldView &out) \
+  WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \
+				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
@@ -733,8 +728,8 @@ NAMESPACE_BEGIN(Grid);
    HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
  }									\
  template<> accelerator_inline void						\
-  WilsonKernels<IMPL>::HandDhopSiteDagExt(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
-					  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out) \
+  WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
  {									\
    typedef IMPL Impl;							\
    typedef typename Simd::scalar_type S;				\
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -76,7 +76,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define REGISTER

-#define LOAD_CHIMU \
+#ifdef GRID_SIMT
+#define LOAD_CHIMU(ptype)		\
+  {const SiteSpinor & ref (in[offset]);	\
+    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+#define PERMUTE_DIR(dir) ;
+#else
+#define LOAD_CHIMU(ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -91,55 +108,55 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_31=ref()(3)(1);\
    Chimu_32=ref()(3)(2);}

-#define LOAD_CHI\
-  {const SiteHalfSpinor &ref(buf[offset]);	\
-    Chi_00 = ref()(0)(0);\
-    Chi_01 = ref()(0)(1);\
-    Chi_02 = ref()(0)(2);\
-    Chi_10 = ref()(1)(0);\
-    Chi_11 = ref()(1)(1);\
-    Chi_12 = ref()(1)(2);}
-
-// To splat or not to splat depends on the implementation
-#define MULT_2SPIN(A)\
-  {auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));	\
-   Impl::loadLinkElement(U_10,ref()(1,0));	\
-   Impl::loadLinkElement(U_20,ref()(2,0));	\
-   Impl::loadLinkElement(U_01,ref()(0,1));	\
-   Impl::loadLinkElement(U_11,ref()(1,1));	\
-   Impl::loadLinkElement(U_21,ref()(2,1));	\
-    UChi_00 = U_00*Chi_00;\
-    UChi_10 = U_00*Chi_10;\
-    UChi_01 = U_10*Chi_00;\
-    UChi_11 = U_10*Chi_10;\
-    UChi_02 = U_20*Chi_00;\
-    UChi_12 = U_20*Chi_10;\
-    UChi_00+= U_01*Chi_01;\
-    UChi_10+= U_01*Chi_11;\
-    UChi_01+= U_11*Chi_01;\
-    UChi_11+= U_11*Chi_11;\
-    UChi_02+= U_21*Chi_01;\
-    UChi_12+= U_21*Chi_11;\
-    Impl::loadLinkElement(U_00,ref()(0,2));	\
-    Impl::loadLinkElement(U_10,ref()(1,2));	\
-    Impl::loadLinkElement(U_20,ref()(2,2));	\
-    UChi_00+= U_00*Chi_02;\
-    UChi_10+= U_00*Chi_12;\
-    UChi_01+= U_10*Chi_02;\
-    UChi_11+= U_10*Chi_12;\
-    UChi_02+= U_20*Chi_02;\
-    UChi_12+= U_20*Chi_12;}
-
-
 #define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
+  permute##dir(Chi_00,Chi_00);	\
      permute##dir(Chi_01,Chi_01);\
      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
+      permute##dir(Chi_10,Chi_10);	\
      permute##dir(Chi_11,Chi_11);\
      permute##dir(Chi_12,Chi_12);

+#endif
+
+#define MULT_2SPIN(A)\
+  {auto & ref(U[sU](A));						\
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    UChi_00 = U_00*Chi_00;						\
+    UChi_10 = U_00*Chi_10;						\
+    UChi_01 = U_10*Chi_00;						\
+    UChi_11 = U_10*Chi_10;						\
+    UChi_02 = U_20*Chi_00;						\
+    UChi_12 = U_20*Chi_10;						\
+    UChi_00+= U_01*Chi_01;						\
+    UChi_10+= U_01*Chi_11;						\
+    UChi_01+= U_11*Chi_01;						\
+    UChi_11+= U_11*Chi_11;						\
+    UChi_02+= U_21*Chi_01;						\
+    UChi_12+= U_21*Chi_11;						\
+    U_00=coalescedRead(ref()(0,2),lane);				\
+    U_10=coalescedRead(ref()(1,2),lane);				\
+    U_20=coalescedRead(ref()(2,2),lane);				\
+    UChi_00+= U_00*Chi_02;						\
+    UChi_10+= U_00*Chi_12;						\
+    UChi_01+= U_10*Chi_02;						\
+    UChi_11+= U_10*Chi_12;						\
+    UChi_02+= U_20*Chi_02;						\
+    UChi_12+= U_20*Chi_12;}
+
+#define LOAD_CHI				\
+  {const SiteHalfSpinor &ref(buf[offset]);	\
+    Chi_00 = coalescedRead(ref()(0)(0),lane);	\
+    Chi_01 = coalescedRead(ref()(0)(1),lane);	\
+    Chi_02 = coalescedRead(ref()(0)(2),lane);	\
+    Chi_10 = coalescedRead(ref()(1)(0),lane);	\
+    Chi_11 = coalescedRead(ref()(1)(1),lane);	\
+    Chi_12 = coalescedRead(ref()(1)(2),lane);}
+
 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
 #define XP_PROJ \
@@ -353,13 +370,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_31-= UChi_11;	\
  result_32-= UChi_12;

-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
-    LOAD_CHIMU;					\
+    LOAD_CHIMU(PERM);				\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
@@ -367,6 +384,37 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  } else {					\
    LOAD_CHI;					\
  }						\
+  acceleratorSynchronise();			\
+  MULT_2SPIN(DIR);				\
+  RECON;					
+
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+  SE=&st_p[DIR+8*ss];				\
+  ptype=st_perm[DIR];				\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				\
+    LOAD_CHIMU(PERM);				\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else {					\
+    LOAD_CHI;					\
+  }						\
+  acceleratorSynchronise();			\
+  MULT_2SPIN(DIR);				\
+  RECON;					
+
+#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
+  SE=&st_p[DIR+8*ss];							\
+  ptype=st_perm[DIR];							\
+ /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+  offset = SE->_offset;				\
+  perm   = SE->_permute;			\
+  LOAD_CHIMU(PERM);				\
+  PROJ;						\
  MULT_2SPIN(DIR);				\
  RECON;					

@@ -376,7 +424,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
-    LOAD_CHIMU;					\
+    LOAD_CHIMU(PERM);				\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
@@ -384,10 +432,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  } else if ( st.same_node[DIR] ) {		\
    LOAD_CHI;					\
  }						\
+  acceleratorSynchronise();			\
  if (local || st.same_node[DIR] ) {		\
    MULT_2SPIN(DIR);				\
    RECON;					\
-  }
+  }						\
+  acceleratorSynchronise();			

 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
@@ -397,44 +447,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    MULT_2SPIN(DIR);				\
    RECON;					\
    nmu++;					\
-  }
+  }						\
+  acceleratorSynchronise();			

 #define HAND_RESULT(ss)				\
  {						\
    SiteSpinor & ref (out[ss]);			\
-    vstream(ref()(0)(0),result_00);		\
-    vstream(ref()(0)(1),result_01);		\
-    vstream(ref()(0)(2),result_02);		\
-    vstream(ref()(1)(0),result_10);		\
-    vstream(ref()(1)(1),result_11);		\
-    vstream(ref()(1)(2),result_12);		\
-    vstream(ref()(2)(0),result_20);		\
-    vstream(ref()(2)(1),result_21);		\
-    vstream(ref()(2)(2),result_22);		\
-    vstream(ref()(3)(0),result_30);		\
-    vstream(ref()(3)(1),result_31);		\
-    vstream(ref()(3)(2),result_32);		\
+    coalescedWrite(ref()(0)(0),result_00,lane);		\
+    coalescedWrite(ref()(0)(1),result_01,lane);		\
+    coalescedWrite(ref()(0)(2),result_02,lane);		\
+    coalescedWrite(ref()(1)(0),result_10,lane);		\
+    coalescedWrite(ref()(1)(1),result_11,lane);		\
+    coalescedWrite(ref()(1)(2),result_12,lane);		\
+    coalescedWrite(ref()(2)(0),result_20,lane);		\
+    coalescedWrite(ref()(2)(1),result_21,lane);		\
+    coalescedWrite(ref()(2)(2),result_22,lane);		\
+    coalescedWrite(ref()(3)(0),result_30,lane);		\
+    coalescedWrite(ref()(3)(1),result_31,lane);		\
+    coalescedWrite(ref()(3)(2),result_32,lane);		\
  }

 #define HAND_RESULT_EXT(ss)				\
-  if (nmu){					\
+  {							\
    SiteSpinor & ref (out[ss]);				\
-    ref()(0)(0)+=result_00;		\
-    ref()(0)(1)+=result_01;		\
-    ref()(0)(2)+=result_02;		\
-    ref()(1)(0)+=result_10;		\
-    ref()(1)(1)+=result_11;		\
-    ref()(1)(2)+=result_12;		\
-    ref()(2)(0)+=result_20;		\
-    ref()(2)(1)+=result_21;		\
-    ref()(2)(2)+=result_22;		\
-    ref()(3)(0)+=result_30;		\
-    ref()(3)(1)+=result_31;		\
-    ref()(3)(2)+=result_32;		\
+    coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00,lane);	\
+    coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01,lane);	\
+    coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02,lane);	\
+    coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10,lane);	\
+    coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11,lane);	\
+    coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12,lane);	\
+    coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20,lane);	\
+    coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21,lane);	\
+    coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22,lane);	\
+    coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30,lane);	\
+    coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31,lane);	\
+    coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32,lane);	\
  }

-
-#define HAND_DECLARATIONS(a)			\
+#define HAND_DECLARATIONS(Simd)			\
  Simd result_00;				\
  Simd result_01;				\
  Simd result_02;				\
@@ -467,18 +517,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Simd U_21;

 #define ZERO_RESULT							\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
+  zeroit(result_00);							\
+  zeroit(result_01);							\
+  zeroit(result_02);							\
+  zeroit(result_10);							\
+  zeroit(result_11);							\
+  zeroit(result_12);							\
+  zeroit(result_20);							\
+  zeroit(result_21);							\
+  zeroit(result_22);							\
+  zeroit(result_30);							\
+  zeroit(result_31);							\
+  zeroit(result_32);			

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -495,15 +545,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+
+#ifdef SYCL_HACK
 template<class Impl> accelerator_inline void 
-WilsonKernels<Impl>::HandDhopSite(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-				  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
+				      int ss,int sU,const SiteSpinor *in, SiteSpinor *out)
 {
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef iSinglet<Simd> vCplx;
+  //  typedef decltype( coalescedRead( vCplx()()() )) Simt;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);
+
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss);
+}
+#endif
+
+template<class Impl> accelerator_inline void 
+WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  int offset,local,perm, ptype;
  StencilEntry *SE;
@@ -520,13 +608,19 @@ WilsonKernels<Impl>::HandDhopSite(const StencilView &st, const DoubledGaugeField
 }

 template<class Impl>  accelerator_inline
-void WilsonKernels<Impl>::HandDhopSiteDag(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-					  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out)
+void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
  int offset,local,perm, ptype;
@@ -543,14 +637,20 @@ void WilsonKernels<Impl>::HandDhopSiteDag(const StencilView &st,const DoubledGau
 }

 template<class Impl>  accelerator_inline void 
-WilsonKernels<Impl>::HandDhopSiteInt(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  int offset,local,perm, ptype;
  StencilEntry *SE;
@@ -567,13 +667,19 @@ WilsonKernels<Impl>::HandDhopSiteInt(const StencilView &st,const DoubledGaugeFie
 }

 template<class Impl> accelerator_inline
-void WilsonKernels<Impl>::HandDhopSiteDagInt(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out)
+void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
  int offset,local,perm, ptype;
@@ -590,14 +696,20 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(const StencilView &st,const Doubled
 }

 template<class Impl>  accelerator_inline void 
-WilsonKernels<Impl>::HandDhopSiteExt(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-					  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out)
+WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  int offset, ptype;
  StencilEntry *SE;
@@ -615,13 +727,19 @@ WilsonKernels<Impl>::HandDhopSiteExt(const StencilView &st,const DoubledGaugeFie
 }

 template<class Impl>  accelerator_inline
-void WilsonKernels<Impl>::HandDhopSiteDagExt(const StencilView &st,const DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-						  int ss,int sU,const FermionFieldView &in, const FermionFieldView &out)
+void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
  int offset, ptype;
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementationSycl.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementationSycl.h
@@ -1,598 +0,0 @@
-   /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-
-#pragma once
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-
-#undef LOAD_CHIMU  
-#undef LOAD_CHI 
-#undef MULT_2SPIN
-#undef PERMUTE_DIR
-#undef XP_PROJ  
-#undef YP_PROJ  
-#undef ZP_PROJ  
-#undef TP_PROJ  
-#undef XM_PROJ  
-#undef YM_PROJ  
-#undef ZM_PROJ  
-#undef TM_PROJ  
-#undef XP_RECON 
-#undef XP_RECON_ACCUM 
-#undef XM_RECON 
-#undef XM_RECON_ACCUM 
-#undef YP_RECON_ACCUM 
-#undef YM_RECON_ACCUM 
-#undef ZP_RECON_ACCUM 
-#undef ZM_RECON_ACCUM 
-#undef TP_RECON_ACCUM 
-#undef TM_RECON_ACCUM 
-#undef ZERO_RESULT				 
-#undef Chimu_00
-#undef Chimu_01
-#undef Chimu_02
-#undef Chimu_10
-#undef Chimu_11
-#undef Chimu_12
-#undef Chimu_20
-#undef Chimu_21
-#undef Chimu_22
-#undef Chimu_30
-#undef Chimu_31
-#undef Chimu_32
-#undef HAND_STENCIL_LEG
-#undef HAND_STENCIL_LEG_INT
-#undef HAND_STENCIL_LEG_EXT
-#undef HAND_RESULT
-#undef HAND_RESULT_INT
-#undef HAND_RESULT_EXT
-
-#define REGISTER
-
-#ifdef GRID_SIMT
-#define LOAD_CHIMU(ptype)		\
-  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm);	\
-    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm);	\
-    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm);	\
-    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm);	\
-    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm);	\
-    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm);	\
-    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm);	\
-    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm);	\
-    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm);	\
-    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm);	\
-    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm);	\
-    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm);	}
-
-#define PERMUTE_DIR(dir) ;
-#else
-#define LOAD_CHIMU(ptype)		\
-  {const SiteSpinor & ref (in[offset]);	\
-    Chimu_00=coalescedRead(ref()(0)(0));	\
-    Chimu_01=coalescedRead(ref()(0)(1));	\
-    Chimu_02=coalescedRead(ref()(0)(2));	\
-    Chimu_10=coalescedRead(ref()(1)(0));	\
-    Chimu_11=coalescedRead(ref()(1)(1));	\
-    Chimu_12=coalescedRead(ref()(1)(2));	\
-    Chimu_20=coalescedRead(ref()(2)(0));	\
-    Chimu_21=coalescedRead(ref()(2)(1));	\
-    Chimu_22=coalescedRead(ref()(2)(2));	\
-    Chimu_30=coalescedRead(ref()(3)(0));	\
-    Chimu_31=coalescedRead(ref()(3)(1));	\
-    Chimu_32=coalescedRead(ref()(3)(2));	}
-
-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_00,Chi_00);	\
-      permute##dir(Chi_01,Chi_01);\
-      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);	\
-      permute##dir(Chi_11,Chi_11);\
-      permute##dir(Chi_12,Chi_12);
-#endif
-
-#define MULT_2SPIN(A)\
-  {auto & ref(U[sU](A));					\
-  U_00=coalescedRead(ref()(0,0));				\
-  U_10=coalescedRead(ref()(1,0));					\
-  U_20=coalescedRead(ref()(2,0));					\
-  U_01=coalescedRead(ref()(0,1));					\
-  U_11=coalescedRead(ref()(1,1));					\
-  U_21=coalescedRead(ref()(2,1));					\
-    UChi_00 = U_00*Chi_00;					\
-    UChi_10 = U_00*Chi_10;					\
-    UChi_01 = U_10*Chi_00;					\
-    UChi_11 = U_10*Chi_10;					\
-    UChi_02 = U_20*Chi_00;					\
-    UChi_12 = U_20*Chi_10;					\
-    UChi_00+= U_01*Chi_01;					\
-    UChi_10+= U_01*Chi_11;					\
-    UChi_01+= U_11*Chi_01;					\
-    UChi_11+= U_11*Chi_11;					\
-    UChi_02+= U_21*Chi_01;					\
-    UChi_12+= U_21*Chi_11;					\
-    U_00=coalescedRead(ref()(0,2));				\
-    U_10=coalescedRead(ref()(1,2));				\
-    U_20=coalescedRead(ref()(2,2));				\
-    UChi_00+= U_00*Chi_02;					\
-    UChi_10+= U_00*Chi_12;					\
-    UChi_01+= U_10*Chi_02;					\
-    UChi_11+= U_10*Chi_12;					\
-    UChi_02+= U_20*Chi_02;					\
-    UChi_12+= U_20*Chi_12;}
-
-#define LOAD_CHI				\
-  {const SiteHalfSpinor &ref(buf[offset]);	\
-    Chi_00 = coalescedRead(ref()(0)(0));	\
-    Chi_01 = coalescedRead(ref()(0)(1));	\
-    Chi_02 = coalescedRead(ref()(0)(2));	\
-    Chi_10 = coalescedRead(ref()(1)(0));	\
-    Chi_11 = coalescedRead(ref()(1)(1));	\
-    Chi_12 = coalescedRead(ref()(1)(2));}
-
-//      hspin(0)=fspin(0)+timesI(fspin(3));
-//      hspin(1)=fspin(1)+timesI(fspin(2));
-#define XP_PROJ \
-    Chi_00 = Chimu_00+timesI(Chimu_30);\
-    Chi_01 = Chimu_01+timesI(Chimu_31);\
-    Chi_02 = Chimu_02+timesI(Chimu_32);\
-    Chi_10 = Chimu_10+timesI(Chimu_20);\
-    Chi_11 = Chimu_11+timesI(Chimu_21);\
-    Chi_12 = Chimu_12+timesI(Chimu_22);
-
-#define YP_PROJ \
-    Chi_00 = Chimu_00-Chimu_30;\
-    Chi_01 = Chimu_01-Chimu_31;\
-    Chi_02 = Chimu_02-Chimu_32;\
-    Chi_10 = Chimu_10+Chimu_20;\
-    Chi_11 = Chimu_11+Chimu_21;\
-    Chi_12 = Chimu_12+Chimu_22;
-
-#define ZP_PROJ \
-  Chi_00 = Chimu_00+timesI(Chimu_20);		\
-  Chi_01 = Chimu_01+timesI(Chimu_21);		\
-  Chi_02 = Chimu_02+timesI(Chimu_22);		\
-  Chi_10 = Chimu_10-timesI(Chimu_30);		\
-  Chi_11 = Chimu_11-timesI(Chimu_31);		\
-  Chi_12 = Chimu_12-timesI(Chimu_32);
-
-#define TP_PROJ \
-  Chi_00 = Chimu_00+Chimu_20;		\
-  Chi_01 = Chimu_01+Chimu_21;		\
-  Chi_02 = Chimu_02+Chimu_22;		\
-  Chi_10 = Chimu_10+Chimu_30;		\
-  Chi_11 = Chimu_11+Chimu_31;		\
-  Chi_12 = Chimu_12+Chimu_32;
-
-
-//      hspin(0)=fspin(0)-timesI(fspin(3));
-//      hspin(1)=fspin(1)-timesI(fspin(2));
-#define XM_PROJ \
-    Chi_00 = Chimu_00-timesI(Chimu_30);\
-    Chi_01 = Chimu_01-timesI(Chimu_31);\
-    Chi_02 = Chimu_02-timesI(Chimu_32);\
-    Chi_10 = Chimu_10-timesI(Chimu_20);\
-    Chi_11 = Chimu_11-timesI(Chimu_21);\
-    Chi_12 = Chimu_12-timesI(Chimu_22);
-
-#define YM_PROJ \
-    Chi_00 = Chimu_00+Chimu_30;\
-    Chi_01 = Chimu_01+Chimu_31;\
-    Chi_02 = Chimu_02+Chimu_32;\
-    Chi_10 = Chimu_10-Chimu_20;\
-    Chi_11 = Chimu_11-Chimu_21;\
-    Chi_12 = Chimu_12-Chimu_22;
-
-#define ZM_PROJ \
-  Chi_00 = Chimu_00-timesI(Chimu_20);		\
-  Chi_01 = Chimu_01-timesI(Chimu_21);		\
-  Chi_02 = Chimu_02-timesI(Chimu_22);		\
-  Chi_10 = Chimu_10+timesI(Chimu_30);		\
-  Chi_11 = Chimu_11+timesI(Chimu_31);		\
-  Chi_12 = Chimu_12+timesI(Chimu_32);
-
-#define TM_PROJ \
-  Chi_00 = Chimu_00-Chimu_20;		\
-  Chi_01 = Chimu_01-Chimu_21;		\
-  Chi_02 = Chimu_02-Chimu_22;		\
-  Chi_10 = Chimu_10-Chimu_30;		\
-  Chi_11 = Chimu_11-Chimu_31;		\
-  Chi_12 = Chimu_12-Chimu_32;
-
-//      fspin(0)=hspin(0);
-//      fspin(1)=hspin(1);
-//      fspin(2)=timesMinusI(hspin(1));
-//      fspin(3)=timesMinusI(hspin(0));
-#define XP_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesMinusI(UChi_10);\
-  result_21 = timesMinusI(UChi_11);\
-  result_22 = timesMinusI(UChi_12);\
-  result_30 = timesMinusI(UChi_00);\
-  result_31 = timesMinusI(UChi_01);\
-  result_32 = timesMinusI(UChi_02);
-
-#define XP_RECON_ACCUM\
-  result_00+=UChi_00;\
-  result_01+=UChi_01;\
-  result_02+=UChi_02;\
-  result_10+=UChi_10;\
-  result_11+=UChi_11;\
-  result_12+=UChi_12;\
-  result_20-=timesI(UChi_10);\
-  result_21-=timesI(UChi_11);\
-  result_22-=timesI(UChi_12);\
-  result_30-=timesI(UChi_00);\
-  result_31-=timesI(UChi_01);\
-  result_32-=timesI(UChi_02);
-
-#define XM_RECON\
-  result_00 = UChi_00;\
-  result_01 = UChi_01;\
-  result_02 = UChi_02;\
-  result_10 = UChi_10;\
-  result_11 = UChi_11;\
-  result_12 = UChi_12;\
-  result_20 = timesI(UChi_10);\
-  result_21 = timesI(UChi_11);\
-  result_22 = timesI(UChi_12);\
-  result_30 = timesI(UChi_00);\
-  result_31 = timesI(UChi_01);\
-  result_32 = timesI(UChi_02);
-
-#define XM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_10);\
-  result_21+= timesI(UChi_11);\
-  result_22+= timesI(UChi_12);\
-  result_30+= timesI(UChi_00);\
-  result_31+= timesI(UChi_01);\
-  result_32+= timesI(UChi_02);
-
-#define YP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_10;\
-  result_21+= UChi_11;\
-  result_22+= UChi_12;\
-  result_30-= UChi_00;\
-  result_31-= UChi_01;\
-  result_32-= UChi_02;
-
-#define YM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_10;\
-  result_21-= UChi_11;\
-  result_22-= UChi_12;\
-  result_30+= UChi_00;\
-  result_31+= UChi_01;\
-  result_32+= UChi_02;
-
-#define ZP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= timesI(UChi_00);			\
-  result_21-= timesI(UChi_01);			\
-  result_22-= timesI(UChi_02);			\
-  result_30+= timesI(UChi_10);			\
-  result_31+= timesI(UChi_11);			\
-  result_32+= timesI(UChi_12);
-
-#define ZM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= timesI(UChi_00);			\
-  result_21+= timesI(UChi_01);			\
-  result_22+= timesI(UChi_02);			\
-  result_30-= timesI(UChi_10);			\
-  result_31-= timesI(UChi_11);			\
-  result_32-= timesI(UChi_12);
-
-#define TP_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20+= UChi_00;			\
-  result_21+= UChi_01;			\
-  result_22+= UChi_02;			\
-  result_30+= UChi_10;			\
-  result_31+= UChi_11;			\
-  result_32+= UChi_12;
-
-#define TM_RECON_ACCUM\
-  result_00+= UChi_00;\
-  result_01+= UChi_01;\
-  result_02+= UChi_02;\
-  result_10+= UChi_10;\
-  result_11+= UChi_11;\
-  result_12+= UChi_12;\
-  result_20-= UChi_00;	\
-  result_21-= UChi_01;	\
-  result_22-= UChi_02;	\
-  result_30-= UChi_10;	\
-  result_31-= UChi_11;	\
-  result_32-= UChi_12;
-
-#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)	\
-  SE=&st_p[DIR+8*ss];			\
-  ptype=st_perm[DIR];			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU(PERM);				\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else {					\
-    LOAD_CHI;					\
-  }						\
-  MULT_2SPIN(DIR);				\
-  RECON;					
-
-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
-  SE=&st_p[DIR+8*ss];			\
-  ptype=st_perm[DIR];			\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  LOAD_CHIMU(PERM);				\
-  PROJ;						\
-  MULT_2SPIN(DIR);				\
-  RECON;					
-
-
-#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\
-  SE=&st_p[DIR+8*ss];					\
-  ptype=st_perm[DIR];					\
-  offset = SE->_offset;				\
-  local  = SE->_is_local;			\
-  perm   = SE->_permute;			\
-  if ( local ) {				\
-    LOAD_CHIMU;					\
-    PROJ;					\
-    if ( perm) {				\
-      PERMUTE_DIR(PERM);			\
-    }						\
-  } else if ( st.same_node[DIR] ) {		\
-    LOAD_CHI;					\
-  }						\
-  if (local || st.same_node[DIR] ) {		\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-  }
-
-#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
-  SE=st.GetEntry(ptype,DIR,ss);			\
-  offset = SE->_offset;				\
-  if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\
-    LOAD_CHI;					\
-    MULT_2SPIN(DIR);				\
-    RECON;					\
-    nmu++;					\
-  }
-
-#define HAND_RESULT(ss)				\
-  {						\
-    SiteSpinor & ref (out[ss]);			\
-    coalescedWrite(ref()(0)(0),result_00);		\
-    coalescedWrite(ref()(0)(1),result_01);		\
-    coalescedWrite(ref()(0)(2),result_02);		\
-    coalescedWrite(ref()(1)(0),result_10);		\
-    coalescedWrite(ref()(1)(1),result_11);		\
-    coalescedWrite(ref()(1)(2),result_12);		\
-    coalescedWrite(ref()(2)(0),result_20);		\
-    coalescedWrite(ref()(2)(1),result_21);		\
-    coalescedWrite(ref()(2)(2),result_22);		\
-    coalescedWrite(ref()(3)(0),result_30);		\
-    coalescedWrite(ref()(3)(1),result_31);		\
-    coalescedWrite(ref()(3)(2),result_32);		\
-  }
-
-#define HAND_RESULT_EXT(ss)			\
-  if (nmu){					\
-    SiteSpinor & ref (out[ss]);		\
-    ref()(0)(0)+=result_00;		\
-    ref()(0)(1)+=result_01;		\
-    ref()(0)(2)+=result_02;		\
-    ref()(1)(0)+=result_10;		\
-    ref()(1)(1)+=result_11;		\
-    ref()(1)(2)+=result_12;		\
-    ref()(2)(0)+=result_20;		\
-    ref()(2)(1)+=result_21;		\
-    ref()(2)(2)+=result_22;		\
-    ref()(3)(0)+=result_30;		\
-    ref()(3)(1)+=result_31;		\
-    ref()(3)(2)+=result_32;		\
-  }
-
-#define HAND_DECLARATIONS(Simd)			\
-  Simd result_00;				\
-  Simd result_01;				\
-  Simd result_02;				\
-  Simd result_10;				\
-  Simd result_11;				\
-  Simd result_12;				\
-  Simd result_20;				\
-  Simd result_21;				\
-  Simd result_22;				\
-  Simd result_30;				\
-  Simd result_31;				\
-  Simd result_32;				\
-  Simd Chi_00;					\
-  Simd Chi_01;					\
-  Simd Chi_02;					\
-  Simd Chi_10;					\
-  Simd Chi_11;					\
-  Simd Chi_12;					\
-  Simd UChi_00;					\
-  Simd UChi_01;					\
-  Simd UChi_02;					\
-  Simd UChi_10;					\
-  Simd UChi_11;					\
-  Simd UChi_12;					\
-  Simd U_00;					\
-  Simd U_10;					\
-  Simd U_20;					\
-  Simd U_01;					\
-  Simd U_11;					\
-  Simd U_21;
-
-#define ZERO_RESULT				\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
-
-#define Chimu_00 Chi_00
-#define Chimu_01 Chi_01
-#define Chimu_02 Chi_02
-#define Chimu_10 Chi_10
-#define Chimu_11 Chi_11
-#define Chimu_12 Chi_12
-#define Chimu_20 UChi_00
-#define Chimu_21 UChi_01
-#define Chimu_22 UChi_02
-#define Chimu_30 UChi_10
-#define Chimu_31 UChi_11
-#define Chimu_32 UChi_12
-
-NAMESPACE_BEGIN(Grid);
-
-template<class Impl> accelerator_inline void 
-WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
-				      int ss,int sU,const SiteSpinor *in, SiteSpinor *out)
-{
-// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
-  typedef typename Simd::scalar_type S;
-  typedef typename Simd::vector_type V;
-  typedef iSinglet<Simd> vCplx;
-  //  typedef decltype( coalescedRead( vCplx()()() )) Simt;
-  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
-
-  HAND_DECLARATIONS(Simt);
-
-  int offset,local,perm, ptype;
-  StencilEntry *SE;
-  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
-  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
-  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
-  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
-  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
-  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
-  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
-  HAND_RESULT(ss);
-}
-
-////////////// Wilson ; uses this implementation /////////////////////
-
-NAMESPACE_END(Grid);
-#undef LOAD_CHIMU  
-#undef LOAD_CHI 
-#undef MULT_2SPIN
-#undef PERMUTE_DIR
-#undef XP_PROJ  
-#undef YP_PROJ  
-#undef ZP_PROJ  
-#undef TP_PROJ  
-#undef XM_PROJ  
-#undef YM_PROJ  
-#undef ZM_PROJ  
-#undef TM_PROJ  
-#undef XP_RECON 
-#undef XP_RECON_ACCUM 
-#undef XM_RECON 
-#undef XM_RECON_ACCUM 
-#undef YP_RECON_ACCUM 
-#undef YM_RECON_ACCUM 
-#undef ZP_RECON_ACCUM 
-#undef ZM_RECON_ACCUM 
-#undef TP_RECON_ACCUM 
-#undef TM_RECON_ACCUM 
-#undef ZERO_RESULT				 
-#undef Chimu_00
-#undef Chimu_01
-#undef Chimu_02
-#undef Chimu_10
-#undef Chimu_11
-#undef Chimu_12
-#undef Chimu_20
-#undef Chimu_21
-#undef Chimu_22
-#undef Chimu_30
-#undef Chimu_31
-#undef Chimu_32
-#undef HAND_STENCIL_LEG
-#undef HAND_STENCIL_LEG_INT
-#undef HAND_STENCIL_LEG_EXT
-#undef HAND_RESULT
-#undef HAND_RESULT_INT
-#undef HAND_RESULT_EXT
-#undef HAND_DECLARATIONS
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -115,9 +115,9 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
  // All legs kernels ; comms then compute
  ////////////////////////////////////////////////////////////////////
 template <class Impl> accelerator_inline
-void WilsonKernels<Impl>::GenericDhopSiteDag(const StencilView &st, const DoubledGaugeFieldView &U,
+void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionFieldView &in, const FermionFieldView &out)
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
 {
  typedef decltype(coalescedRead(buf[0]))   calcHalfSpinor;
  typedef decltype(coalescedRead(in[0])) calcSpinor;
@@ -141,9 +141,9 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(const StencilView &st, const Double
 };

 template <class Impl> accelerator_inline
-void WilsonKernels<Impl>::GenericDhopSite(const StencilView &st, const DoubledGaugeFieldView &U,
+void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
 					  SiteHalfSpinor *buf, int sF,
-					  int sU, const FermionFieldView &in, const FermionFieldView &out)
+					  int sU, const FermionFieldView &in, FermionFieldView &out)
 {
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
@@ -170,9 +170,9 @@ void WilsonKernels<Impl>::GenericDhopSite(const StencilView &st, const DoubledGa
  // Interior kernels
  ////////////////////////////////////////////////////////////////////
 template <class Impl> accelerator_inline
-void WilsonKernels<Impl>::GenericDhopSiteDagInt(const StencilView &st, const DoubledGaugeFieldView &U,
+void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
-						int sU, const FermionFieldView &in, const FermionFieldView &out)
+						int sU, const FermionFieldView &in, FermionFieldView &out)
 {
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
@@ -198,9 +198,9 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(const StencilView &st, const Dou
 };

 template <class Impl> accelerator_inline
-void WilsonKernels<Impl>::GenericDhopSiteInt(const StencilView &st, const DoubledGaugeFieldView &U,
+void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
 							 SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionFieldView &in, const FermionFieldView &out)
+							 int sU, const FermionFieldView &in, FermionFieldView &out)
 {
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
@@ -228,9 +228,9 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(const StencilView &st, const Double
 // Exterior kernels
 ////////////////////////////////////////////////////////////////////
 template <class Impl> accelerator_inline
-void WilsonKernels<Impl>::GenericDhopSiteDagExt(const StencilView &st, const DoubledGaugeFieldView &U,
+void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
 						SiteHalfSpinor *buf, int sF,
-						int sU, const FermionFieldView &in, const FermionFieldView &out)
+						int sU, const FermionFieldView &in, FermionFieldView &out)
 {
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
@@ -259,9 +259,9 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(const StencilView &st, const Dou
 };

 template <class Impl> accelerator_inline
-void WilsonKernels<Impl>::GenericDhopSiteExt(const StencilView &st, const DoubledGaugeFieldView &U,
+void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
 					     SiteHalfSpinor *buf, int sF,
-					     int sU, const FermionFieldView &in, const FermionFieldView &out)
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
 {
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
@@ -291,8 +291,8 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(const StencilView &st, const Double

 #define DhopDirMacro(Dir,spProj,spRecon)	\
  template <class Impl> accelerator_inline				\
-  void WilsonKernels<Impl>::DhopDir##Dir(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
-					 int sU, const FermionFieldView &in, const FermionFieldView &out, int dir) \
+  void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
+					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
  {									\
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;		\
  typedef decltype(coalescedRead(in[0]))  calcSpinor;			\
@@ -319,8 +319,8 @@ DhopDirMacro(Zm,spProjZm,spReconZm);
 DhopDirMacro(Tm,spProjTm,spReconTm);

 template <class Impl> accelerator_inline
-void WilsonKernels<Impl>::DhopDirK(const StencilView &st, const DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
-				   int sU, const FermionFieldView &in, const FermionFieldView &out, int dir, int gamma)
+void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
+				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
 {
  typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
  typedef decltype(coalescedRead(in[0]))  calcSpinor;
@@ -345,7 +345,7 @@ void WilsonKernels<Impl>::DhopDirK(const StencilView &st, const DoubledGaugeFiel
 }

 template <class Impl>
-void WilsonKernels<Impl>::DhopDirAll(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
+void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
 				      int Nsite, const FermionField &in, std::vector<FermionField> &out)
 {
   autoView(U_v  ,U,AcceleratorRead);
@@ -416,14 +416,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }

-#define KERNEL_CALLNB(A) \
-  const uint64_t    NN = Nsite*Ls;					\
-  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
-      int sF = ss;							\
-      int sU = ss/Ls;							\
-      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
-  });
-
 #define KERNEL_CALL_TMP(A) \
  const uint64_t    NN = Nsite*Ls;					\
  auto U_p = & U_v[0];							\
@@ -438,6 +430,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
    });									\
  accelerator_barrier();

+#define KERNEL_CALLNB(A)	/* launch WilsonKernels<Impl>::A over Nsite*Ls sites via accelerator_forNB; no barrier is issued here (KERNEL_CALL appends accelerator_barrier()) */ \
+  const uint64_t    NN = Nsite*Ls;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;		/* loop index forwarded unchanged as the kernel's sF argument */ \
+      int sU = ss/Ls;		/* sU argument: loop index divided by Ls */ \
+      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+  });
+
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();

 #define ASM_CALL(A)							\
@@ -459,21 +459,24 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField

   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifndef GRID_CUDA
+#ifdef SYCL_HACK     
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl);    return; }
-     //     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
+#else
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
+#endif     
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
   }
@@ -491,20 +494,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField

   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
   }
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/WilsonKernelsInstantiationWilsonAdjImplD.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512 -- the Asm implementation header is only pulled in when none of the four macros is defined
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // directory-local header; presumably supplies IMPLEMENTATION used below -- TODO confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of the kernels class for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/WilsonKernelsInstantiationWilsonAdjImplF.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // A64FXFIXEDSIZE
+#endif // A64FX
+#endif // QPX
+#endif // AVX512 -- the Asm implementation header is only pulled in when none of the four macros is defined
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // directory-local header; presumably supplies IMPLEMENTATION used below -- TODO confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation of the kernels class for IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/WilsonKernelsInstantiationWilsonImplD.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/WilsonKernelsInstantiationWilsonImplF.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiation.cc.master
@@ -32,7 +32,6 @@ directory
 #include <Grid/qcd/action/fermion/FermionCore.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
 #include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
-#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementationSycl.h>

 #ifndef AVX512
 #ifndef QPX
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplD.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexAntiSymmetricImplF.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplD.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/WilsonKernelsInstantiationWilsonTwoIndexSymmetricImplF.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplD/WilsonKernelsInstantiationZWilsonImplD.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplF/WilsonKernelsInstantiationZWilsonImplF.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc
@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master
--- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc
+++ b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
+
+#ifndef AVX512 // the Asm implementation header below is included only if none of these four macros is defined
+#ifndef QPX
+#ifndef A64FX
+#ifndef A64FXFIXEDSIZE
+#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
+#endif // !A64FXFIXEDSIZE
+#endif // !A64FX
+#endif // !QPX
+#endif // !AVX512
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h" // NOTE(review): presumably supplies the IMPLEMENTATION macro for this directory — confirm
+template class WilsonKernels<IMPLEMENTATION>; // explicit instantiation for the type named by IMPLEMENTATION
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/gauge/GaugeImplTypes.h
+++ b/Grid/qcd/action/gauge/GaugeImplTypes.h
@@ -96,7 +96,7 @@ public:
  ///////////////////////////////////////////////////////////
  // Move these to another class
  // HMC auxiliary functions
-  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) 
+  static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG) 
  {
    // Zbigniew Srocinsky thesis:
    //
--- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
+++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h
@@ -49,7 +49,7 @@ public:

  virtual std::string action_name(){return "PlaqPlusRectangleAction";}
      
-  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // no-op: this action has no pseudofermion fields to refresh, so U, sRNG and pRNG are unused
      
  virtual std::string LogParameters(){
    std::stringstream sstream;
--- a/Grid/qcd/action/gauge/WilsonGaugeAction.h
+++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h
@@ -54,8 +54,7 @@ public:
    return sstream.str();
  }

-  virtual void refresh(const GaugeField &U,
-                       GridParallelRNG &pRNG){};  // noop as no pseudoferms
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){};  // no-op: this action has no pseudofermion fields to refresh, so U, sRNG and pRNG are unused

  virtual RealD S(const GaugeField &U) {
    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -124,7 +124,7 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-      virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
+      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
      {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h
@@ -1,4 +1,3 @@
-
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid
@@ -43,8 +42,7 @@ NAMESPACE_BEGIN(Grid);
 //

 template <class Impl>
-class OneFlavourEvenOddRationalPseudoFermionAction
-  : public Action<typename Impl::GaugeField> {
+class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);

@@ -103,7 +101,7 @@ public:
    return sstream.str();
  }
  
-  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
    //        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
    // Phi = MpcdagMpc^{1/4} eta
@@ -156,7 +154,10 @@ public:

    msCG(Mpc, PhiOdd, Y);

-    if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+    auto grid = FermOp.FermionGrid();
+    auto r=rand();
+    grid->Broadcast(0,r);
+    if ( (r%param.BoundsCheckFreq)==0 ) { 
      FermionField gauss(FermOp.FermionRedBlackGrid());
      gauss = PhiOdd;
      HighBoundCheck(Mpc,gauss,param.hi);
--- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h
@@ -101,7 +101,7 @@ NAMESPACE_BEGIN(Grid);
      }
      
      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
@@ -170,7 +170,10 @@ NAMESPACE_BEGIN(Grid);
 	msCG_M(MdagM,X,Y);

 	// Randomly apply rational bounds checks.
-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	auto grid = NumOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r);
+        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
--- a/Grid/qcd/action/pseudofermion/OneFlavourRational.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRational.h
@@ -98,7 +98,7 @@ NAMESPACE_BEGIN(Grid);


      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

 	
 	// P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
@@ -142,7 +142,10 @@ NAMESPACE_BEGIN(Grid);

 	msCG(MdagMOp,Phi,Y);

-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	auto grid = FermOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r);
+        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(FermOp.FermionGrid());
 	  gauss = Phi;
 	  HighBoundCheck(MdagMOp,gauss,param.hi);
--- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
+++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h
@@ -95,7 +95,7 @@ NAMESPACE_BEGIN(Grid);
      }
      

-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
@@ -156,7 +156,10 @@ NAMESPACE_BEGIN(Grid);
 	msCG_M(MdagM,X,Y);

 	// Randomly apply rational bounds checks.
-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+        auto grid = NumOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r);
+        if ( (r%param.BoundsCheckFreq)==0 ) { 	
 	  FermionField gauss(NumOp.FermionGrid());
 	  gauss = Phi;
 	  HighBoundCheck(MdagM,gauss,param.hi);
--- a/Grid/qcd/action/pseudofermion/TwoFlavour.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavour.h
@@ -73,7 +73,7 @@ public:
  //////////////////////////////////////////////////////////////////////////////////////
  // Push the gauge field in to the dops. Assume any BC's and smearing already applied
  //////////////////////////////////////////////////////////////////////////////////////
-  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
    // P(phi) = e^{- phi^dag (MdagM)^-1 phi}
    // Phi = Mdag eta
    // P(eta) = e^{- eta^dag eta}
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h
@@ -77,7 +77,7 @@ public:
  //////////////////////////////////////////////////////////////////////////////////////
  // Push the gauge field in to the dops. Assume any BC's and smearing already applied
  //////////////////////////////////////////////////////////////////////////////////////
-  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
    
    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
    // Phi = McpDag eta 
--- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
@@ -84,7 +84,7 @@ NAMESPACE_BEGIN(Grid);
      } 

      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
--- a/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h
@@ -64,7 +64,7 @@ public:
    return sstream.str();
  }  
      
-  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

    // P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
    //
--- a/Grid/qcd/action/scalar/ScalarAction.h
+++ b/Grid/qcd/action/scalar/ScalarAction.h
@@ -55,7 +55,7 @@ public:
  }
  virtual std::string action_name() {return "ScalarAction";}

-  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms
+  virtual void refresh(const Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {}  // noop as no pseudoferms

  virtual RealD S(const Field &p) {
    return (mass_square * 0.5 + Nd) * ScalarObs<Impl>::sumphisquared(p) +
--- a/Grid/qcd/action/scalar/ScalarImpl.h
+++ b/Grid/qcd/action/scalar/ScalarImpl.h
@@ -27,7 +27,7 @@ public:
  typedef Field              FermionField;
  typedef Field              PropagatorField;
    
-  static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
+  static inline void generate_momenta(Field& P, GridSerialRNG &sRNG, GridParallelRNG& pRNG){
    RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
    gaussian(pRNG, P);
    P *= scale; 
@@ -151,7 +151,7 @@ public:
      out = one / out;
    }

-    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
+    static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG)
    {
      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
 #ifndef USE_FFT_ACCELERATION
--- a/Grid/qcd/action/scalar/ScalarInteractionAction.h
+++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h
@@ -77,7 +77,7 @@ public:

  virtual std::string action_name() { return "ScalarAction"; }

-  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+  virtual void refresh(const Field &U, GridSerialRNG & sRNG, GridParallelRNG &pRNG) {}

  virtual RealD S(const Field &p)
  {
--- a/Grid/qcd/hmc/HMC.h
+++ b/Grid/qcd/hmc/HMC.h
@@ -139,7 +139,7 @@ private:
  // Evolution
  /////////////////////////////////////////////////////////
  RealD evolve_hmc_step(Field &U) {
-    TheIntegrator.refresh(U, pRNG);  // set U and initialize P and phi's
+    TheIntegrator.refresh(U, sRNG, pRNG);  // set U and initialize P and phi's

    RealD H0 = TheIntegrator.S(U);  // initial state action

--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -33,6 +33,7 @@ directory
 #define INTEGRATOR_INCLUDED

 #include <memory>
+#include "MomentumFilter.h"

 NAMESPACE_BEGIN(Grid);

@@ -78,8 +79,19 @@ protected:
  RepresentationPolicy Representations;
  IntegratorParameters Params;

+  //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC
+  //It is applied whenever the momentum is updated / refreshed
+  //The default filter does nothing
+  MomentumFilterBase<MomentaField> const* MomFilter;
+
  const ActionSet<Field, RepresentationPolicy> as;

+  //Returns the address of a function-local static MomentumFilterNone instance, used as the default "do-nothing" momentum filter
+  static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){ 
+    static MomentumFilterNone<MomentaField> filter;
+    return &filter;
+  }
+
  void update_P(Field& U, int level, double ep) 
  {
    t_P[level] += ep;
@@ -135,6 +147,8 @@ protected:

    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
+
+    MomFilter->applyFilter(Mom);
  }

  void update_U(Field& U, double ep) 
@@ -174,12 +188,24 @@ public:
    t_P.resize(levels, 0.0);
    t_U = 0.0;
    // initialization of smearer delegated outside of Integrator
+
+    //Default the momentum filter to "do-nothing"
+    MomFilter = getDefaultMomFilter();
  };

  virtual ~Integrator() {}

  virtual std::string integrator_name() = 0;
  
+  //Set the momentum filter allowing for manipulation of the conjugate momentum. Only the address of 'filter' is kept (no copy is made), so the caller must keep the filter object alive for as long as this integrator may apply it
+  void setMomentumFilter(const MomentumFilterBase<MomentaField> &filter){
+    MomFilter = &filter;
+  }
+
+  //Access the conjugate momentum
+  const MomentaField & getMomentum() const{ return P; }
+  
+
  void print_parameters()
  {
    std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
@@ -210,10 +236,9 @@ public:
  // over the representations
  struct _refresh {
    template <class FieldType, class Repr>
-    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep,
-                    GridParallelRNG& pRNG) {
+    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, GridSerialRNG & sRNG, GridParallelRNG& pRNG) {
      for (int a = 0; a < repr_set.size(); ++a){
-        repr_set.at(a)->refresh(Rep.U, pRNG);
+        repr_set.at(a)->refresh(Rep.U, sRNG, pRNG);
      
 	std::cout << GridLogDebug << "Hirep refreshing pseudofermions" << std::endl;
      }
@@ -221,12 +246,12 @@ public:
  } refresh_hireps{};

  // Initialization of momenta and actions
-  void refresh(Field& U, GridParallelRNG& pRNG) 
+  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
    assert(P.Grid() == U.Grid());
    std::cout << GridLogIntegrator << "Integrator refresh\n";

-    FieldImplementation::generate_momenta(P, pRNG);
+    FieldImplementation::generate_momenta(P, sRNG, pRNG);

    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
@@ -243,12 +268,14 @@ public:
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
-        as[level].actions.at(actionID)->refresh(Us, pRNG);
+        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
      }

      // Refresh the higher representation actions
-      as[level].apply(refresh_hireps, Representations, pRNG);
+      as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
    }
+
+    MomFilter->applyFilter(P);
  }

  // to be used by the actionlevel class to iterate
--- a/Grid/qcd/hmc/integrators/MomentumFilter.h
+++ b/Grid/qcd/hmc/integrators/MomentumFilter.h
@@ -0,0 +1,94 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./Grid/qcd/hmc/integrators/MomentumFilter.h
+
+Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+//--------------------------------------------------------------------
+#ifndef MOMENTUM_FILTER
+#define MOMENTUM_FILTER
+
+NAMESPACE_BEGIN(Grid);
+
+//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh
+//Abstract interface: applyFilter is pure virtual, so every concrete filter must provide its own implementation
+template<typename MomentaField>
+struct MomentumFilterBase{
+  virtual void applyFilter(MomentaField &P) const = 0;
+};
+
+//Do nothing: applyFilter has an empty body, so the momentum field passed in is left unchanged
+template<typename MomentaField>
+struct MomentumFilterNone: public MomentumFilterBase<MomentaField>{
+  void applyFilter(MomentaField &P) const override{}
+};
+
+//Multiply each site/direction by a Lorentz vector complex number field
+//Can be used to implement a mask, zeroing out sites
+template<typename MomentaField>
+struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
+  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
+  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
+  typedef iVector<iScalar<iScalar<vector_type> >, Nd > LorentzScalarType; //complex phase for each site/direction
+  typedef Lattice<LorentzScalarType> LatticeLorentzScalarType;
+  
+  LatticeLorentzScalarType phase; //per-site, per-direction factor multiplied into the momentum by applyFilter
+  //Take the phase field by const reference so it is copied once (into the member) instead of twice
+  MomentumFilterApplyPhase(const LatticeLorentzScalarType &_phase): phase(_phase){}
+
+  //Default to uniform field of (1,0)
+  MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){
+    LorentzScalarType one;
+    for(int mu=0;mu<Nd;mu++)
+      one(mu)()() = scalar_type(1.);
+    
+    phase = one;
+  }
+  //In place: P(mu) <- P(mu) * phase(mu) for every site and direction; P and phase are checked for conformability first
+  void applyFilter(MomentaField &P) const override{
+    conformable(P,phase);
+    autoView( P_v , P, AcceleratorWrite);
+    autoView( phase_v , phase, AcceleratorRead);
+
+    accelerator_for(ss,P_v.size(),MomentaField::vector_type::Nsimd(),{
+        auto site_mom = P_v(ss);
+        auto site_phase = phase_v(ss);
+        for(int mu=0;mu<Nd;mu++)
+          site_mom(mu) = site_mom(mu) * site_phase(mu);
+        coalescedWrite(P_v[ss], site_mom);
+      });
+    
+  }
+
+
+};
+
+
+
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/qcd/smearing/StoutSmearing.h
+++ b/Grid/qcd/smearing/StoutSmearing.h
@@ -85,21 +85,18 @@ public:

    std::cout << GridLogDebug << "Stout smearing started\n";

-    // Smear the configurations
+    // C contains the staples multiplied by some rho
+    u_smr = U ; // set the smeared field to the current gauge field
    SmearBase->smear(C, U);

    for (int mu = 0; mu < Nd; mu++) {
-      if( mu == OrthogDim )
-        tmp = 1.0;  // Don't smear in the orthogonal direction
-      else {
-        tmp = peekLorentz(C, mu);
+      if( mu == OrthogDim ) continue ;
+      // u_smr = exp(iQ_mu)*U_mu apart from Orthogdim
      Umu = peekLorentz(U, mu);
-        iq_mu = Ta(
-                   tmp *
-                   adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
+      tmp = peekLorentz(C, mu);
+      iq_mu = Ta( tmp * adj(Umu));  
      exponentiate_iQ(tmp, iq_mu);
-      }
-      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
+      pokeLorentz(u_smr, tmp * Umu, mu);
    }
    std::cout << GridLogDebug << "Stout smearing completed\n";
  };
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
--- a/Grid/qcd/utils/Metric.h
+++ b/Grid/qcd/utils/Metric.h
@@ -93,13 +93,13 @@ public:
  GeneralisedMomenta(GridBase* grid, Metric<MomentaField>& M): M(M), Mom(grid), AuxMom(grid), AuxField(grid){}

  // Correct
-  void MomentaDistribution(GridParallelRNG& pRNG){
+  void MomentaDistribution(GridSerialRNG & sRNG, GridParallelRNG& pRNG){
    // Generate a distribution for
    // P^dag G P
    // where G = M^-1

    // Generate gaussian momenta
-    Implementation::generate_momenta(Mom, pRNG);
+    Implementation::generate_momenta(Mom, sRNG, pRNG);
    // Modify the distribution with the metric
    M.MSquareRoot(Mom);

@@ -107,8 +107,8 @@ public:
      // Auxiliary momenta
      // do nothing if trivial, so hide in the metric
      MomentaField AuxMomTemp(Mom.Grid());
-      Implementation::generate_momenta(AuxMom, pRNG);
-      Implementation::generate_momenta(AuxField, pRNG);
+      Implementation::generate_momenta(AuxMom, sRNG, pRNG);
+      Implementation::generate_momenta(AuxField, sRNG, pRNG);
      // Modify the distribution with the metric
      // Aux^dag M Aux
      M.MInvSquareRoot(AuxMom);  // AuxMom = M^{-1/2} AuxMomTemp
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -67,6 +67,7 @@ public:
  accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;};
  accelerator_inline Real real(void) const { return z.x; };
  accelerator_inline Real imag(void) const { return z.y; };
+  accelerator_inline GpuComplex &operator=(const Zero &zz) { z.x = 0; z.y=0; return *this; };
  accelerator_inline GpuComplex &operator*=(const GpuComplex &r) {
    *this = (*this) * r;
    return *this;
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -208,8 +208,8 @@ struct RealPart<complex<T> > {
 //////////////////////////////////////
 // type alias used to simplify the syntax of std::enable_if
 template <typename T> using Invoke = typename T::type;
-template <typename Condition, typename ReturnType> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
-template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
+template <typename Condition, typename ReturnType = void> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
+template <typename Condition, typename ReturnType = void> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;

 ////////////////////////////////////////////////////////
 // Check for complexity with type traits
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -221,7 +221,7 @@ public:
  typedef typename cobj::vector_type vector_type;
  typedef typename cobj::scalar_type scalar_type;
  typedef typename cobj::scalar_object scalar_object;
-  typedef CartesianStencilView<vobj,cobj,Parameters> View_type;
+  typedef const CartesianStencilView<vobj,cobj,Parameters> View_type;
  typedef typename View_type::StencilVector StencilVector;
  ///////////////////////////////////////////
  // Helper structs
--- a/Grid/tensors/Tensor_SIMT.h
+++ b/Grid/tensors/Tensor_SIMT.h
@@ -65,8 +65,9 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 #else


-#if 0
-// Use the scalar as our own complex on GPU
+//#ifndef GRID_SYCL
+#if 1
+// Use the scalar as our own complex on GPU ... thrust::complex or std::complex
 template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
 typename vsimd::scalar_type
 coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
@@ -96,6 +97,8 @@ void coalescedWrite(vsimd & __restrict__ vec,
  p[lane]=extracted;
 }
 #else
+// For SyCL have option to use GpuComplex from inside the vector type in SIMT loops
+// Faster for some reason
 template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
 typename vsimd::vector_type::datum
 coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@@ -28,7 +28,7 @@ Author: neo <cossu@post.kek.jp>
 #ifndef GRID_MATH_EXP_H
 #define GRID_MATH_EXP_H

-#define DEFAULT_MAT_EXP 12
+#define DEFAULT_MAT_EXP 20

 NAMESPACE_BEGIN(Grid);

--- a/Grid/tensors/Tensor_outer.h
+++ b/Grid/tensors/Tensor_outer.h
@@ -34,6 +34,16 @@ NAMESPACE_BEGIN(Grid);
 // outerProduct Scalar x Scalar -> Scalar
 //              Vector x Vector -> Matrix
 ///////////////////////////////////////////////////////////////////////////////////////
+template<class CC,IfComplex<CC> = 0>
+accelerator_inline CC outerProduct(const CC &l, const CC& r)
+{
+  return l*conj(r);
+}
+template<class RR,IfReal<RR> = 0>
+accelerator_inline RR outerProduct(const RR &l, const RR& r)
+{
+  return l*r;
+}

 template<class l,class r,int N> accelerator_inline
 auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
@@ -57,17 +67,6 @@ auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<declt
  return ret;
 }

-template<class CC,IfComplex<CC> = 0>
-accelerator_inline CC outerProduct(const CC &l, const CC& r)
-{
-  return l*conj(r);
-}
-template<class RR,IfReal<RR> = 0>
-accelerator_inline RR outerProduct(const RR &l, const RR& r)
-{
-  return l*r;
-}
-
 NAMESPACE_END(Grid);

 #endif
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -53,7 +53,6 @@ void acceleratorInit(void)
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
    if ( world_rank == 0) {
-#ifndef GRID_DEFAULT_GPU
      if ( i==rank ) {
 	printf("AcceleratorCudaInit[%d]: ========================\n",rank);
 	printf("AcceleratorCudaInit[%d]: Device Number    : %d\n", rank,i);
@@ -67,8 +66,8 @@ void acceleratorInit(void)
 	GPU_PROP(warpSize);
 	GPU_PROP(pciBusID);
 	GPU_PROP(pciDeviceID);
+ 	printf("AcceleratorCudaInit[%d]: maxGridSize (%d,%d,%d)\n",rank,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]);
      }
-#endif
      //      GPU_PROP(unifiedAddressing);
      //      GPU_PROP(l2CacheSize);
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -104,7 +104,7 @@ extern int acceleratorAbortOnGpuError;

 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return threadIdx.z; 
+  return threadIdx.x; 
 #else
  return 0;
 #endif
@@ -112,36 +112,76 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
+    int nt=acceleratorThreads();					\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
      __VA_ARGS__;							\
    };									\
-    int nt=acceleratorThreads();					\
-    dim3 cu_threads(acceleratorThreads(),1,nsimd);			\
+    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
  }

+#define accelerator_for6dNB(iter1, num1,				\
+                            iter2, num2,				\
+                            iter3, num3,				\
+                            iter4, num4,				\
+                            iter5, num5,				\
+			    iter6, num6, ... )				\
+  {									\
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator					\
+      (Iterator iter1,Iterator iter2,					\
+       Iterator iter3,Iterator iter4,					\
+       Iterator iter5,Iterator iter6) mutable {				\
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_blocks (num1,num2,num3);					\
+    dim3 cu_threads(num4,num5,num6);					\
+    Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
-  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z;
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
+  uint64_t z = threadIdx.x;
  if ( (x < num1) && (y<num2) && (z<num3) ) {
    Lambda(x,y,z);
  }
 }

+template<typename lambda>  __global__
+void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
+		  uint64_t num4, uint64_t num5, uint64_t num6,
+		  lambda Lambda)
+{
+  uint64_t iter1 = blockIdx.x;
+  uint64_t iter2 = blockIdx.y;
+  uint64_t iter3 = blockIdx.z;
+  uint64_t iter4 = threadIdx.x;
+  uint64_t iter5 = threadIdx.y;
+  uint64_t iter6 = threadIdx.z;
+
+  if ( (iter1 < num1) && (iter2<num2) && (iter3<num3)
+    && (iter4 < num4) && (iter5<num5) && (iter6<num6) )
+  {
+    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
+  }
+}
+
 #define accelerator_barrier(dummy)					\
  {									\
    cudaDeviceSynchronize();						\
    cudaError err = cudaGetLastError();					\
    if ( cudaSuccess != err ) {						\
-      printf("Cuda error %s \n", cudaGetErrorString( err ));		\
-      puts(__FILE__);							\
-      printf("Line %d\n",__LINE__);					\
+      printf("accelerator_barrier(): Cuda error %s \n",			\
+	     cudaGetErrorString( err ));				\
+      printf("File %s Line %d\n",__FILE__,__LINE__);			\
+      fflush(stdout);							\
      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);		\
    }									\
  }
@@ -417,7 +457,7 @@ accelerator_inline void acceleratorSynchronise(void)
  __syncwarp();
 #endif
 #ifdef GRID_SYCL
-  // No barrier call on SYCL??  // Option get __spir:: stuff to do warp barrier
+  //cl::sycl::detail::workGroupBarrier();
 #endif
 #ifdef GRID_HIP
  __syncthreads();
--- a/Grid/util/CompilerCompatible.h
+++ b/Grid/util/CompilerCompatible.h
@@ -1,5 +1,16 @@
 #pragma once 

+#if defined(__NVCC__)
+
+#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 0)
+#error "NVCC version 11.0 breaks on Ampere, see Github issue 346"
+#endif
+#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 1)
+#error "NVCC version 11.1 breaks on Ampere, see Github issue 346"
+#endif
+
+#endif
+
 #if defined(__clang__)

  #if __clang_major__ < 3
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -140,7 +140,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec)
 }

 template<class VectorInt>
-void GridCmdOptionIntVector(std::string &str,VectorInt & vec)
+void GridCmdOptionIntVector(const std::string &str,VectorInt & vec)
 {
  vec.resize(0);
  std::stringstream ss(str);
@@ -153,6 +153,9 @@ void GridCmdOptionIntVector(std::string &str,VectorInt & vec)
  return;
 }

+template void GridCmdOptionIntVector(const std::string &str,std::vector<int> & vec);
+template void GridCmdOptionIntVector(const std::string &str,Coordinate & vec);
+
 void GridCmdOptionInt(std::string &str,int & val)
 {
  std::stringstream ss(str);
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -55,7 +55,7 @@ template<class VectorInt>
 std::string GridCmdVectorIntToString(const VectorInt & vec);
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
-void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
+void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);


--- a/HMC/Mobius2p1fRHMC.cc
+++ b/HMC/Mobius2p1fRHMC.cc
@@ -56,12 +56,12 @@ int main(int argc, char **argv) {
  MD.trajL   = 1.0;

  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 30;
+  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("ColdStart");
-  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);

--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) 

 **Data parallel C++ mathematical object library.**

@@ -149,7 +149,6 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
 - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
--- a/75
+++ b/75
@@ -1,3 +1,6 @@
+-- comms threads issue??
+-- Part done: Staggered kernel performance on GPU
+
 =========================================================
 General
 =========================================================
@@ -5,28 +8,18 @@ General
 - Make representations code take Gimpl
 - Simplify the HMCand remove modules
 - Lattice_arith - are the mult, mac etc.. still needed after ET engine?
- Lattice_rng
- Lattice_transfer.h
- accelerate A2Autils -- off critical path for HMC
+- Lattice_rng - faster local only loop in init
+- Audit: accelerate A2Autils -- off critical path for HMC

 =========================================================
-GPU branch code item work list
+GPU  work list
 =========================================================

-* sum_cpu promote to double during summation for increased precisoin.
+* sum_cpu promote to double during summation for increased precision.
 * Introduce sumD & ReduceD 
 * GPU sum is probably better currently.
-
 * Accelerate the cshift & benchmark

-* 0) Single GPU
- 128 bit integer table load in GPU code.
-  - ImprovedStaggered accelerate & measure perf
-  - Gianluca's changes to Cayley into gpu-port
-  - Mobius kernel fusion.                     -- Gianluca?
-  - Lebesque order reintroduction. StencilView should have pointer to it
-  - Lebesgue reorder in all kernels
-
 * 3) Comms/NVlink
 - OpenMP tasks to run comms threads. Experiment with it 
 - Remove explicit openMP in staggered. 
@@ -35,14 +28,6 @@ GPU branch code item work list
 - Stencil gather ??
 - SIMD dirs in stencil

-* 4) ET enhancements
- eval -> scalar ops in ET engine
- coalescedRead, coalescedWrite in expressions.
-
-* 5) Misc
- Conserved current clean up.
- multLinkProp eliminate
-
 8) Merge develop and test HMC

 9) Gamma tables on GPU; check this. Appear to work, but no idea why. Are these done on CPU?
@@ -52,7 +37,7 @@ GPU branch code item work list
 -     Audit NAMESPACE CHANGES
 -     Audit changes

-----
+---------
 Gianluca's changes
 - Performance impact of construct in aligned allocator???
 ---------
@@ -62,6 +47,33 @@ Gianluca's changes
 -----------------------------
 DONE:
 -----------------------------
+=====
+-- Done: Remez X^-1/2 X^-1/2 X = 1 test.
+         Feed in MdagM^2 as a test and take its sqrt.
+         Automated test that MdagM invsqrt(MdagM)invsqrt(MdagM) = 1 in HMC for bounds satisfaction.
+
+-- Done: Sycl Kernels into develop. Compare to existing unroll and just use.
+-- Done: sRNG into refresh functions
+-- Done: Tuned decomposition on CUDA into develop
+-- Done: Sycl friend accessor. Const view attempt via typedef??
+
+
+* Done 5) Misc
+- Conserved current clean up.
+- multLinkProp eliminate
+
+* Done 0) Single GPU
+- 128 bit integer table load in GPU code.
+  - ImprovedStaggered accelerate & measure perf
+  - Gianluca's changes to Cayley into gpu-port
+  - Mobius kernel fusion.                     -- Gianluca?
+  - Lebesque order reintroduction. StencilView should have pointer to it
+  - Lebesgue reorder in all kernels
+
+* 4) ET enhancements
+- Done eval -> scalar ops in ET engine
+- Done coalescedRead, coalescedWrite in expressions.
+
 =============================================================================================
 AUDIT ContractWWVV with respect to develop    -- DONE
 - GPU accelerate EOFA                                                  -- DONE
@@ -125,23 +137,6 @@ AUDIT ContractWWVV with respect to develop    -- DONE
 - -      (4) omp parallel for collapse(n)
 - - Only (1) has a natural mirror in accelerator_loop
 - - Nested loop macros get cumbersome made a generic interface for N deep
- - Don't like thread_region and thread_loop_in_region
- - Could replace with 
-
-    thread_nested(1, 
-      for {
-
-      }
-    );
-    thread_nested(2,
-      for (){
-        for (){
-
-	}
-      }
-    );
-
-    and same "in_region".


 -----------------------------
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -53,7 +53,7 @@ int main (int argc, char ** argv)
  int threads = GridThread::GetThreads();

  Coordinate latt4 = GridDefaultLatt();
-  int Ls=8;
+  int Ls=16;
  for(int i=0;i<argc;i++)
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
--- a/configure.ac
+++ b/configure.ac
@@ -7,7 +7,12 @@ AM_INIT_AUTOMAKE([subdir-objects 1.13])
 AM_EXTRA_RECURSIVE_TARGETS([tests bench])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([Grid/Grid.h])
-AC_CONFIG_HEADERS([Grid/Config.h],[sed -i 's|PACKAGE_|GRID_|' Grid/Config.h])
+AC_CONFIG_HEADERS([Grid/Config.h],[[$SED_INPLACE -e 's|PACKAGE_|GRID_|' -e 's|[[:space:]]PACKAGE[[:space:]]| GRID_PACKAGE |' -e 's|[[:space:]]VERSION[[:space:]]| GRID_PACKAGE_VERSION |' Grid/Config.h]],
+    [if test x"$host_os" = x"${host_os#darwin}" ; then]
+        [SED_INPLACE="sed -i"]
+    [else]
+        [SED_INPLACE="sed -i .bak"]
+    [fi]) # host_os without a leading darwin gets plain sed -i, otherwise sed -i with a .bak suffix argument; the comparison uses the POSIX = operator of test
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

 ################ Get git info
@@ -125,7 +130,7 @@ esac

 ############### fermions
 AC_ARG_ENABLE([fermion-reps],
-     [AC_HELP_STRING([--fermion-reps=yes|no], [enable extra fermion representation support])],
+     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
     [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])

 AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" == "yesX" ])
@@ -135,12 +140,23 @@ AC_ARG_ENABLE([gparity],
     [ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes])

 AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" == "yesX" ])
+
+AC_ARG_ENABLE([zmobius],
+     [AC_HELP_STRING([--enable-zmobius=yes|no], [enable Zmobius support])],
+     [ac_ZMOBIUS=${enable_zmobius}], [ac_ZMOBIUS=yes])
+
+AM_CONDITIONAL(BUILD_ZMOBIUS, [ test "${ac_ZMOBIUS}X" == "yesX" ])
+
+
 case ${ac_FERMION_REPS} in
   yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);;
 esac
 case ${ac_GPARITY} in
   yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);;
 esac
+case ${ac_ZMOBIUS} in
+   yes) AC_DEFINE([ENABLE_ZMOBIUS],[1],[Zmobius fermion actions]);;
+esac
 ############### Nc
 AC_ARG_ENABLE([Nc],
    [AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
@@ -428,7 +444,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        SIMD_FLAGS='-mavx2 -mfma -mf16c';;
      AVX512)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
-        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+        SIMD_FLAGS='-mavx512f -mavx512cd';;
      SKL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon])
        SIMD_FLAGS='-march=skylake-avx512';;
@@ -481,6 +497,9 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
      AVX512)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        SIMD_FLAGS='-xcommon-avx512';;
+      SKL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-xcore-avx512';;
      KNC)
--- a/tests/core/Test_where.cc
+++ b/tests/core/Test_where.cc
@@ -40,9 +40,9 @@ int main (int argc, char ** argv)

  int N=16;
  
-  std::vector<int> latt_size  ({N,4,4});
-  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1});
-  std::vector<int> mpi_layout ({1,1,1});
+  std::vector<int> latt_size  ({N,N,N,N});
+  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout ({1,1,1,1});

  int vol = 1;
  int nd  = latt_size.size();
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
    for(int t=0;t<latt_size[mu];t++){
      LatticeCoordinate(coor,mu);
      sl=where(coor==Integer(t),rn,zz);
-      std::cout <<GridLogMessage<< " sl " << sl<<std::endl;
+      //      std::cout <<GridLogMessage<< " sl " << sl<<std::endl;
      std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
      ns=ns+norm2(sl);
    }
--- a/tests/core/Test_where_extended.cc
+++ b/tests/core/Test_where_extended.cc
@@ -0,0 +1,143 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_where_extended.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+ ;
+// For three field types, sums norm2 of where()-masked slices along every dimension and asserts the sum matches norm2 of the unmasked field.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Report the thread count returned by GridThread::GetThreads().
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  int N=16;
+  // Three-dimensional lattice of extents N x 4 x 4; each layout vector below has one entry per dimension.
+  std::vector<int> latt_size  ({N,4,4});
+  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1});
+  std::vector<int> mpi_layout ({1,1,1});
+  // NOTE(review): vol is accumulated below but never read afterwards in this function.
+  int vol = 1;
+  int nd  = latt_size.size();
+  for(int d=0;d<nd;d++){
+    vol = vol * latt_size[d];
+  }
+  // Grid object built from the three layout vectors, plus a parallel RNG seeded with fixed integers.
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+  GridParallelRNG RNG(&GRID);
+  RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));  
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"== LatticeComplex =="<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  {
+    LatticeComplexD      zz(&GRID);
+    LatticeInteger     coor(&GRID);
+    LatticeComplexD  rn(&GRID);
+    LatticeComplexD  sl(&GRID);
+    // zz is set to zero and passed as the third argument of where() below.
+    zz  = ComplexD(0.0,0.0);
+    // rn is filled by gaussian(); nn, its norm2, is the reference the slice sums are compared with.
+    gaussian(RNG,rn);
+    
+    RealD nn=norm2(rn);
+    for(int mu=0;mu<nd;mu++){
+      RealD ns=0.0; // running sum of slice norms for direction mu
+      for(int t=0;t<latt_size[mu];t++){
+	LatticeCoordinate(coor,mu); // arguments do not depend on t; the call is repeated every iteration
+	sl=where(coor==Integer(t),rn,zz); // presumably rn where coor==t and zz elsewhere -- confirm against where()
+	std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
+	ns=ns+norm2(sl);
+      }
+      std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
+      assert(abs(nn-ns) < 1.0e-10); // NOTE(review): if this is the standard assert macro, the check vanishes under NDEBUG
+    }
+  }
+
+  // Same slice-sum check, repeated with LatticeFermionD fields.
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"== LatticeFermion =="<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  {
+    LatticeFermionD      zz(&GRID);
+    LatticeInteger     coor(&GRID);
+    LatticeFermionD  rn(&GRID);
+    LatticeFermionD  sl(&GRID);
+
+    zz  = ComplexD(0.0,0.0);
+
+    gaussian(RNG,rn);
+    
+    RealD nn=norm2(rn);
+    for(int mu=0;mu<nd;mu++){
+      RealD ns=0.0;
+      for(int t=0;t<latt_size[mu];t++){
+	LatticeCoordinate(coor,mu);
+	sl=where(coor==Integer(t),rn,zz);
+	std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
+	ns=ns+norm2(sl);
+      }
+      std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
+      assert(abs(nn-ns) < 1.0e-10);
+    }
+  }
+
+
+  // Same slice-sum check, repeated with LatticePropagatorD fields.
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"== LatticePropagator =="<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+
+  {
+    LatticePropagatorD      zz(&GRID);
+    LatticeInteger     coor(&GRID);
+    LatticePropagatorD  rn(&GRID);
+    LatticePropagatorD  sl(&GRID);
+
+    zz  = ComplexD(0.0,0.0);
+
+    gaussian(RNG,rn);
+    
+    RealD nn=norm2(rn);
+    for(int mu=0;mu<nd;mu++){
+      RealD ns=0.0;
+      for(int t=0;t<latt_size[mu];t++){
+	LatticeCoordinate(coor,mu);
+	sl=where(coor==Integer(t),rn,zz);
+	std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
+	ns=ns+norm2(sl);
+      }
+      std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
+      assert(abs(nn-ns) < 1.0e-10);
+    }
+  }
+
+  Grid_finalize();
+}
--- a/tests/debug/Test_cayley_mres.cc
+++ b/tests/debug/Test_cayley_mres.cc
@@ -33,13 +33,14 @@ using namespace Grid;


 template<class What> 
-void  TestConserved(What & Ddwf, What & Ddwfrev, 
+void  TestConserved(What & Ddwf,
 		    LatticeGaugeField &Umu,
 		    GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
 		    GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		    RealD mass, RealD M5,
 		    GridParallelRNG *RNG4,
-		    GridParallelRNG *RNG5);
+		    GridParallelRNG *RNG5,
+                    What *Ddwfrev=nullptr);

  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@@ -102,10 +103,11 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);


-  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG4(UGrid);
+  std::vector<int> seeds4({1,2,3,4}); RNG4.SeedFixedIntegers(seeds4);
+  //const std::string seeds4{ "test-gauge-3000" }; RNG4.SeedUniqueString( seeds4 );

  LatticeGaugeField Umu(UGrid);
  if( argc > 1 && argv[1][0] != '-' )
@@ -116,8 +118,8 @@ int main (int argc, char ** argv)
  }
  else
  {
-    std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
-    //SU<Nc>::ColdConfiguration(Umu);
+    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
+    // SU<Nc>::ColdConfiguration(Umu);
    SU<Nc>::HotConfiguration(RNG4,Umu);
  }

@@ -127,7 +129,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  TestConserved<DomainWallFermionR>(Ddwf,Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<DomainWallFermionR>(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);

  RealD b=1.5;// Scale factor b+c=2, b-c=1
  RealD c=0.5;
@@ -137,13 +139,13 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"MobiusFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  MobiusFermionR Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
-  TestConserved<MobiusFermionR>(Dmob,Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<MobiusFermionR>(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);

  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"ScaledShamirFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  ScaledShamirFermionR Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0);
-  TestConserved<ScaledShamirFermionR>(Dsham,Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<ScaledShamirFermionR>(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);

  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"ZMobiusFermion test"<<std::endl;
@@ -152,8 +154,7 @@ int main (int argc, char ** argv)
  //  for(int s=0;s<Ls;s++) omegasrev[s]=omegas[Ls-1-s];
  ZMobiusFermionR ZDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,omegas,b,c);
  ZMobiusFermionR ZDmobrev(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,omegasrev,b,c);
-
-  TestConserved<ZMobiusFermionR>(ZDmob,ZDmobrev,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<ZMobiusFermionR>(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5,&ZDmobrev);

  Grid_finalize();
 }
@@ -162,20 +163,15 @@ int main (int argc, char ** argv)

 template<class Action> 
 void  TestConserved(Action & Ddwf,
-		    Action & Ddwfrev, 
 		    LatticeGaugeField &Umu,
 		    GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
 		    GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		    RealD mass, RealD M5,
 		    GridParallelRNG *RNG4,
-		    GridParallelRNG *RNG5)
+		    GridParallelRNG *RNG5,
+                    Action * Ddwfrev)
 {
-  int Ls=Ddwf.Ls;
-
  LatticePropagator phys_src(UGrid);
-
-  std::vector<LatticeColourMatrix> U(4,UGrid);
-  
  LatticePropagator seqsrc(FGrid);
  LatticePropagator prop5(FGrid); 
  LatticePropagator prop5rev(FGrid); 
@@ -194,9 +190,9 @@ void  TestConserved(Action & Ddwf,
  phys_src=Zero();
  pokeSite(kronecker,phys_src,coor);
  
-  MdagMLinearOperator<Action,LatticeFermion> HermOp(Ddwf);
-  MdagMLinearOperator<Action,LatticeFermion> HermOprev(Ddwfrev);
  ConjugateGradient<LatticeFermion> CG(1.0e-16,100000);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
+  ZeroGuesser<LatticeFermion> zpg;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      LatticeFermion src4  (UGrid); 
@@ -206,20 +202,20 @@ void  TestConserved(Action & Ddwf,
      Ddwf.ImportPhysicalFermionSource(src4,src5);

      LatticeFermion result5(FGrid); result5=Zero();
-
-      // CGNE
-      LatticeFermion Mdagsrc5  (FGrid); 
-      Ddwf.Mdag(src5,Mdagsrc5);
-      CG(HermOp,Mdagsrc5,result5);
+      schur(Ddwf,src5,result5,zpg);
+      std::cout<<GridLogMessage<<"spin "<<s<<" color "<<c<<" norm2(sourc5d) "<<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
      FermToProp<Action>(prop5,result5,s,c);

      LatticeFermion result4(UGrid);
      Ddwf.ExportPhysicalFermionSolution(result5,result4);
      FermToProp<Action>(prop4,result4,s,c);

-      Ddwfrev.ImportPhysicalFermionSource(src4,src5);
-      Ddwfrev.Mdag(src5,Mdagsrc5);
-      CG(HermOprev,Mdagsrc5,result5);
+      if( Ddwfrev ) {
+        Ddwfrev->ImportPhysicalFermionSource(src4,src5);
+        result5 = Zero();
+        schur(*Ddwfrev,src5,result5,zpg);
+      }
      FermToProp<Action>(prop5rev,result5,s,c);
    }
  }
@@ -251,11 +247,7 @@ void  TestConserved(Action & Ddwf,
      PropToFerm<Action>(src5,seqsrc,s,c);

      LatticeFermion result5(FGrid); result5=Zero();
-
-      // CGNE
-      LatticeFermion Mdagsrc5  (FGrid); 
-      Ddwf.Mdag(src5,Mdagsrc5);
-      CG(HermOp,Mdagsrc5,result5);
+      schur(Ddwf,src5,result5,zpg);

      LatticeFermion result4(UGrid);
      Ddwf.ExportPhysicalFermionSolution(result5,result4);
@@ -276,10 +268,10 @@ void  TestConserved(Action & Ddwf,
  Ddwf.ContractConservedCurrent(prop5rev,prop5,Vector_mu,phys_src,Current::Vector,Tdir);
  Ddwf.ContractJ5q(prop5,PJ5q);
  
-  PA       = trace(g5*Axial_mu);
-  SV       = trace(Vector_mu);
-  VV       = trace(gT*Vector_mu);
-  PP       = trace(adj(prop4)*prop4);
+  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
+  SV       = trace(Vector_mu);        // Scalar-Vector conserved current
+  VV       = trace(gT*Vector_mu);     // (local) Vector-Vector conserved current
+  PP       = trace(adj(prop4)*prop4); // Pseudoscalar density
  
  // Spatial sum
  sliceSum(PA,sumPA,Tdir);
@@ -288,15 +280,17 @@ void  TestConserved(Action & Ddwf,
  sliceSum(PP,sumPP,Tdir);
  sliceSum(PJ5q,sumPJ5q,Tdir);

-  int Nt=sumPA.size();
+  const int Nt{static_cast<int>(sumPA.size())};
+  std::cout<<GridLogMessage<<"Vector Ward identity by timeslice (~ 0)"<<std::endl;
  for(int t=0;t<Nt;t++){
-    std::cout <<" SV "<<real(TensorRemove(sumSV[t]));
-    std::cout <<" VV "<<real(TensorRemove(sumVV[t]))<<std::endl;
+    std::cout<<GridLogMessage <<" t "<<t<<" SV "<<real(TensorRemove(sumSV[t]))<<" VV "<<real(TensorRemove(sumVV[t]))<<std::endl;
  }
+  std::cout<<GridLogMessage<<"Axial Ward identity by timeslice (defect ~ 0)"<<std::endl;
  for(int t=0;t<Nt;t++){
-    std::cout <<" PAc "<<real(TensorRemove(sumPA[t]));
-    std::cout <<" PJ5q "<<real(TensorRemove(sumPJ5q[t]));
-    std::cout <<" Ward Identity defect " <<real(TensorRemove(sumPA[t]-sumPA[(t-1+Nt)%Nt] - 2.0*(Ddwf.mass*sumPP[t] + sumPJ5q[t]) ))<<"\n";
+    const RealD DmuPAmu{real(TensorRemove(sumPA[t]-sumPA[(t-1+Nt)%Nt]))};
+    std::cout<<GridLogMessage<<" t "<<t<<" DmuPAmu "<<DmuPAmu
+             <<" PP "<<real(TensorRemove(sumPP[t]))<<" PJ5q "<<real(TensorRemove(sumPJ5q[t]))
+             <<" Ward Identity defect " <<(DmuPAmu - 2.*real(TensorRemove(Ddwf.mass*sumPP[t] + sumPJ5q[t])))<<std::endl;
  }
  
  ///////////////////////////////
--- a/tests/debug/Test_heatbath_dwf_eofa.cc
+++ b/tests/debug/Test_heatbath_dwf_eofa.cc
@@ -66,7 +66,9 @@ int main(int argc, char** argv)
  // Set up RNGs
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
+  GridSerialRNG sRNG;
  GridParallelRNG RNG5(FGrid);
+  sRNG.SeedFixedIntegers(seeds5);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
@@ -84,7 +86,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -94,7 +96,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Antonin Portelli	d4290a7434	finer timers in Benchmark_IO	2021-06-17 11:57:02 +01:00
Peter Boyle	92def28bd3	Update README.md	2021-06-06 04:52:05 -04:00
Antonin Portelli	ca10bfa1c7	removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore)	2021-06-04 11:12:22 +01:00
Peter Boyle	0e27e3847d	Remove synch	2021-06-03 04:24:19 +00:00
u61464	8cfc7342cd	staggered hand unroll read coalesce	2021-05-05 14:17:18 -07:00
u61464	15ae317858	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-05-04 08:40:38 -07:00
u61464	834f536b5f	Fastest option on SyCL is now std::complex	2021-05-04 08:40:18 -07:00
Peter Boyle	c332d9f08b	Merge pull request #356 from felixerben/bugfix/stoutSmearing Jamie's fix	2021-04-27 14:10:49 -04:00
Felix Erben	cf2923d5dd	Jamie's fix	2021-04-27 16:53:37 +01:00
Peter Boyle	0e4413ddde	Merge pull request #355 from felixerben/bugfix/stoutSmearing bugfix 3D stout smearing	2021-04-27 08:01:55 -04:00
Felix Erben	009ccd581e	bugfix 3D stout smearing	2021-04-26 10:36:33 +01:00
Peter Boyle	8cd4263974	Tests compile	2021-04-25 22:20:37 -04:00
Peter Boyle	d45c868656	Change interface	2021-04-25 10:53:34 -04:00
Peter Boyle	955a8113de	Expose label only to reduce number of parameters	2021-04-25 10:36:38 -04:00
Peter Boyle	dbe210dd53	Open the ens_id	2021-04-25 10:25:59 -04:00
Peter Boyle	86e11743ca	set twists	2021-04-20 10:19:11 -04:00
Peter Boyle	980e721f6e	Update MetaData.h	2021-04-13 09:33:01 -04:00
Peter Boyle	e2a0142d87	Merge pull request #348 from AndrewYongZhenNing/develop Conserved Tadpole Implementation for Shamir Action Only	2021-04-06 10:49:00 -04:00
Andrew Zhen Ning Yong	895244ecc3	Merge with upstream; implemented conserved tadpole for Shamir action.	2021-04-06 13:46:33 +01:00
Andrew Zhen Ning Yong	addeb621a7	Implemented tadpole operator for Shamir action.	2021-04-06 13:45:37 +01:00
Peter Boyle	a7fb25adf6	Make Cshift fields static to avoid repeated reallocate overhead	2021-03-29 21:44:14 +02:00
Peter Boyle	e947992957	Improved force terms	2021-03-29 20:04:06 +02:00
Peter Boyle	bb89a82a07	Staggered coalesced read	2021-03-29 20:01:15 +02:00
Peter Boyle	8bdadbadac	Cold start	2021-03-18 15:41:14 -04:00
Peter Boyle	15c50a7442	Explicit instantiate the template function	2021-03-18 15:40:42 -04:00
Peter Boyle	49b0af2c95	Update of tests to compile with the sRNG addition. Audited the code conventions (again) with the CPS momentum denominator and added anti periodic in time to the Test_mobius_force.cc and tested the Test_dwf_gpforce. Promoted these to test full HMC hamiltonian, tr P^2/2 + phidag MdagM phi with the same pdot and Udot as audited in the Integrator.h etc... With full comments and sources for factors.	2021-03-18 09:10:02 -04:00
Peter Boyle	9c2b37218a	sRNG parameter added	2021-03-18 06:24:11 -04:00
Peter Boyle	3c67d626ba	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-12 15:36:55 +01:00
Peter Boyle	51f506553c	Read out the local ID once, and store	2021-03-12 15:33:04 +01:00
Peter Boyle	226be84937	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-12 09:31:50 -05:00
Peter Boyle	001814b442	updated to do list. Start adding DDHMC work items	2021-03-12 09:31:17 -05:00
Peter Boyle	db3ac67506	Update thread issue	2021-03-12 14:55:07 +01:00
Peter Boyle	da91a884ef	NVCC versions found buggy added as guard	2021-03-11 23:54:53 +01:00
Peter Boyle	a71e6755e3	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-11 22:43:06 +01:00
Peter Boyle	cd5891eecd	Test that fails on Cuda 11.0	2021-03-11 22:34:28 +01:00
Peter Boyle	5bb7336f27	Merge pull request #347 from pjgeorg/fix-autotools-avx512 Fix inconsistent SIMD option AVX512 Thanks	2021-03-11 16:29:07 -05:00
Peter Boyle	ce1fc1f48a	Possible fallback plan for Fionn's compiler bug in nvcc	2021-03-11 22:20:53 +01:00
Peter Georg	82402c6a7c	Add simd option SKL for ICC	2021-03-11 13:08:40 +01:00
Peter Georg	d9c4afe5b7	Fix inconsistent configure option AVX512 Before this change AVX512 enabled different instruction sets depending on the compiler: For Intel C++ Compiler Classic (ICC): AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL i.e. Intel Xeon Skylake and newer For Intel ICX, gcc, clang: AVX512F, AVX512CD, AVX512ER, AVX512PF i.e. Intel Xeon Phi x200/x205 (KNL/KNM) With this commit AVX512 now only enables the common instruction sets supported by all CPUs supporting any AVX-512 instructions set: AVX512F and AVX512CD (called COMMON-AVX512 by icc)	2021-03-11 12:58:49 +01:00
Peter Boyle	f786ff8d69	Extend test from Fionn, fails on A100 apparently	2021-03-10 14:32:06 -05:00
u61464	a651caed5f	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-10 06:23:51 -08:00
u61464	0e21adb3f6	Gives 200GF/s on SyCL/DG1 8^4, doesn't uglify develop for other platforms too badly. Easy to revert to clean more C++ stylistic code. There's a SYCL_HACK macro I will clean up later once dpcpp evolves a central nervous system.	2021-03-10 05:40:51 -08:00
Peter Boyle	58bf9b9e6d	Clean up test	2021-03-10 02:45:22 +01:00
Peter Boyle	2146eebb65	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-09 04:31:46 +01:00
Peter Boyle	6a429ee6d3	2d loop hits Nvidia 16bit limit on large local vols	2021-03-09 04:31:10 +01:00
Peter Boyle	4d1ea15c79	More verbosity. The 16bit limit on Grid.y, Grid.z is annoying	2021-03-09 04:29:37 +01:00
Peter Boyle	a76cb005e0	Update Tensor_exp.h	2021-03-08 13:37:57 -05:00
Peter Boyle	a9604367c1	Merge pull request #336 from lehner/feature/gpt Make ShmDims configurable; adjust GRID_MAX_SIMD to allow for 128 byte width on GPUs	2021-03-05 13:17:19 -05:00
Peter Boyle	d7065023cc	Merge pull request #332 from mmphys/feature/mres_schur Optional changes to Test_cayley_mres e.g. Schur solver	2021-03-05 12:47:07 -05:00
Peter Boyle	89d299ceec	Merge pull request #333 from mmphys/bugfix/LatTransfer Fix convertType for GPU in Lattice_transfer.h	2021-03-05 12:46:33 -05:00
Peter Boyle	e34eda66df	Merge pull request #344 from felixerben/feature/XiToSigma Feature/xi to sigma	2021-03-05 12:45:44 -05:00
Christoph Lehner	b24181aa4f	Update Coordinate.h Revert GRID_MAX_SIMD change	2021-03-05 16:56:58 +01:00
Peter Boyle	aa173e2998	Update README.md	2021-03-05 10:25:33 -05:00
Felix Erben	7a19432e0b	whitespace	2021-03-05 10:57:09 +00:00
Felix Erben	9b15704290	tested and consistent	2021-03-05 10:42:32 +00:00
Michael Marshall	017f955b2d	Merge branch 'develop' into feature/mres_schur * develop: Pass serial RNG around Sycl happier	2021-03-04 20:42:02 +00:00
Michael Marshall	f252d69eef	Merge branch 'develop' into bugfix/LatTransfer * develop: Pass serial RNG around Sycl happier	2021-03-04 20:41:30 +00:00
Felix Erben	3b06e4655e	Merge branch 'develop' into feature/XiToSigma	2021-03-04 20:06:16 +00:00
Felix Erben	d4b4de8f42	changes	2021-03-04 20:01:24 +00:00
Peter Boyle	c90beee774	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-03 23:50:29 +01:00
Peter Boyle	1eea9d73b9	Pass serial RNG around	2021-03-03 23:50:01 +01:00
u61464	679d1d22f7	Sycl happier	2021-03-03 11:21:43 -08:00
Michael Marshall	b2b5e0b98c	Merge branch 'develop' into feature/mres_schur * develop: Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path. Better SIMD usage/coalescence	2021-03-03 16:15:12 +00:00
Michael Marshall	03e54722c1	Merge branch 'develop' into bugfix/LatTransfer * develop: Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.	2021-03-03 16:13:23 +00:00
Peter Boyle	442336bd96	Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.	2021-03-02 14:50:51 +01:00
Christoph Lehner	9c9566b9c9	Merge pull request #23 from paboyle/develop Sync	2021-03-01 12:33:51 +01:00
Michael Marshall	1059a81a3c	Merge branch 'develop' into bugfix/LatTransfer * develop: Better SIMD usage/coalescence	2021-02-27 00:21:36 +00:00
Peter Boyle	2e61556389	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-02-26 17:52:20 +01:00
Peter Boyle	f9b1f240f6	Better SIMD usage/coalescence	2021-02-26 17:51:41 +01:00
Michael Marshall	69f41469dd	Merge branch 'develop' into bugfix/LatTransfer * develop: (26 commits) Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links Correct misleading ac help string Enable performance counting in WilsonFermion like in others changed back A2AUtils warning changed if and accelerator_for - no runtime errors any more Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons. Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error maxLocalNorm2() change back benchmark_ITT prettify Flop cout matches DiRAC-ITT-2020 revert changes merge develop fixes weird bug in 2pt function... revert changes final version, tested on CPU and GPU bugfix ...	2021-02-25 09:19:17 +00:00
Michael Marshall	d620b303ff	Merge branch 'develop' into feature/mres_schur * develop: (26 commits) Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links Correct misleading ac help string Enable performance counting in WilsonFermion like in others changed back A2AUtils warning changed if and accelerator_for - no runtime errors any more Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons. Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error maxLocalNorm2() change back benchmark_ITT prettify Flop cout matches DiRAC-ITT-2020 revert changes merge develop fixes weird bug in 2pt function... revert changes final version, tested on CPU and GPU bugfix ...	2021-02-24 18:07:27 +00:00
Peter Boyle	157fd1428d	Merge pull request #342 from paboyle/feature/link-update-mask Feature/link update mask	2021-02-24 11:29:52 -05:00
Christopher Kelly	c791cb2214	Merge branch 'develop' into feature/link-update-mask	2021-02-23 11:51:54 -05:00
Christopher Kelly	d5ab571a89	Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links	2021-02-23 11:49:56 -05:00
Felix Erben	0ed800f6e4	merge develop	2021-02-23 14:54:46 +00:00
Peter Boyle	0a32183825	Merge pull request #335 from felixerben/gpu/baryons Gpu/baryons	2021-02-23 09:30:16 -05:00
Peter Boyle	2cacfbde2a	Merge pull request #341 from DanielRichtmann/fix/minor-things Minor fixes	2021-02-22 09:28:50 -05:00
Daniel Richtmann	c073e62e0b	Correct misleading ac help string	2021-02-22 15:25:44 +01:00
Daniel Richtmann	e3d019bc2f	Enable performance counting in WilsonFermion like in others	2021-02-22 15:25:40 +01:00
Felix Erben	7ae030f585	changed back A2AUtils warning	2021-02-18 13:24:50 +00:00
Felix Erben	86b58d5aff	changed if and accelerator_for - no runtime errors any more	2021-02-18 12:04:32 +00:00
Peter Boyle	26e8b9f4a5	Merge pull request #340 from mmphys/bugfix/config Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu	2021-02-17 11:56:21 -05:00
Michael Marshall	35114c9e62	Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu	2021-02-17 13:24:15 +00:00
Peter Boyle	dfd28a85c9	Merge pull request #339 from mmphys/bugfix/config Optional rename PACKAGE_ to GRID_ in Grid/Config.h	2021-02-15 13:53:26 -05:00
Michael Marshall	a503332924	Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.	2021-02-14 21:27:54 +00:00
Christoph Lehner	4705aa541d	Allow user to configure ShmDims via environment variables	2021-02-04 14:25:55 +01:00
Michael Marshall	3215d88a91	Simplify syntax with Grid::EnableIf post code review. Updated EnableIf so that ReturnType defaults to void in same way as std::enable_if see https://en.cppreference.com/w/cpp/types/enable_if	2021-02-03 15:17:03 +00:00
Felix Erben	9b9a53f870	...	2021-02-02 13:06:43 +00:00
Christoph Lehner	019ffe17d4	Allow for GPU vector width beyond 64	2021-02-02 11:32:23 +01:00
Felix Erben	bc496dd844	change back benchmark_ITT	2021-01-28 14:29:56 +00:00
Felix Erben	a673b6a54d	prettify	2021-01-28 14:15:09 +00:00
Felix Erben	1bf2e4d187	Merge branch 'develop' into gpu/baryons	2021-01-27 21:17:37 +00:00
Peter Boyle	96dd7a8fbd	Flop cout matches DiRAC-ITT-2020	2021-01-27 21:14:52 +00:00
Felix Erben	7905afa9f5	revert changes	2021-01-27 21:14:52 +00:00
Felix Erben	712bb40650	merge develop	2021-01-27 21:14:52 +00:00
Felix Erben	81d88d9f4d	fixes	2021-01-27 21:09:51 +00:00
Michael Marshall	77063418da	Fix issue for GPU by ensuring accelerator_inline version of convertType is available for Grid::complex<T>. This removes many warnings in Hadrons Simplify the SFINAE syntax and correct convertType for iScalar	2021-01-25 15:09:36 +00:00
Michael Marshall	2983b6fdf6	Optional (superficial) changes to make comparison with Hadrons WardIdentity module easier: use Schur solver; example of Hadrons random gauge init; logging updates; only solve reverse propagator if provided	2021-01-23 12:41:48 +00:00
Felix Erben	df16202865	weird bug in 2pt function...	2021-01-19 19:25:27 +00:00
Felix Erben	3ff7c2c02a	Merge branch 'develop' into gpu/baryons	2021-01-19 12:34:13 +00:00
Felix Erben	fc6d07897f	revert changes	2021-01-19 12:32:48 +00:00
Felix Erben	f9c8e5c8ef	Merge branch 'develop' of github.com:paboyle/Grid into develop	2021-01-19 12:30:29 +00:00
Felix Erben	8bfa0e74f8	final version, tested on CPU and GPU	2021-01-19 12:27:57 +00:00
Felix Erben	9b73a937e7	bugfix	2021-01-18 18:57:05 +00:00
Felix Erben	fa12b9a329	bugfix	2021-01-13 10:04:17 +00:00
Felix Erben	45fc7ded3a	test for sum	2021-01-12 09:10:37 +00:00
Felix Erben	74de2d9742	whitespace changes	2021-01-08 18:28:36 +00:00
Felix Erben	e759367d42	tested and working	2021-01-08 18:04:50 +00:00
Christoph Lehner	299d0de066	Merge pull request #21 from paboyle/develop Sync	2020-12-22 20:59:15 +01:00
Christoph Lehner	b4c1317ab4	Merge pull request #22 from DanielRichtmann/feature/clover-access-specifier Clover access specifier	2020-12-18 16:20:19 +01:00
Felix Erben	f36d6f3923	compiles on GPU. 3pt still wrong!!!!	2020-12-17 17:04:08 +00:00
Felix Erben	808f1e0e8c	merge develop	2020-12-15 16:33:29 +00:00
Daniel Richtmann	c438118fd7	Change access specifier of clover fields in order to allow deriving classes to access these	2020-12-08 14:42:11 +01:00
Christoph Lehner	17ec9c5545	Merge pull request #20 from paboyle/develop Sync	2020-11-24 12:20:43 +01:00
Felix Erben	3594ce877b	speedup in Sigma-to-nucleon	2020-11-03 20:04:30 +00:00
Felix Erben	9bae6b889a	speedup in Sigma-to-nucleon	2020-11-03 20:03:09 +00:00
Felix Erben	4014dfd5b9	first tested version	2020-11-03 16:13:08 +00:00
Felix Erben	67023c334b	bugfix	2020-11-03 13:07:37 +00:00
Felix Erben	a3de7026c8	bugfix	2020-11-03 12:51:50 +00:00
Felix Erben	ee11678b1f	added Xi-to-Sigma rare decays	2020-11-03 12:41:35 +00:00