finer timers in Benchmark_IO

Update README.md
Remove Travis CI: builds constantly fail by exceeding the time limit (Grid can no longer be compiled within the free tier's time allowance)
2026-07-19 16:43:27 +01:00 · 2021-06-17 11:57:02 +01:00 · 2021-06-06 04:52:05 -04:00 · 2021-06-04 11:12:22 +01:00 · 2021-06-03 04:24:19 +00:00 · 2021-05-05 14:17:18 -07:00
88 changed files with 2736 additions and 1456 deletions
@@ -1,56 +0,0 @@
-language: cpp
-
-cache:
-  directories:
-    - clang
-
-matrix:
-  include:
-    - os:        osx
-      osx_image: xcode8.3
-      compiler: clang
-      
-before_install:
-    - export GRIDDIR=`pwd`
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
-    
-install:
-    - export CWD=`pwd`
-    - echo $CWD
-    - export CC=$CC$VERSION
-    - export CXX=$CXX$VERSION
-    - echo $PATH
-    - which autoconf
-    - autoconf  --version
-    - which automake
-    - automake  --version
-    - which $CC
-    - $CC  --version
-    - which $CXX
-    - $CXX --version
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
-    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
-    
-script:
-    - ./bootstrap.sh
-    - mkdir build
-    - cd build
-    - mkdir lime
-    - cd lime
-    - mkdir build
-    - cd build
-    - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
-    - tar xf lime-1.3.2.tar.gz
-    - cd lime-1.3.2
-    - ./configure --prefix=$CWD/build/lime/install
-    - make -j4
-    - make install
-    - cd $CWD/build
-    - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
-    - make -j4 
-    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
-    - make check
@@ -54,9 +54,11 @@ Version.h: version-cache
 include Make.inc
 include Eigen.inc

-extra_sources+=$(ZWILS_FERMION_FILES)
 extra_sources+=$(WILS_FERMION_FILES)
 extra_sources+=$(STAG_FERMION_FILES)
+if BUILD_ZMOBIUS
+  extra_sources+=$(ZWILS_FERMION_FILES)
+endif
 if BUILD_GPARITY
  extra_sources+=$(GP_FERMION_FILES)
 endif
@@ -36,7 +36,7 @@ static const int CbBlack=1;
 static const int Even   =CbRed;
 static const int Odd    =CbBlack;

-accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
+accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
 {
  int nd=rdim.size();
  Coordinate coor(nd);
@@ -1,4 +1,3 @@
-
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
@@ -108,6 +107,8 @@ public:
  ////////////////////////////////////////////////////////////
  // Reduction
  ////////////////////////////////////////////////////////////
+  void GlobalMax(RealD &);
+  void GlobalMax(RealF &);
  void GlobalSum(RealF &);
  void GlobalSumVector(RealF *,int N);
  void GlobalSum(RealD &);
@@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
+// In-place MPI_MAX all-reduce of a single float over `communicator`.
+void CartesianCommunicator::GlobalMax(float &f)
+{
+  const int status = MPI_Allreduce(MPI_IN_PLACE, &f, 1, MPI_FLOAT, MPI_MAX, communicator);
+  assert(status==0);
+}
+// In-place MPI_MAX all-reduce of a single double over `communicator`.
+void CartesianCommunicator::GlobalMax(double &d)
+{
+  const int status = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_MAX, communicator);
+  assert(status==0);
+}
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
@@ -67,6 +67,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)

 CartesianCommunicator::~CartesianCommunicator(){}

+void CartesianCommunicator::GlobalMax(float &){}
+void CartesianCommunicator::GlobalMax(double &){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
@@ -7,6 +7,7 @@
    Copyright (C) 2015

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -169,6 +170,23 @@ static inline int divides(int a,int b)
 }
 void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
 {
+  ////////////////////////////////////////////////////////////////
+  // Allow user to configure through environment variable
+  ////////////////////////////////////////////////////////////////
+  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
+  if ( str ) {
+    std::vector<int> IntShmDims;
+    GridCmdOptionIntVector(std::string(str),IntShmDims);
+    assert(IntShmDims.size() == WorldDims.size());
+    long ShmSize = 1;
+    for (int dim=0;dim<WorldDims.size();dim++) {
+      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
+      assert(divides(ShmDims[dim],WorldDims[dim]));
+    }
+    assert(ShmSize == WorldShmSize);
+    return;
+  }
+  
  ////////////////////////////////////////////////////////////////
  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
@@ -110,9 +110,11 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
  int n1=rhs.Grid()->_slice_stride[dimension];

  if ( cbmask ==0x3){
-#ifdef ACCELERATOR_CSHIFT    
+#ifdef ACCELERATOR_CSHIFT
    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;
 	int o      =   n*n1;
 	int offset = b+n*e2;
 	
@@ -135,7 +137,9 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
    std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
 #ifdef ACCELERATOR_CSHIFT    
    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;

 	Coordinate coor;

@@ -257,7 +261,9 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
    int _slice_block = rhs.Grid()->_slice_block[dimension];
 #ifdef ACCELERATOR_CSHIFT    
    autoView( rhs_v , rhs, AcceleratorWrite);
-    accelerator_for2d(n,e1,b,e2,1,{
+    accelerator_for(nn,e1*e2,1,{
+	int n = nn%e1;
+	int b = nn/e1;
 	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
@@ -274,7 +280,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA

    // Case of SIMD split AND checker dim cannot currently be hit, except in 
    // Test_cshift_red_black code.
-    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
    assert(0); // This will fail if hit on GPU
    autoView( rhs_v, rhs, CpuWrite);
@@ -122,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf(buffer_size);
-  cshiftVector<vobj> recv_buf(buffer_size);
+  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
    
  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -198,8 +198,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
 
@@ -294,8 +294,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
  assert(shift<fd);
  
  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  cshiftVector<vobj> send_buf_v(buffer_size);
-  cshiftVector<vobj> recv_buf_v(buffer_size);
+  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
  vobj *send_buf;
  vobj *recv_buf;
  {
@@ -381,8 +381,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
  //  int words = sizeof(vobj)/sizeof(vector_type);

-  std::vector<cshiftVector<scalar_object> >  send_buf_extract(Nsimd);
-  std::vector<cshiftVector<scalar_object> >  recv_buf_extract(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
@@ -96,8 +96,34 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  ssobj ret = ssum;
  return ret;
 }
+/*
+Threaded max, don't use for now
+template<class Double>
+inline Double max(const Double *arg, Integer osites)
+{
+  //  const int Nsimd = vobj::Nsimd();
+  const int nthread = GridThread::GetThreads();

-
+  std::vector<Double> maxarray(nthread);
+  
+  thread_for(thr,nthread, {
+    int nwork, mywork, myoff;
+    nwork = osites;
+    GridThread::GetWork(nwork,thr,mywork,myoff);
+    Double max=arg[0];
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      if( arg[ss] > max ) max = arg[ss];
+    }
+    maxarray[thr]=max;
+  });
+  
+  Double tmax=maxarray[0];
+  for(int i=0;i<nthread;i++){
+    if (maxarray[i]>tmax) tmax = maxarray[i];
+  } 
+  return tmax;
+}
+*/
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
@@ -141,6 +167,32 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }

+// The global maximum of the site norm2.
+// Computes localNorm2(arg), scans every local site for the largest value,
+// then reduces that value across ranks with grid->GlobalMax().
+template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
+{
+  typedef typename vobj::tensor_reduced vscalar;  //iScalar<iScalar<.... <vPODtype> > >
+  typedef typename vscalar::scalar_object  scalar;   //iScalar<iScalar<.... <PODtype> > >
+
+  Lattice<vscalar> inner = localNorm2(arg);
+
+  auto grid = arg.Grid();
+
+  // Start from a defined value: if the loop below never executes (no local
+  // sites) an uninitialised max would otherwise be fed to GlobalMax and
+  // returned. The (l==0) test still seeds max from the first site, so the
+  // result is unchanged whenever there is at least one local site.
+  RealD max = 0.0;
+  for(int l=0;l<grid->lSites();l++){
+    Coordinate coor;
+    scalar val;
+    RealD r;
+    grid->LocalIndexToLocalCoor(l,coor);
+    peekLocalSite(val,inner,coor);
+    r=real(TensorRemove(val));
+    if( (l==0) || (r>max)){
+      max=r;
+    }
+  }
+  grid->GlobalMax(max);
+  return max;
+}
+
 // Double inner product
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -97,6 +97,20 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
  out = in;
 }

+template<typename T>
+accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
+  out = in;
+}
+
+// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
+/*template<typename T1, typename T2>
+accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
+// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
+//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
+convertType(T1 & out, const T2 & in) {
+  out = in;
+}*/
+
 #ifdef GRID_SIMT
 accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
  ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
@@ -117,23 +131,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
  Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
 }

-template<typename T1,typename T2,int N>
-  accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
-template<typename T1,typename T2,int N>
-  accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
-
-template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
-accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
-  convertType(out,in._internal);
+template<typename T1,typename T2>
+accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) {
+  convertType(out._internal,in._internal);
 }

-template<typename T1, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
-accelerator_inline void convertType(T1 & out, const iScalar<T1> & in) {
+template<typename T1,typename T2>
+accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) {
  convertType(out,in._internal);
 }

 template<typename T1,typename T2>
-accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
+accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) {
  convertType(out._internal,in);
 }

@@ -150,11 +159,6 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
    convertType(out._internal[i],in._internal[i]);
 }

-template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
-accelerator_inline void convertType(T & out, const T & in) {
-  out = in;
-}
-
 template<typename T1,typename T2>
 accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
  autoView( out_v , out,AcceleratorWrite);
@@ -67,9 +67,14 @@ public:
  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
 #endif

+#if 1
+  //  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
+  accelerator_inline vobj       & operator[](size_t i) const { return this->_odata[i]; };
+#else
  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; };
-
+#endif
+  
  accelerator_inline uint64_t begin(void) const { return 0;};
  accelerator_inline uint64_t end(void)   const { return this->_odata_size; };
  accelerator_inline uint64_t size(void)  const { return this->_odata_size; };
@@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
  conformable(iftrue,predicate);
  conformable(iftrue,ret);

-  GridBase *grid=iftrue._grid;
+  GridBase *grid=iftrue.Grid();

  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
@@ -52,22 +52,23 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<

  const int Nsimd = grid->Nsimd();

-  std::vector<Integer> mask(Nsimd);
-  std::vector<scalar_object> truevals (Nsimd);
-  std::vector<scalar_object> falsevals(Nsimd);
-
-  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
-
-    extract(iftrue._odata[ss]   ,truevals);
-    extract(iffalse._odata[ss]  ,falsevals);
-    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
-
-    for(int s=0;s<Nsimd;s++){
-      if (mask[s]) falsevals[s]=truevals[s];
+  autoView(iftrue_v,iftrue,CpuRead);
+  autoView(iffalse_v,iffalse,CpuRead);
+  autoView(predicate_v,predicate,CpuRead);
+  autoView(ret_v,ret,CpuWrite);
+  Integer NN= grid->oSites();
+  thread_for(ss,NN,{
+    Integer mask;
+    scalar_object trueval;
+    scalar_object falseval;
+    for(int l=0;l<Nsimd;l++){
+      trueval =extractLane(l,iftrue_v[ss]);
+      falseval=extractLane(l,iffalse_v[ss]);
+      mask    =extractLane(l,predicate_v[ss]);
+      if (mask) falseval=trueval;
+      insertLane(l,ret_v[ss],falseval);
    }
-
-    merge(ret._odata[ss],falsevals);
-  }
+  });
 }

 template<class vobj,class iobj>
@@ -76,9 +77,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);

-  Lattice<vobj> ret(iftrue._grid);
+  Lattice<vobj> ret(iftrue.Grid());

-  where(ret,predicate,iftrue,iffalse);
+  whereWolf(ret,predicate,iftrue,iffalse);

  return ret;
 }
@@ -271,7 +271,7 @@ class BinaryIO {
 			      uint32_t &scidac_csumb)
  {
    grid->Barrier();
-    GridStopWatch timer; 
+    GridStopWatch timer, insideTimer; 
    GridStopWatch bstimer;
    
    nersc_csum=0;
@@ -363,7 +363,10 @@ class BinaryIO {
 	std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
 	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);    assert(ierr==0);
 	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);    assert(ierr==0);
-	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);    assert(ierr==0);
+  insideTimer.Start();
+	ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);
+  insideTimer.Stop();
+  assert(ierr==0);
 	MPI_File_close(&fh);
 	MPI_Type_free(&fileArray);
 	MPI_Type_free(&localArray);
@@ -438,7 +441,9 @@ class BinaryIO {
        assert(ierr == 0);

        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
+        insideTimer.Start();
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
+        insideTimer.Stop();
        assert(ierr == 0);

        MPI_Offset os;
@@ -516,8 +521,13 @@ class BinaryIO {
    if ( control & BINARYIO_READ) std::cout << " read  ";
    else                          std::cout << " write ";
    uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-    std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
+    std::cout<< lastPerf.size <<"bytes in "<< timer.Elapsed() <<" "
 	     << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
+    std::cout << GridLogMessage << "IOobject: pure MPI IO call " 
+              << lastPerf.size <<" bytes in " 
+              << insideTimer.Elapsed() << " "
+              << lastPerf.size/1024./1024./(insideTimer.useconds()/1.0e6)
+              <<" MB/s "<<std::endl;

    std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;

@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
  std::time_t t = std::time(nullptr);
  std::tm tm_ = *std::localtime(&t);
  std::ostringstream oss; 
-  //      oss << std::put_time(&tm_, "%c %Z");
+  oss << std::put_time(&tm_, "%c %Z");
  header.creation_date = oss.str();
  header.archive_date  = header.creation_date;

@@ -205,11 +205,20 @@ public:
    std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
  }

+  // Preferred interface: forwards to the full overload with two_row=0 and
+  // bits32=1, passing the ensemble label through.
+  // GaugeStats is forwarded explicitly; without it the inner call would
+  // fall back to the default PeriodicGaugeStatistics and ignore the
+  // statistics type chosen by the caller.
+  template<class GaugeStats=PeriodicGaugeStatistics>
+  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
+					std::string file, 
+					std::string ens_label = std::string("DWF"))
+  {
+    writeConfiguration<GaugeStats>(Umu,file,0,1,ens_label);
+  }
  template<class GaugeStats=PeriodicGaugeStatistics>
  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
 					std::string file, 
 					int two_row,
-					int bits32)
+					int bits32,
+					std::string ens_label = std::string("DWF"))
  {
    typedef vLorentzColourMatrixD vobj;
    typedef typename vobj::scalar_object sobj;
@@ -219,8 +228,8 @@ public:
    // Following should become arguments
    ///////////////////////////////////////////
    header.sequence_number = 1;
-    header.ensemble_id     = "UKQCD";
-    header.ensemble_label  = "DWF";
+    header.ensemble_id     = std::string("UKQCD");
+    header.ensemble_label  = ens_label;

    typedef LorentzColourMatrixD fobj3D;
    typedef LorentzColour2x3D    fobj2D;
@@ -232,7 +241,7 @@ public:
    GaugeStats Stats; Stats(Umu,header);
    MachineCharacteristics(header);

-	uint64_t offset;
+    uint64_t offset;

    // Sod it -- always write 3x3 double
    header.floating_point = std::string("IEEE64BIG");
@@ -41,7 +41,7 @@ class Action
 public:
  bool is_smeared = false;
  // Heatbath?
-  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
+  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative
  virtual std::string action_name()    = 0;                             // return the action name
@@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

-#ifndef GRID_CUDA
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
-typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
-#endif
-
 NAMESPACE_END(Grid);

 ////////////////////
@@ -153,8 +153,8 @@ public:
  typedef typename Impl::StencilImpl             StencilImpl;		\
  typedef typename Impl::ImplParams               ImplParams;	        \
  typedef typename Impl::StencilImpl::View_type  StencilView;		\
-  typedef typename ViewMap<FermionField>::Type      FermionFieldView;	\
-  typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
+  typedef const typename ViewMap<FermionField>::Type      FermionFieldView;	\
+  typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;

 #define INHERIT_IMPL_TYPES(Base)		\
  INHERIT_GIMPL_TYPES(Base)			\
@@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
-NAMESPACE_CHECK(ImplStaggered5dVec);  
+// Deprecate Vec5d
+//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> 
+//NAMESPACE_CHECK(ImplStaggered5dVec);  


@@ -85,7 +85,7 @@ class MADWF
      maxiter     =_maxiter;
    };
   
-  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
+  void operator() (const FermionFieldo &src,FermionFieldo &sol5)
  {
    std::cout << GridLogMessage<< " ************************************************" << std::endl;
    std::cout << GridLogMessage<< "  MADWF-like algorithm                           " << std::endl;
@@ -114,8 +114,16 @@ class MADWF
    ///////////////////////////////////////
    //Import source, include Dminus factors
    ///////////////////////////////////////
-    Mato.ImportPhysicalFermionSource(src4,b); 
-    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
+    GridBase *src_grid = src.Grid();
+
+    assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
+
+    if ( src_grid == Mato.GaugeGrid() ) {
+      Mato.ImportPhysicalFermionSource(src,b);
+    } else {
+      b=src;
+    }
+    std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
    std::cout << GridLogMessage << " b    " <<norm2(b)<<std::endl;

    defect = b;
@@ -72,19 +72,23 @@ public:
    
  StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
      
-  static accelerator_inline void multLink(SiteSpinor &phi,
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
 		       const SiteDoubledGaugeField &U,
-		       const SiteSpinor &chi,
+		       const _Spinor &chi,
 		       int mu)
  {
-    mult(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &UU, &chi());
  }
-  static accelerator_inline void multLinkAdd(SiteSpinor &phi,
+  template<class _Spinor>
+  static accelerator_inline void multLinkAdd(_Spinor &phi,
 			  const SiteDoubledGaugeField &U,
-			  const SiteSpinor &chi,
+			  const _Spinor &chi,
 			  int mu)
  {
-    mac(&phi(), &U(mu), &chi());
+    auto UU = coalescedRead(U(mu));
+    mac(&phi(), &UU, &chi());
  }
      
  template <class ref>
@@ -245,7 +245,7 @@ public:
    return out;
  }

-private:
+protected:
  // here fixing the 4 dimensions, make it more general?

  RealD csw_r;                                               // Clover coefficient - spatial
@@ -61,7 +61,7 @@ public:
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);

-  accelerator_inline int CommDatumSize(void) {
+  accelerator_inline int CommDatumSize(void) const {
    return sizeof(SiteHalfCommSpinor);
  }

@@ -69,7 +69,7 @@ public:
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
  template<class _SiteHalfSpinor, class _SiteSpinor>
-  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
    _SiteHalfSpinor tmp;
    projector::Proj(tmp,in,mu,dag);
    vstream(buf[o],tmp);
@@ -81,7 +81,7 @@ public:
  accelerator_inline void Exchange(SiteHalfSpinor *mp,
 				   const SiteHalfSpinor * __restrict__ vp0,
 				   const SiteHalfSpinor * __restrict__ vp1,
-				   Integer type,Integer o){
+				   Integer type,Integer o) const {
    SiteHalfSpinor tmp1;
    SiteHalfSpinor tmp2;
    exchange(tmp1,tmp2,vp0[o],vp1[o],type);
@@ -93,7 +93,7 @@ public:
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
  accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
-				     SiteHalfSpinor * __restrict__ in, Integer o) {    
+				     SiteHalfSpinor * __restrict__ in, Integer o) const {    
    assert(0);
  }

@@ -103,7 +103,7 @@ public:
  accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
 					   SiteHalfSpinor * __restrict__ out1,
 					   const SiteSpinor * __restrict__ in,
-					   Integer j,Integer k, Integer m,Integer type)
+					   Integer j,Integer k, Integer m,Integer type) const
  {
    SiteHalfSpinor temp1, temp2;
    SiteHalfSpinor temp3, temp4;
@@ -117,7 +117,7 @@ public:
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  accelerator_inline bool DecompressionStep(void) { return false; }
+  accelerator_inline bool DecompressionStep(void) const { return false; }

 };

@@ -142,7 +142,7 @@ public:
  typedef typename SiteHalfSpinor::vector_type     vComplexHigh;
  constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);

-  accelerator_inline int CommDatumSize(void) {
+  accelerator_inline int CommDatumSize(void) const {
    return sizeof(SiteHalfCommSpinor);
  }

@@ -150,7 +150,7 @@ public:
  /* Compress includes precision change if mpi data is not same */
  /*****************************************************/
  template<class _SiteHalfSpinor, class _SiteSpinor>
-  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
+  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
    _SiteHalfSpinor hsp;
    SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
    projector::Proj(hsp,in,mu,dag);
@@ -163,7 +163,7 @@ public:
  accelerator_inline void Exchange(SiteHalfSpinor *mp,
                       SiteHalfSpinor *vp0,
                       SiteHalfSpinor *vp1,
-		       Integer type,Integer o){
+		       Integer type,Integer o) const {
    SiteHalfSpinor vt0,vt1;
    SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
    SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
@@ -175,7 +175,7 @@ public:
  /*****************************************************/
  /* Have a decompression step if mpi data is not same */
  /*****************************************************/
-  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){
+  accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
    SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
    precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
  }
@@ -186,7 +186,7 @@ public:
  accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
 			       SiteHalfSpinor *out1,
 			       const SiteSpinor *in,
-			       Integer j,Integer k, Integer m,Integer type){
+			       Integer j,Integer k, Integer m,Integer type) const {
    SiteHalfSpinor temp1, temp2,temp3,temp4;
    SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
    SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
@@ -200,7 +200,7 @@ public:
  /*****************************************************/
  /* Pass the info to the stencil */
  /*****************************************************/
-  accelerator_inline bool DecompressionStep(void) { return true; }
+  accelerator_inline bool DecompressionStep(void) const { return true; }

 };

@@ -72,7 +72,7 @@ public:
  typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
  typedef WilsonImplParams ImplParams;
  typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
-  typedef typename StencilImpl::View_type StencilView;
+  typedef const typename StencilImpl::View_type StencilView;
    
  ImplParams Params;

@@ -106,11 +106,15 @@ public:
 			    const _SpinorField & phi,
 			    int mu)
  {
+    const int Nsimd = SiteHalfSpinor::Nsimd();
    autoView( out_v, out, AcceleratorWrite);
    autoView( phi_v, phi, AcceleratorRead);
    autoView( Umu_v, Umu, AcceleratorRead);
-    accelerator_for(sss,out.Grid()->oSites(),1,{
-	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
+    typedef decltype(coalescedRead(out_v[0]))   calcSpinor;
+    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
+	calcSpinor tmp;
+	multLink(tmp,Umu_v[sss],phi_v(sss),mu);
+	coalescedWrite(out_v[sss],tmp);
    });
  }
 					   
@@ -180,18 +184,22 @@ public:
      mat = TraceIndex<SpinIndex>(P); 
    }
      
-    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
+    {
      for (int mu = 0; mu < Nd; mu++)
      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
    }

-
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
-      
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
+  { // For every site of mat's grid: sum outerProduct(Btilde,Atilde) over s in [0,Ls) and the spin index, and store it in Lorentz component mu of mat.
+#undef USE_OLD_INSERT_FORCE    // forces the #else (new) path below regardless of build flags
    int Ls=Btilde.Grid()->_fdimensions[0];
+    autoView( mat_v , mat, AcceleratorWrite); // NOTE(review): opened before the #ifdef, so the old path would call PokeIndex on mat while this write view is still open - confirm before re-enabling it
+#ifdef USE_OLD_INSERT_FORCE    
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
+      const int Nsimd = SiteSpinor::Nsimd();
      autoView( tmp_v , tmp, AcceleratorWrite);
      autoView( Btilde_v , Btilde, AcceleratorRead);
      autoView( Atilde_v , Atilde, AcceleratorRead);
@@ -204,6 +212,29 @@ public:
 	});
    }
    PokeIndex<LorentzIndex>(mat,tmp,mu);
+#else // new path: accumulate per site and write straight into mat_v, no temporary field and no PokeIndex
+    {
+      const int Nsimd = SiteSpinor::Nsimd();
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
+	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+	  zeroit(sum);  
+	  for(int s=0;s<Ls;s++){
+	    int sF = s+Ls*sU; // index into the fermion fields: Ls consecutive entries per gauge site sU
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+  	      auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF]()(spn) );
+	      auto op = outerProduct(bb,aa);
+  	      sum = sum + op;
+	    }
+	  }
+  	  coalescedWrite(mat_v[sU](mu)(), sum); // only component mu of mat at site sU is written
+      });
+    }
+#endif    
  }
 };

@@ -49,9 +49,17 @@ public:

  INHERIT_IMPL_TYPES(Impl);
  typedef FermionOperator<Impl> Base;
-   
+  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   // int table sized by STENCIL_MAX; type of the st_perm argument of HandDhopSiteSycl below
 public:

+#ifdef GRID_SYCL
+#define SYCL_HACK // switched on automatically for GRID_SYCL builds
+#endif  
+#ifdef SYCL_HACK
+  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf, // raw-pointer variant: stencil entries, links and spinors arrive as plain pointers
+			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
+#endif
+  
  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,
 			 int interior=1,int exterior=1) ;
@@ -880,11 +880,23 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
  }

  std::vector<RealD> G_s(Ls,1.0);
+  Integer sign = 1; // overall sign of the forward-hop term; set to -1 only in the Tadpole branch below
  if ( curr_type == Current::Axial ) {
    for(int s=0;s<Ls/2;s++){
      G_s[s] = -1.0;
    }
  }
+  else if ( curr_type == Current::Tadpole ) { // Tadpole is only supported for b==1, c==0 (what the error text below calls the Shamir case)
+    auto b=this->_b;
+    auto c=this->_c;
+    if ( b == 1 && c == 0 ) { // NOTE(review): exact equality on b and c - fragile if they are computed floating-point values; confirm their type
+      sign = -1;    // applied where sign multiplies the forward-hop term further down
+    }
+    else {
+      std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
+      assert(b==1 && c==0); // always false in this branch; NOTE(review): compiled out under NDEBUG, in which case execution continues with sign == 1
+    }
+  }

  for(int s=0;s<Ls;s++){

@@ -907,7 +919,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

    tmp    = Cshift(tmp,mu,1);
    Impl::multLinkField(Utmp,this->Umu,tmp,mu);
-    tmp    = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
+    tmp    = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
    tmp    = where((lcoor>=tmin),tmp,zz); // Mask the time 
    L_Q    = where((lcoor<=tmax),tmp,zz); // Position of current complicated

@@ -680,7 +680,8 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
  gauge2 =(uint64_t)&UU[sU]( Z );				\
  gauge3 =(uint64_t)&UU[sU]( T ); 
  
-
+#undef STAG_VEC5D // NOTE(review): undefined unconditionally, so the #ifdef on the next line is never taken and the Vec5d DhopSiteAsm specialisations it guards are compiled out
+#ifdef STAG_VEC5D
  // This is the single precision 5th direction vectorised kernel
 #include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
@@ -790,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
 #endif
 }
   
-   
+#endif   


 #define PERMUTE_DIR3 __asm__ (	\
@@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-#define LOAD_CHI(b)		\
+#ifdef GRID_SIMT
+// SIMT build: every load goes through coalescedRead*/lane. Expansion sites must provide offset, perm, lane and Chi_0..Chi_2.
+#define LOAD_CHI(ptype,b)			\
+  const SiteSpinor & ref (b[offset]);				\
+  Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane);	\
+  Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane);	\
+  Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
+// LOAD_CHI_COMMS: same three reads from b[offset] without any permutation (the call sites in this file pass buf).
+#define LOAD_CHI_COMMS(b)		\
  const SiteSpinor & ref (b[offset]);	\
-    Chi_0=ref()()(0);\
-    Chi_1=ref()()(1);\
-    Chi_2=ref()()(2);
+  Chi_0=coalescedRead(ref()()(0),lane);	\
+  Chi_1=coalescedRead(ref()()(1),lane);	\
+  Chi_2=coalescedRead(ref()()(2),lane);
+// The permutation is already folded into LOAD_CHI above, so PERMUTE_DIR is an empty statement in this build.
+#define PERMUTE_DIR(dir)	;
+#else // non-SIMT build: plain element reads; the permutation is a separate step done by PERMUTE_DIR
+#define LOAD_CHI(ptype,b)      LOAD_CHI_COMMS(b)
+// LOAD_CHI ignores ptype in this build and simply forwards to LOAD_CHI_COMMS, defined next.
+#define LOAD_CHI_COMMS(b)		\
+  const SiteSpinor & ref (b[offset]);	\
+  Chi_0=ref()()(0);			\
+  Chi_1=ref()()(1);			\
+  Chi_2=ref()()(2);
+// In-place permute of the three loaded components; dir is token-pasted onto the function name (permute##dir).
+#define PERMUTE_DIR(dir)			\
+  permute##dir(Chi_0,Chi_0);			\
+  permute##dir(Chi_1,Chi_1);			\
+  permute##dir(Chi_2,Chi_2);
+
+#endif


 // To splat or not to splat depends on the implementation
 #define MULT(A,UChi)				\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0  = U_00*Chi_0;	       \
    UChi ## _1  = U_10*Chi_0;\
    UChi ## _2  = U_20*Chi_0;\
@@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);

 #define MULT_ADD(U,A,UChi)			\
  auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));      \
-   Impl::loadLinkElement(U_10,ref()(1,0));      \
-   Impl::loadLinkElement(U_20,ref()(2,0));      \
-   Impl::loadLinkElement(U_01,ref()(0,1));      \
-   Impl::loadLinkElement(U_11,ref()(1,1));      \
-   Impl::loadLinkElement(U_21,ref()(2,1));      \
-   Impl::loadLinkElement(U_02,ref()(0,2));     \
-   Impl::loadLinkElement(U_12,ref()(1,2));     \
-   Impl::loadLinkElement(U_22,ref()(2,2));     \
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    U_02=coalescedRead(ref()(0,2),lane);				\
+    U_12=coalescedRead(ref()(1,2),lane);				\
+    U_22=coalescedRead(ref()(2,2),lane);				\
    UChi ## _0 += U_00*Chi_0;	       \
    UChi ## _1 += U_10*Chi_0;\
    UChi ## _2 += U_20*Chi_0;\
@@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
    UChi ## _2 += U_22*Chi_2;


-#define PERMUTE_DIR(dir)			\
-  permute##dir(Chi_0,Chi_0);			\
-  permute##dir(Chi_1,Chi_1);			\
-  permute##dir(Chi_2,Chi_2);
-
-
 #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);	\
  offset = SE->_offset;			\
  local  = SE->_is_local;		\
  perm   = SE->_permute;		\
  if ( local ) {						\
-    LOAD_CHI(in);					\
+    LOAD_CHI(Perm,in); /* local neighbour: read from in; Perm selects the permute variant used by the SIMT LOAD_CHI */	\
    if ( perm) {						\
      PERMUTE_DIR(Perm);					\
    }								\
  } else {							\
-    LOAD_CHI(buf);						\
+    LOAD_CHI_COMMS(buf); /* non-local neighbour: read from buf, no permutation applied */	\
  }								

 #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\
@@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
  }


-
 #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\
  SE=st.GetEntry(ptype,Dir+skew,sF);			\
  offset = SE->_offset;					\
  local  = SE->_is_local;				\
  perm   = SE->_permute;				\
  if ( local ) {					\
-    LOAD_CHI(in);				\
+    LOAD_CHI(Perm,in);					\
    if ( perm) {					\
      PERMUTE_DIR(Perm);				\
    }							\
  } else if ( st.same_node[Dir] ) {			\
-    LOAD_CHI(buf);					\
+    LOAD_CHI_COMMS(buf);				\
  }							\
  if (local || st.same_node[Dir] ) {		\
    MULT_ADD(U,Dir,even);				\
@@ -140,10 +158,32 @@ NAMESPACE_BEGIN(Grid);
  local  = SE->_is_local;				\
  if ((!local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    { LOAD_CHI(buf);	  }					\
+    { LOAD_CHI_COMMS(buf);	  }				\
    { MULT_ADD(U,Dir,even); }					\
  }								

+#define HAND_DECLARATIONS(Simd) /* declares every temporary used by the hand-unrolled staggered kernels, all of type Simd */ \
+  Simd even_0; /* even_* and odd_*: the two accumulator sets that are added together to form the result */	\
+  Simd even_1;			\
+  Simd even_2;			\
+  Simd odd_0;			\
+  Simd odd_1;			\
+  Simd odd_2;		        \
+  /* Chi_*: the three components filled by LOAD_CHI / LOAD_CHI_COMMS */	\
+  Simd Chi_0;			\
+  Simd Chi_1;			\
+  Simd Chi_2;			\
+  /* U_rc: the nine link-matrix elements filled by MULT / MULT_ADD */	\
+  Simd U_00;			\
+  Simd U_10;			\
+  Simd U_20;			\
+  Simd U_01;			\
+  Simd U_11;			\
+  Simd U_21;			\
+  Simd U_02;			\
+  Simd U_12;			\
+  Simd U_22;			
+  

 template <class Impl>
 template <int Naik> accelerator_inline
@@ -155,28 +195,14 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset,local,perm, ptype;

  StencilEntry *SE;
@@ -215,7 +241,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }

@@ -230,28 +256,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local, perm;

  StencilEntry *SE;
@@ -261,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);

    skew = 0;
    HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);  
@@ -294,7 +305,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
      result()()(1) = even_1 + odd_1;
      result()()(2) = even_2 + odd_2;
    }
-    vstream(out[sF],result);
+    coalescedWrite(out[sF],result);
  }
 }

@@ -309,28 +320,13 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;

-  Simd even_0; // 12 regs on knc
-  Simd even_1;
-  Simd even_2;
-  Simd odd_0; // 12 regs on knc
-  Simd odd_1;
-  Simd odd_2;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
+  HAND_DECLARATIONS(Simt);

-  Simd Chi_0;    // two spinor; 6 regs
-  Simd Chi_1;
-  Simd Chi_2;
-  
-  Simd U_00;  // two rows of U matrix
-  Simd U_10;
-  Simd U_20;  
-  Simd U_01;
-  Simd U_11;
-  Simd U_21;  // 2 reg left.
-  Simd U_02;
-  Simd U_12;
-  Simd U_22; 
-
-  SiteSpinor result;
+  typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
+  calcSiteSpinor result;
  int offset, ptype, local;

  StencilEntry *SE;
@@ -340,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
  //    int sF=s+LLs*sU;
  {

-    even_0 = Zero();    even_1 = Zero();    even_2 = Zero();
-     odd_0 = Zero();     odd_1 = Zero();     odd_2 = Zero();
+    zeroit(even_0);    zeroit(even_1);    zeroit(even_2);
+    zeroit(odd_0);    zeroit(odd_1);    zeroit(odd_2);
    int nmu=0;
    skew = 0;
    HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);  
@@ -374,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 	result()()(1) = even_1 + odd_1;
 	result()()(2) = even_2 + odd_2;
      }
-      out[sF] = out[sF] + result;
+      coalescedWrite(out[sF] , out(sF)+ result);
    }
  }
 }
@@ -397,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
 						     const FermionFieldView &in, FermionFieldView &out, int dag); \
 */
 #undef LOAD_CHI
+#undef HAND_DECLARATIONS

 NAMESPACE_END(Grid);

@@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid);
 #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);			\
-    } else {							\
-      chi_p = &in[SE->_offset];					\
-    }								\
+    int perm= SE->_permute; /* permute flag is handed to coalescedReadPermute instead of branching on it here */	\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
  } else {							\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane); /* non-local neighbour: read from buf */	\
  }								\
-  multLink(Uchi, U[sU], *chi_p, Dir);			
+  acceleratorSynchronise(); /* NOTE(review): only this macro synchronises before multLink; the _INT and _EXT variants below do not - confirm intended */	\
+  multLink(Uchi, U[sU], chi, Dir);			

 #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if (SE->_is_local ) {						\
-    if (SE->_permute) {						\
-      chi_p = &chi;						\
-      permute(chi,  in[SE->_offset], ptype);			\
-    } else {							\
-      chi_p = &in[SE->_offset];					\
-    }								\
+    int perm= SE->_permute;						\
+    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
  } else if ( st.same_node[Dir] ) {				\
-    chi_p = &buf[SE->_offset];					\
+    chi = coalescedRead(buf[SE->_offset],lane); /* same-node neighbour: read from buf; otherwise chi is left untouched and multLink is skipped */	\
  }								\
  if (SE->_is_local || st.same_node[Dir] ) {			\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }

 #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\
  SE = st.GetEntry(ptype, Dir+skew, sF);			\
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\
    nmu++;							\
-    chi_p = &buf[SE->_offset];					\
-    multLink(Uchi, U[sU], *chi_p, Dir);				\
+    chi = coalescedRead(buf[SE->_offset],lane); /* handles only the legs that are neither local nor same-node; nmu counts them */	\
+    multLink(Uchi, U[sU], chi, Dir);				\
  }

 template <class Impl>
@@ -84,12 +77,14 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
 					     SiteSpinor *buf, int sF, int sU, 
 					     const FermionFieldView &in, FermionFieldView &out, int dag) 
 {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //
@@ -118,7 +113,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
    if ( dag ) { 
      Uchi = - Uchi;
    } 
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };

@@ -130,13 +125,16 @@ template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU, 
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
-  const SiteSpinor *chi_p;
-  SiteSpinor chi;
-  SiteSpinor Uchi;
+						const FermionFieldView &in, FermionFieldView &out,int dag)
+{
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int skew ;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -165,7 +163,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
    if ( dag ) {
      Uchi = - Uchi;
    }
-    vstream(out[sF], Uchi);
+    coalescedWrite(out[sF], Uchi,lane);
  }
 };

@@ -178,14 +176,17 @@ template <int Naik> accelerator_inline
 void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, 
 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
 						SiteSpinor *buf, int sF, int sU,
-						const FermionFieldView &in, FermionFieldView &out,int dag) {
-  const SiteSpinor *chi_p;
-  //  SiteSpinor chi;
-  SiteSpinor Uchi;
+						const FermionFieldView &in, FermionFieldView &out,int dag)
+{
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int nmu=0;
  int skew ;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);

  //  for(int s=0;s<LLs;s++){
  //    int sF=LLs*sU+s;
@@ -211,11 +212,12 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
    GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
    }
-    if ( nmu ) { 
-      if ( dag ) { 
-	out[sF] = out[sF] - Uchi;
+    if ( nmu ) {
+      auto _out = coalescedRead(out[sF],lane);
+      if ( dag ) {
+	coalescedWrite(out[sF], _out-Uchi,lane);
      } else { 
-	out[sF] = out[sF] + Uchi;
+	coalescedWrite(out[sF], _out+Uchi,lane);
      }
    }
  }
@@ -261,6 +263,8 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v , UUU, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
@@ -301,6 +305,8 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
  GridBase *FGrid=in.Grid();  
  GridBase *UGrid=U.Grid();  
  typedef StaggeredKernels<Impl> ThisKernel;
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
  autoView( UUU_v ,   U, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
@@ -397,6 +397,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
  conformable(in.Grid(), _grid);  // verifies full grid
  conformable(in.Grid(), out.Grid());

@@ -408,6 +409,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -420,6 +422,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
  conformable(in.Grid(), _cbgrid);    // verifies half grid
  conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -76,7 +76,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define REGISTER

-#define LOAD_CHIMU \
+#ifdef GRID_SIMT // SIMT build: the permutation is folded into the read, so PERMUTE_DIR below is an empty statement
+#define LOAD_CHIMU(ptype)		\
+  {const SiteSpinor & ref (in[offset]); /* Chimu_sc <- element (s)(c) of in[offset]; needs in, offset, perm and lane at the expansion site */	\
+    Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane);	\
+    Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane);		\
+    Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane);		\
+    Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane);		\
+    Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane);		\
+    Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane);		\
+    Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane);		\
+    Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane);		\
+    Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane);		\
+    Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane);		\
+    Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane);		\
+    Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane);	}
+#define PERMUTE_DIR(dir) ;
+#else
+#define LOAD_CHIMU(ptype)		\
  {const SiteSpinor & ref (in[offset]);	\
    Chimu_00=ref()(0)(0);\
    Chimu_01=ref()(0)(1);\
@@ -91,55 +108,55 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    Chimu_31=ref()(3)(1);\
    Chimu_32=ref()(3)(2);}

-#define LOAD_CHI\
-  {const SiteHalfSpinor &ref(buf[offset]);	\
-    Chi_00 = ref()(0)(0);\
-    Chi_01 = ref()(0)(1);\
-    Chi_02 = ref()(0)(2);\
-    Chi_10 = ref()(1)(0);\
-    Chi_11 = ref()(1)(1);\
-    Chi_12 = ref()(1)(2);}
-
-// To splat or not to splat depends on the implementation
-#define MULT_2SPIN(A)\
-  {auto & ref(U[sU](A));			\
-   Impl::loadLinkElement(U_00,ref()(0,0));	\
-   Impl::loadLinkElement(U_10,ref()(1,0));	\
-   Impl::loadLinkElement(U_20,ref()(2,0));	\
-   Impl::loadLinkElement(U_01,ref()(0,1));	\
-   Impl::loadLinkElement(U_11,ref()(1,1));	\
-   Impl::loadLinkElement(U_21,ref()(2,1));	\
-    UChi_00 = U_00*Chi_00;\
-    UChi_10 = U_00*Chi_10;\
-    UChi_01 = U_10*Chi_00;\
-    UChi_11 = U_10*Chi_10;\
-    UChi_02 = U_20*Chi_00;\
-    UChi_12 = U_20*Chi_10;\
-    UChi_00+= U_01*Chi_01;\
-    UChi_10+= U_01*Chi_11;\
-    UChi_01+= U_11*Chi_01;\
-    UChi_11+= U_11*Chi_11;\
-    UChi_02+= U_21*Chi_01;\
-    UChi_12+= U_21*Chi_11;\
-    Impl::loadLinkElement(U_00,ref()(0,2));	\
-    Impl::loadLinkElement(U_10,ref()(1,2));	\
-    Impl::loadLinkElement(U_20,ref()(2,2));	\
-    UChi_00+= U_00*Chi_02;\
-    UChi_10+= U_00*Chi_12;\
-    UChi_01+= U_10*Chi_02;\
-    UChi_11+= U_10*Chi_12;\
-    UChi_02+= U_20*Chi_02;\
-    UChi_12+= U_20*Chi_12;}
-
-
 #define PERMUTE_DIR(dir)			\
-      permute##dir(Chi_00,Chi_00);\
+  permute##dir(Chi_00,Chi_00);	\
      permute##dir(Chi_01,Chi_01);\
      permute##dir(Chi_02,Chi_02);\
-      permute##dir(Chi_10,Chi_10);\
+      permute##dir(Chi_10,Chi_10);	\
      permute##dir(Chi_11,Chi_11);\
      permute##dir(Chi_12,Chi_12);

+#endif
+
+#define MULT_2SPIN(A) /* UChi_sc = sum over j of U(c,j)*Chi_sj for s = 0,1 and c = 0..2, using the link U[sU](A) */\
+  {auto & ref(U[sU](A));						\
+    U_00=coalescedRead(ref()(0,0),lane);				\
+    U_10=coalescedRead(ref()(1,0),lane);				\
+    U_20=coalescedRead(ref()(2,0),lane);				\
+    U_01=coalescedRead(ref()(0,1),lane);				\
+    U_11=coalescedRead(ref()(1,1),lane);				\
+    U_21=coalescedRead(ref()(2,1),lane);				\
+    UChi_00 = U_00*Chi_00;						\
+    UChi_10 = U_00*Chi_10;						\
+    UChi_01 = U_10*Chi_00;						\
+    UChi_11 = U_10*Chi_10;						\
+    UChi_02 = U_20*Chi_00;						\
+    UChi_12 = U_20*Chi_10;						\
+    UChi_00+= U_01*Chi_01;						\
+    UChi_10+= U_01*Chi_11;						\
+    UChi_01+= U_11*Chi_01;						\
+    UChi_11+= U_11*Chi_11;						\
+    UChi_02+= U_21*Chi_01;						\
+    UChi_12+= U_21*Chi_11;						\
+    U_00=coalescedRead(ref()(0,2),lane); /* U_00/U_10/U_20 are reused to hold column 2 of the link */	\
+    U_10=coalescedRead(ref()(1,2),lane);				\
+    U_20=coalescedRead(ref()(2,2),lane);				\
+    UChi_00+= U_00*Chi_02;						\
+    UChi_10+= U_00*Chi_12;						\
+    UChi_01+= U_10*Chi_02;						\
+    UChi_11+= U_10*Chi_12;						\
+    UChi_02+= U_20*Chi_02;						\
+    UChi_12+= U_20*Chi_12;}
+// LOAD_CHI: fills the six Chi_sc (s = 0,1; c = 0..2) from the half spinor buf[offset], with no permutation.
+#define LOAD_CHI				\
+  {const SiteHalfSpinor &ref(buf[offset]);	\
+    Chi_00 = coalescedRead(ref()(0)(0),lane);	\
+    Chi_01 = coalescedRead(ref()(0)(1),lane);	\
+    Chi_02 = coalescedRead(ref()(0)(2),lane);	\
+    Chi_10 = coalescedRead(ref()(1)(0),lane);	\
+    Chi_11 = coalescedRead(ref()(1)(1),lane);	\
+    Chi_12 = coalescedRead(ref()(1)(2),lane);}
+
 //      hspin(0)=fspin(0)+timesI(fspin(3));
 //      hspin(1)=fspin(1)+timesI(fspin(2));
 #define XP_PROJ \
@@ -353,13 +370,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  result_31-= UChi_11;	\
  result_32-= UChi_12;

-#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) /* "B" variant: looks the entry up through st.GetEntry(), so it needs a stencil view named st */	\
  SE=st.GetEntry(ptype,DIR,ss);			\
  offset = SE->_offset;				\
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
-    LOAD_CHIMU;					\
+    LOAD_CHIMU(PERM);				\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
@@ -367,6 +384,37 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  } else {					\
    LOAD_CHI;					\
  }						\
+  acceleratorSynchronise();			\
+  MULT_2SPIN(DIR);				\
+  RECON;					
+// Default variant: same steps as above, but the entry and permute type come from the raw st_p / st_perm tables instead of st.GetEntry().
+#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\
+  SE=&st_p[DIR+8*ss]; /* entry for direction DIR of site ss; the stride of 8 entries per site is hard-coded */	\
+  ptype=st_perm[DIR]; /* NOTE(review): assigned here but not read again inside this macro - LOAD_CHIMU takes the literal PERM */	\
+  offset = SE->_offset;				\
+  local  = SE->_is_local;			\
+  perm   = SE->_permute;			\
+  if ( local ) {				\
+    LOAD_CHIMU(PERM);				\
+    PROJ;					\
+    if ( perm) {				\
+      PERMUTE_DIR(PERM);			\
+    }						\
+  } else {					\
+    LOAD_CHI;					\
+  }						\
+  acceleratorSynchronise();			\
+  MULT_2SPIN(DIR);				\
+  RECON;					
+// "A" variant: always loads from in - no _is_local test, no LOAD_CHI from buf, no PERMUTE_DIR and no acceleratorSynchronise. NOTE(review): no use of it is visible in this part of the file.
+#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON)				\
+  SE=&st_p[DIR+8*ss];							\
+  ptype=st_perm[DIR];							\
+ /*SE=st.GetEntry(ptype,DIR,ss);*/					\
+  offset = SE->_offset;				\
+  perm   = SE->_permute;			\
+  LOAD_CHIMU(PERM);				\
+  PROJ;						\
  MULT_2SPIN(DIR);				\
  RECON;					

@@ -376,7 +424,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  local  = SE->_is_local;			\
  perm   = SE->_permute;			\
  if ( local ) {				\
-    LOAD_CHIMU;					\
+    LOAD_CHIMU(PERM);				\
    PROJ;					\
    if ( perm) {				\
      PERMUTE_DIR(PERM);			\
@@ -384,10 +432,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  } else if ( st.same_node[DIR] ) {		\
    LOAD_CHI;					\
  }						\
+  acceleratorSynchronise();			\
  if (local || st.same_node[DIR] ) {		\
    MULT_2SPIN(DIR);				\
    RECON;					\
-  }
+  }						\
+  acceleratorSynchronise();			

 #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\
  SE=st.GetEntry(ptype,DIR,ss);			\
@@ -397,44 +447,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    MULT_2SPIN(DIR);				\
    RECON;					\
    nmu++;					\
-  }
+  }						\
+  acceleratorSynchronise();			

 #define HAND_RESULT(ss)				\
  {						\
-    SiteSpinor & ref (out[ss]);		\
-    vstream(ref()(0)(0),result_00);		\
-    vstream(ref()(0)(1),result_01);		\
-    vstream(ref()(0)(2),result_02);		\
-    vstream(ref()(1)(0),result_10);		\
-    vstream(ref()(1)(1),result_11);		\
-    vstream(ref()(1)(2),result_12);		\
-    vstream(ref()(2)(0),result_20);		\
-    vstream(ref()(2)(1),result_21);		\
-    vstream(ref()(2)(2),result_22);		\
-    vstream(ref()(3)(0),result_30);		\
-    vstream(ref()(3)(1),result_31);		\
-    vstream(ref()(3)(2),result_32);		\
+    SiteSpinor & ref (out[ss]); /* overwrite out[ss] with the twelve result_sc accumulators */	\
+    coalescedWrite(ref()(0)(0),result_00,lane);		\
+    coalescedWrite(ref()(0)(1),result_01,lane);		\
+    coalescedWrite(ref()(0)(2),result_02,lane);		\
+    coalescedWrite(ref()(1)(0),result_10,lane);		\
+    coalescedWrite(ref()(1)(1),result_11,lane);		\
+    coalescedWrite(ref()(1)(2),result_12,lane);		\
+    coalescedWrite(ref()(2)(0),result_20,lane);		\
+    coalescedWrite(ref()(2)(1),result_21,lane);		\
+    coalescedWrite(ref()(2)(2),result_22,lane);		\
+    coalescedWrite(ref()(3)(0),result_30,lane);		\
+    coalescedWrite(ref()(3)(1),result_31,lane);		\
+    coalescedWrite(ref()(3)(2),result_32,lane);		\
  }

-#define HAND_RESULT_EXT(ss)			\
-  if (nmu){					\
-    SiteSpinor & ref (out[ss]);		\
-    ref()(0)(0)+=result_00;		\
-    ref()(0)(1)+=result_01;		\
-    ref()(0)(2)+=result_02;		\
-    ref()(1)(0)+=result_10;		\
-    ref()(1)(1)+=result_11;		\
-    ref()(1)(2)+=result_12;		\
-    ref()(2)(0)+=result_20;		\
-    ref()(2)(1)+=result_21;		\
-    ref()(2)(2)+=result_22;		\
-    ref()(3)(0)+=result_30;		\
-    ref()(3)(1)+=result_31;		\
-    ref()(3)(2)+=result_32;		\
+#define HAND_RESULT_EXT(ss)				\
+  { /* NOTE(review): the previous version was guarded by if (nmu); the read-add-write below is now unconditional - confirm intended */	\
+    SiteSpinor & ref (out[ss]); /* out[ss] += result_sc; the read passes no lane argument while the write passes lane */	\
+    coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00,lane);	\
+    coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01,lane);	\
+    coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02,lane);	\
+    coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10,lane);	\
+    coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11,lane);	\
+    coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12,lane);	\
+    coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20,lane);	\
+    coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21,lane);	\
+    coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22,lane);	\
+    coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30,lane);	\
+    coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31,lane);	\
+    coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32,lane);	\
  }

-
-#define HAND_DECLARATIONS(a)			\
+#define HAND_DECLARATIONS(Simd)			\
  Simd result_00;				\
  Simd result_01;				\
  Simd result_02;				\
@@ -466,19 +516,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  Simd U_11;					\
  Simd U_21;

-#define ZERO_RESULT				\
-  result_00=Zero();				\
-  result_01=Zero();				\
-  result_02=Zero();				\
-  result_10=Zero();				\
-  result_11=Zero();				\
-  result_12=Zero();				\
-  result_20=Zero();				\
-  result_21=Zero();				\
-  result_22=Zero();				\
-  result_30=Zero();				\
-  result_31=Zero();				\
-  result_32=Zero();			
+#define ZERO_RESULT /* clears all twelve result_sc accumulators with zeroit() */	\
+  zeroit(result_00);							\
+  zeroit(result_01);							\
+  zeroit(result_02);							\
+  zeroit(result_10);							\
+  zeroit(result_11);							\
+  zeroit(result_12);							\
+  zeroit(result_20);							\
+  zeroit(result_21);							\
+  zeroit(result_22);							\
+  zeroit(result_30);							\
+  zeroit(result_31);							\
+  zeroit(result_32);			

 #define Chimu_00 Chi_00
 #define Chimu_01 Chi_01
@@ -495,15 +545,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+
+#ifdef SYCL_HACK // raw-pointer variant of the hand-unrolled site kernel, compiled only when SYCL_HACK is defined
 template<class Impl> accelerator_inline void 
-WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
-				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
+				      int ss,int sU,const SiteSpinor *in, SiteSpinor *out)
 {
 // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef iSinglet<Simd> vCplx; // NOTE(review): only referenced by the commented-out typedef on the next line
+  //  typedef decltype( coalescedRead( vCplx()()() )) Simt;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; // whatever type coalescedRead yields for a single spinor element

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd); // lane is consumed by the coalescedRead/coalescedWrite calls inside the macros below
+
+  HAND_DECLARATIONS(Simt);
+  // Eight stencil legs: the first overwrites the result (XM_RECON), the other seven accumulate (*_RECON_ACCUM).
+  int offset,local,perm, ptype;
+  StencilEntry *SE;
+  HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
+  HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
+  HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
+  HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
+  HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
+  HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
+  HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
+  HAND_RESULT(ss); // store the accumulated result_* into out[ss]
+}
+#endif
+
+template<class Impl> accelerator_inline void 
+WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
+				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
+{
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
+// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
+  typedef typename Simd::scalar_type S;
+  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  int offset,local,perm, ptype;
  StencilEntry *SE;
@@ -523,10 +611,16 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
  int offset,local,perm, ptype;
@@ -546,11 +640,17 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  int offset,local,perm, ptype;
  StencilEntry *SE;
@@ -570,10 +670,16 @@ template<class Impl> accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
  int offset,local,perm, ptype;
@@ -593,11 +699,17 @@ template<class Impl>  accelerator_inline void
 WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf,
 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
 // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  int offset, ptype;
  StencilEntry *SE;
@@ -618,10 +730,16 @@ template<class Impl>  accelerator_inline
 void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
 {
+  auto st_p = st._entries_p;						
+  auto st_perm = st._permute_type;					
  typedef typename Simd::scalar_type S;
  typedef typename Simd::vector_type V;
+  typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;

-  HAND_DECLARATIONS(ignore);
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  HAND_DECLARATIONS(Simt);

  StencilEntry *SE;
  int offset, ptype;
@@ -682,3 +800,4 @@ NAMESPACE_END(Grid);
 #undef HAND_RESULT
 #undef HAND_RESULT_INT
 #undef HAND_RESULT_EXT
+#undef HAND_DECLARATIONS
@@ -416,7 +416,21 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }

-#define KERNEL_CALLNB(A) \
+#define KERNEL_CALL_TMP(A) /* expands to several statements that declare NN, U_p, in_p, out_p, st_p and st_perm at the expansion site; needs Nsite, Ls, U_v, in_v, out_v, st_v and buf to be in scope there */ \
+  const uint64_t    NN = Nsite*Ls;					\
+  auto U_p = & U_v[0];							\
+  auto in_p = & in_v[0];						\
+  auto out_p = & out_v[0];						\
+  auto st_p = st_v._entries_p;						\
+  auto st_perm = st_v._permute_type;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), { /* kernel A receives the raw pointers and stencil members taken above rather than the view objects */				\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p);	\
+    });									\
+  accelerator_barrier(); // the launch above is the NB variant; the macro itself ends with an explicit barrier
+
+#define KERNEL_CALLNB(A)						\
  const uint64_t    NN = Nsite*Ls;					\
  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
      int sF = ss;							\
@@ -445,20 +459,24 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField

   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifndef GRID_CUDA
+#ifdef SYCL_HACK     
+     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl);    return; }
+#else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
+#endif     
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);    return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);    return;}
 #endif
   }
@@ -476,20 +494,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField

   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
-#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
+#ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
   }
@@ -96,7 +96,7 @@ public:
  ///////////////////////////////////////////////////////////
  // Move these to another class
  // HMC auxiliary functions
-  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) 
+  static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG) 
  {
    // Zbigniew Srocinsky thesis:
    //
@@ -49,7 +49,7 @@ public:

  virtual std::string action_name(){return "PlaqPlusRectangleAction";}
      
-  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
      
  virtual std::string LogParameters(){
    std::stringstream sstream;
@@ -54,8 +54,7 @@ public:
    return sstream.str();
  }

-  virtual void refresh(const GaugeField &U,
-                       GridParallelRNG &pRNG){};  // noop as no pseudoferms
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){};  // noop as no pseudoferms

  virtual RealD S(const GaugeField &U) {
    RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
@@ -124,7 +124,7 @@ NAMESPACE_BEGIN(Grid);
      //
      // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
      //
-      virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
+      virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
      {
        Lop.ImportGauge(U);
        Rop.ImportGauge(U);
@@ -1,4 +1,3 @@
-
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid
@@ -43,8 +42,7 @@ NAMESPACE_BEGIN(Grid);
 //

 template <class Impl>
-class OneFlavourEvenOddRationalPseudoFermionAction
-  : public Action<typename Impl::GaugeField> {
+class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
  INHERIT_IMPL_TYPES(Impl);

@@ -103,7 +101,7 @@ public:
    return sstream.str();
  }
  
-  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
    //        = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
    // Phi = MpcdagMpc^{1/4} eta
@@ -156,7 +154,10 @@ public:

    msCG(Mpc, PhiOdd, Y);

-    if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+    auto grid = FermOp.FermionGrid();
+    auto r=rand();
+    grid->Broadcast(0,r);
+    if ( (r%param.BoundsCheckFreq)==0 ) { 
      FermionField gauss(FermOp.FermionRedBlackGrid());
      gauss = PhiOdd;
      HighBoundCheck(Mpc,gauss,param.hi);
@@ -101,7 +101,7 @@ NAMESPACE_BEGIN(Grid);
      }
      
      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
@@ -170,7 +170,10 @@ NAMESPACE_BEGIN(Grid);
 	msCG_M(MdagM,X,Y);

 	// Randomly apply rational bounds checks.
-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	auto grid = NumOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r);
+        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(NumOp.FermionRedBlackGrid());
 	  gauss = PhiOdd;
 	  HighBoundCheck(MdagM,gauss,param.hi);
@@ -98,7 +98,7 @@ NAMESPACE_BEGIN(Grid);


      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

 	
 	// P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
@@ -142,7 +142,10 @@ NAMESPACE_BEGIN(Grid);

 	msCG(MdagMOp,Phi,Y);

-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+	auto grid = FermOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r);
+        if ( (r%param.BoundsCheckFreq)==0 ) { 
 	  FermionField gauss(FermOp.FermionGrid());
 	  gauss = Phi;
 	  HighBoundCheck(MdagMOp,gauss,param.hi);
@@ -95,7 +95,7 @@ NAMESPACE_BEGIN(Grid);
      }
      

-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

 	// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi       
 	//
@@ -156,7 +156,10 @@ NAMESPACE_BEGIN(Grid);
 	msCG_M(MdagM,X,Y);

 	// Randomly apply rational bounds checks.
-	if ( (rand()%param.BoundsCheckFreq)==0 ) { 
+        auto grid = NumOp.FermionGrid();
+        auto r=rand();
+        grid->Broadcast(0,r);
+        if ( (r%param.BoundsCheckFreq)==0 ) { 	
 	  FermionField gauss(NumOp.FermionGrid());
 	  gauss = Phi;
 	  HighBoundCheck(MdagM,gauss,param.hi);
@@ -73,7 +73,7 @@ public:
  //////////////////////////////////////////////////////////////////////////////////////
  // Push the gauge field in to the dops. Assume any BC's and smearing already applied
  //////////////////////////////////////////////////////////////////////////////////////
-  virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
    // P(phi) = e^{- phi^dag (MdagM)^-1 phi}
    // Phi = Mdag eta
    // P(eta) = e^{- eta^dag eta}
@@ -77,7 +77,7 @@ public:
  //////////////////////////////////////////////////////////////////////////////////////
  // Push the gauge field in to the dops. Assume any BC's and smearing already applied
  //////////////////////////////////////////////////////////////////////////////////////
-  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
    
    // P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
    // Phi = McpDag eta 
@@ -84,7 +84,7 @@ NAMESPACE_BEGIN(Grid);
      } 

      
-      virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+      virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

        // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
        //
@@ -64,7 +64,7 @@ public:
    return sstream.str();
  }  
      
-  virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
+  virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {

    // P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
    //
@@ -55,7 +55,7 @@ public:
  }
  virtual std::string action_name() {return "ScalarAction";}

-  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms
+  virtual void refresh(const Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {}  // noop as no pseudoferms

  virtual RealD S(const Field &p) {
    return (mass_square * 0.5 + Nd) * ScalarObs<Impl>::sumphisquared(p) +
@@ -27,7 +27,7 @@ public:
  typedef Field              FermionField;
  typedef Field              PropagatorField;
    
-  static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
+  static inline void generate_momenta(Field& P, GridSerialRNG &sRNG, GridParallelRNG& pRNG){
    RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
    gaussian(pRNG, P);
    P *= scale; 
@@ -151,7 +151,7 @@ public:
      out = one / out;
    }

-    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
+    static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG)
    {
      RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
 #ifndef USE_FFT_ACCELERATION
@@ -77,7 +77,7 @@ public:

  virtual std::string action_name() { return "ScalarAction"; }

-  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+  virtual void refresh(const Field &U, GridSerialRNG & sRNG, GridParallelRNG &pRNG) {}

  virtual RealD S(const Field &p)
  {
@@ -139,7 +139,7 @@ private:
  // Evolution
  /////////////////////////////////////////////////////////
  RealD evolve_hmc_step(Field &U) {
-    TheIntegrator.refresh(U, pRNG);  // set U and initialize P and phi's
+    TheIntegrator.refresh(U, sRNG, pRNG);  // set U and initialize P and phi's

    RealD H0 = TheIntegrator.S(U);  // initial state action

@@ -33,6 +33,7 @@ directory
 #define INTEGRATOR_INCLUDED

 #include <memory>
+#include "MomentumFilter.h"

 NAMESPACE_BEGIN(Grid);

@@ -78,8 +79,19 @@ protected:
  RepresentationPolicy Representations;
  IntegratorParameters Params;

+  //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC
+  //It is applied whenever the momentum is updated / refreshed
+  //The default filter does nothing
+  MomentumFilterBase<MomentaField> const* MomFilter;
+
  const ActionSet<Field, RepresentationPolicy> as;

+  //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
+  // The fallback object is a function-local static, so it is constructed on the first call
+  // and every later call returns the address of that same object.
+  static const MomentumFilterBase<MomentaField>* getDefaultMomFilter()
+  {
+    static MomentumFilterNone<MomentaField> noop_filter;
+    const MomentumFilterBase<MomentaField>* fallback = &noop_filter;
+    return fallback;
+  }
+
  void update_P(Field& U, int level, double ep) 
  {
    t_P[level] += ep;
@@ -135,6 +147,8 @@ protected:

    // Force from the other representations
    as[level].apply(update_P_hireps, Representations, Mom, U, ep);
+
+    MomFilter->applyFilter(Mom);
  }

  void update_U(Field& U, double ep) 
@@ -174,11 +188,23 @@ public:
    t_P.resize(levels, 0.0);
    t_U = 0.0;
    // initialization of smearer delegated outside of Integrator
+
+    //Default the momentum filter to "do-nothing"
+    MomFilter = getDefaultMomFilter();
  };

  virtual ~Integrator() {}

  virtual std::string integrator_name() = 0;
+  
+  //Set the momentum filter allowing for manipulation of the conjugate momentum
+  // Install 'filter' as the active momentum filter.
+  // Only the address of the argument is stored, so the referenced object must stay alive
+  // for as long as it remains installed; it is not copied.
+  void setMomentumFilter(const MomentumFilterBase<MomentaField> &filter)
+  {
+    this->MomFilter = &filter;
+  }
+
+  //Access the conjugate momentum
+  const MomentaField & getMomentum() const{ return P; }
+  

  void print_parameters()
  {
@@ -210,10 +236,9 @@ public:
  // over the representations
  struct _refresh {
    template <class FieldType, class Repr>
-    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep,
-                    GridParallelRNG& pRNG) {
+    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, GridSerialRNG & sRNG, GridParallelRNG& pRNG) {
      for (int a = 0; a < repr_set.size(); ++a){
-        repr_set.at(a)->refresh(Rep.U, pRNG);
+        repr_set.at(a)->refresh(Rep.U, sRNG, pRNG);
      
 	std::cout << GridLogDebug << "Hirep refreshing pseudofermions" << std::endl;
      }
@@ -221,12 +246,12 @@ public:
  } refresh_hireps{};

  // Initialization of momenta and actions
-  void refresh(Field& U, GridParallelRNG& pRNG) 
+  void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG) 
  {
    assert(P.Grid() == U.Grid());
    std::cout << GridLogIntegrator << "Integrator refresh\n";

-    FieldImplementation::generate_momenta(P, pRNG);
+    FieldImplementation::generate_momenta(P, sRNG, pRNG);

    // Update the smeared fields, can be implemented as observer
    // necessary to keep the fields updated even after a reject
@@ -243,12 +268,14 @@ public:
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
-        as[level].actions.at(actionID)->refresh(Us, pRNG);
+        as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
      }

      // Refresh the higher representation actions
-      as[level].apply(refresh_hireps, Representations, pRNG);
+      as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
    }
+
+    MomFilter->applyFilter(P);
  }

  // to be used by the actionlevel class to iterate
@@ -0,0 +1,94 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h
+
+Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+//--------------------------------------------------------------------
+#ifndef MOMENTUM_FILTER
+#define MOMENTUM_FILTER
+
+NAMESPACE_BEGIN(Grid);
+
+//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh
+
+//Abstract interface shared by all momentum filters.
+//applyFilter is pure virtual: the base supplies no implementation, and a virtual function that is
+//only declared (neither defined nor pure) leaves the vtable pointing at an undefined symbol.
+template<typename MomentaField>
+struct MomentumFilterBase{
+  //Polymorphic base: give it a virtual destructor so derived filters are destroyed correctly through a base pointer
+  virtual ~MomentumFilterBase() = default;
+
+  //Modify the conjugate momentum P in place
+  virtual void applyFilter(MomentaField &P) const = 0;
+};
+
+//Do nothing
+template<class MomentaField>
+struct MomentumFilterNone: public MomentumFilterBase<MomentaField>
+{
+  //Intentionally empty: the momentum passed in is left exactly as it is
+  void applyFilter(MomentaField &) const override
+  {
+  }
+};
+
+//Multiply each site/direction by a Lorentz vector complex number field
+//Can be used to implement a mask, zeroing out sites
+template<typename MomentaField>
+struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
+  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
+  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
+  typedef iVector<iScalar<iScalar<vector_type> >, Nd > LorentzScalarType; //complex phase for each site/direction
+  typedef Lattice<LorentzScalarType> LatticeLorentzScalarType;
+
+  //The per-site, per-direction factors applied by applyFilter
+  LatticeLorentzScalarType phase;
+
+  //Store a copy of a user-supplied phase field.
+  //The argument is bound by const reference so the only copy made is the one into 'phase'
+  //(taking it by value copied the whole field twice).
+  MomentumFilterApplyPhase(const LatticeLorentzScalarType &_phase): phase(_phase){}
+
+  //Default to uniform field of (1,0)
+  MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){
+    LorentzScalarType one;
+    for(int mu=0;mu<Nd;mu++)
+      one(mu)()() = scalar_type(1.);
+
+    phase = one;
+  }
+
+  //Multiply every direction mu of P at every site by the matching entry of 'phase', in place.
+  //P and phase are checked with conformable() first.
+  void applyFilter(MomentaField &P) const override{
+    conformable(P,phase);
+    autoView( P_v , P, AcceleratorWrite);
+    autoView( phase_v , phase, AcceleratorRead);
+
+    accelerator_for(ss,P_v.size(),MomentaField::vector_type::Nsimd(),{
+        auto site_mom = P_v(ss);
+        auto site_phase = phase_v(ss);
+        for(int mu=0;mu<Nd;mu++)
+          site_mom(mu) = site_mom(mu) * site_phase(mu);
+        coalescedWrite(P_v[ss], site_mom);
+      });
+
+  }
+
+
+};
+
+
+
+
+NAMESPACE_END(Grid);
+
+#endif
@@ -85,21 +85,18 @@ public:

    std::cout << GridLogDebug << "Stout smearing started\n";

-    // Smear the configurations
+    // C contains the staples multiplied by some rho
+    u_smr = U ; // set the smeared field to the current gauge field
    SmearBase->smear(C, U);

    for (int mu = 0; mu < Nd; mu++) {
-      if( mu == OrthogDim )
-        tmp = 1.0;  // Don't smear in the orthogonal direction
-      else {
-        tmp = peekLorentz(C, mu);
-        Umu = peekLorentz(U, mu);
-        iq_mu = Ta(
-                   tmp *
-                   adj(Umu));  // iq_mu = Ta(Omega_mu) to match the signs with the paper
-        exponentiate_iQ(tmp, iq_mu);
-      }
-      pokeLorentz(u_smr, tmp * Umu, mu);  // u_smr = exp(iQ_mu)*U_mu
+      if( mu == OrthogDim ) continue ;
+      // u_smr = exp(iQ_mu)*U_mu apart from Orthogdim
+      Umu = peekLorentz(U, mu);
+      tmp = peekLorentz(C, mu);
+      iq_mu = Ta( tmp * adj(Umu));  
+      exponentiate_iQ(tmp, iq_mu);
+      pokeLorentz(u_smr, tmp * Umu, mu);
    }
    std::cout << GridLogDebug << "Stout smearing completed\n";
  };
@@ -93,13 +93,13 @@ public:
  GeneralisedMomenta(GridBase* grid, Metric<MomentaField>& M): M(M), Mom(grid), AuxMom(grid), AuxField(grid){}

  // Correct
-  void MomentaDistribution(GridParallelRNG& pRNG){
+  void MomentaDistribution(GridSerialRNG & sRNG, GridParallelRNG& pRNG){
    // Generate a distribution for
    // P^dag G P
    // where G = M^-1

    // Generate gaussian momenta
-    Implementation::generate_momenta(Mom, pRNG);
+    Implementation::generate_momenta(Mom, sRNG, pRNG);
    // Modify the distribution with the metric
    M.MSquareRoot(Mom);

@@ -107,8 +107,8 @@ public:
      // Auxiliary momenta
      // do nothing if trivial, so hide in the metric
      MomentaField AuxMomTemp(Mom.Grid());
-      Implementation::generate_momenta(AuxMom, pRNG);
-      Implementation::generate_momenta(AuxField, pRNG);
+      Implementation::generate_momenta(AuxMom, sRNG, pRNG);
+      Implementation::generate_momenta(AuxField, sRNG, pRNG);
      // Modify the distribution with the metric
      // Aux^dag M Aux
      M.MInvSquareRoot(AuxMom);  // AuxMom = M^{-1/2} AuxMomTemp
@@ -60,11 +60,26 @@ template<class pair>
 class GpuComplex {
 public:
  pair z;
-  typedef decltype(z.x) real;
+  typedef decltype(z.x) Real;
 public: 
  accelerator_inline GpuComplex() = default;
-  accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; };
+  accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; };
  accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;};
+  accelerator_inline Real real(void) const { return z.x; };
+  accelerator_inline Real imag(void) const { return z.y; };
+  accelerator_inline GpuComplex &operator=(const Zero &zz) { z.x = 0; z.y=0; return *this; };
+  // In-place multiply, written in terms of the binary operator* followed by assignment
+  accelerator_inline GpuComplex &operator*=(const GpuComplex &r)
+  {
+    GpuComplex &self = *this;
+    self = self * r;
+    return self;
+  }
+  // In-place add, written in terms of the binary operator+ followed by assignment
+  accelerator_inline GpuComplex &operator+=(const GpuComplex &r)
+  {
+    GpuComplex &self = *this;
+    self = self + r;
+    return self;
+  }
+  // In-place subtract, written in terms of the binary operator- followed by assignment
+  accelerator_inline GpuComplex &operator-=(const GpuComplex &r)
+  {
+    GpuComplex &self = *this;
+    self = self - r;
+    return self;
+  }
  friend accelerator_inline  GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) { 
    GpuComplex r ; 
    r.z.x = lhs.z.x + rhs.z.x; 
@@ -157,6 +172,11 @@ typedef GpuVector<NSIMD_RealD,    double      > GpuVectorRD;
 typedef GpuVector<NSIMD_ComplexD, GpuComplexD > GpuVectorCD;
 typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI;

+// Multiply by +i / -i by rearranging components instead of a complex product:
+//   i*(a+ib) = -b + ia      -i*(a+ib) = b - ia
+accelerator_inline GpuComplexF timesI(const GpuComplexF &r)
+{
+  return GpuComplexF(-r.imag(), r.real());
+}
+accelerator_inline GpuComplexD timesI(const GpuComplexD &r)
+{
+  return GpuComplexD(-r.imag(), r.real());
+}
+accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r)
+{
+  return GpuComplexF(r.imag(), -r.real());
+}
+accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r)
+{
+  return GpuComplexD(r.imag(), -r.real());
+}
+
 accelerator_inline float half2float(half h)
 {
  float f;
@@ -208,8 +208,8 @@ struct RealPart<complex<T> > {
 //////////////////////////////////////
 // type alias used to simplify the syntax of std::enable_if
 template <typename T> using Invoke = typename T::type;
-template <typename Condition, typename ReturnType> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
-template <typename Condition, typename ReturnType> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;
+template <typename Condition, typename ReturnType = void> using EnableIf    = Invoke<std::enable_if<Condition::value, ReturnType> >;
+template <typename Condition, typename ReturnType = void> using NotEnableIf = Invoke<std::enable_if<!Condition::value, ReturnType> >;

 ////////////////////////////////////////////////////////
 // Check for complexity with type traits
@@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri
 accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
  
 //conjugate already supported for complex
-accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
-accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
-accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
-accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
+// Multiply by +i / -i by rearranging components instead of a complex product:
+//   i*(a+ib) = -b + ia      -i*(a+ib) = b - ia
+accelerator_inline ComplexF timesI(const ComplexF &r)
+{
+  return ComplexF(-r.imag(), r.real());
+}
+accelerator_inline ComplexD timesI(const ComplexD &r)
+{
+  return ComplexD(-r.imag(), r.real());
+}
+accelerator_inline ComplexF timesMinusI(const ComplexF &r)
+{
+  return ComplexF(r.imag(), -r.real());
+}
+accelerator_inline ComplexD timesMinusI(const ComplexD &r)
+{
+  return ComplexD(r.imag(), -r.real());
+}
+//accelerator_inline ComplexF timesI(const ComplexF &r)     { return(r*ComplexF(0.0,1.0));}
+//accelerator_inline ComplexD timesI(const ComplexD &r)     { return(r*ComplexD(0.0,1.0));}
+//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
+//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}

 // define projections to real and imaginay parts
 accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));}
@@ -7,20 +7,20 @@ template<class vobj>
 class SimpleCompressor {
 public:
   void Point(int) {};
-  accelerator_inline int  CommDatumSize(void) { return sizeof(vobj); }
-  accelerator_inline bool DecompressionStep(void) { return false; }
-  template<class cobj> accelerator_inline void Compress(cobj *buf,int o,const cobj &in) { buf[o]=in; }
-  accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){
+  accelerator_inline int  CommDatumSize(void) const { return sizeof(vobj); } // size in bytes of one vobj
+  accelerator_inline bool DecompressionStep(void) const { return false; } // always reports false
+  template<class cobj> accelerator_inline void Compress(cobj *buf,int o,const cobj &in) const { buf[o]=in; } // plain copy of 'in' into slot o of buf
+  accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const { // forwards to exchange() on mp[2*o], mp[2*o+1], vp0[o], vp1[o]
     exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
   }
-  accelerator_inline void Decompress(vobj *out,vobj *in, int o){ assert(0); }
+  accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); } // asserts unconditionally; DecompressionStep() above returns false
   accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in,
-			       int j,int k, int m,int type){
+			       int j,int k, int m,int type) const { // forwards to exchange() on out0[j], out1[j], in[k], in[m]
     exchange(out0[j],out1[j],in[k],in[m],type);
   }
   // For cshift. Cshift should drop compressor coupling altogether 
   // because I had to decouple the code from the Stencil anyway
-  accelerator_inline vobj operator() (const vobj &arg) {
+  accelerator_inline vobj operator() (const vobj &arg) const { // identity: returns its argument unchanged
     return arg;
   }
 };
@@ -147,16 +147,16 @@ class CartesianStencilAccelerator {
  cobj* u_recv_buf_p;
  cobj* u_send_buf_p;

-  accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; }
+  accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }

-  accelerator_inline int GetNodeLocal(int osite,int point) {
+  accelerator_inline int GetNodeLocal(int osite,int point) const {
    return this->_entries_p[point+this->_npoints*osite]._is_local;
  }
-  accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) {
+  accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
    ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite];
  }

-  accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
+  accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
    local = this->_entries_p[ent]._is_local;
    perm  = this->_entries_p[ent]._permute;
@@ -168,14 +168,14 @@ class CartesianStencilAccelerator {
    }
  }

-  accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) {
+  accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) const {
    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
    int local = this->_entries_p[ent]._is_local;
    if (local) return  base + this->_entries_p[ent]._byte_offset;
    else       return cbase + this->_entries_p[ent]._byte_offset;
  }

-  accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane)
+  accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) const
  {
    Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
  }
@@ -221,7 +221,7 @@ public:
  typedef typename cobj::vector_type vector_type;
  typedef typename cobj::scalar_type scalar_type;
  typedef typename cobj::scalar_object scalar_object;
-  typedef CartesianStencilView<vobj,cobj,Parameters> View_type;
+  typedef const CartesianStencilView<vobj,cobj,Parameters> View_type;
  typedef typename View_type::StencilVector StencilVector;
  ///////////////////////////////////////////
  // Helper structs
@@ -64,6 +64,71 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
 }
 #else

+
+//#ifndef GRID_SYCL
+#if 1
+// Use the scalar as our own complex on GPU ... thrust::complex or std::complex
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{ // Returns element `lane` of vec, with vec addressed as an array of vsimd::scalar_type.
+  typedef typename vsimd::scalar_type S;
+  const S * __restrict__ p=(const S *)&vec; // read-only view: keep the const of the argument
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::scalar_type
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::scalar_type S;
+  // Read-only scalar view of vec; when doperm is non-zero the lane index is XORed with Nsimd >> (ptype+1).
+  const S * __restrict__ p=(const S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+		    const typename vsimd::scalar_type & __restrict__ extracted,
+		    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{ // Stores `extracted` into element `lane` of vec, with vec addressed as vsimd::scalar_type elements.
+  using scalar_t = typename vsimd::scalar_type;
+  scalar_t * __restrict__ lanes = reinterpret_cast<scalar_t *>(&vec);
+  lanes[lane] = extracted;
+}
+#else
+// For SyCL have option to use GpuComplex from inside the vector type in SIMT loops
+// Faster for some reason
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{ // Returns element `lane` of vec, with vec addressed as an array of vsimd::vector_type::datum.
+  typedef typename vsimd::vector_type::datum S;
+  const S * __restrict__ p=(const S *)&vec; // read-only view: keep the const of the argument
+  return p[lane];
+}
+template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+typename vsimd::vector_type::datum
+coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{
+  typedef typename vsimd::vector_type::datum S;
+  // Read-only datum view of vec; when doperm is non-zero the lane index is XORed with Nsimd >> (ptype+1).
+  const S * __restrict__ p=(const S *)&vec;
+  int mask = vsimd::Nsimd() >> (ptype + 1);
+  int plane= doperm ? lane ^ mask : lane;
+  return p[plane];
+}
+template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
+void coalescedWrite(vsimd & __restrict__ vec,
+		    const typename vsimd::vector_type::datum & __restrict__ extracted,
+		    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
+{ // Stores `extracted` into element `lane` of vec, with vec addressed as vsimd::vector_type::datum elements.
+  using datum_t = typename vsimd::vector_type::datum;
+  datum_t * __restrict__ lanes = reinterpret_cast<datum_t *>(&vec);
+  lanes[lane] = extracted;
+}
+#endif
+
 //////////////////////////////////////////
 // Extract and insert slices on the GPU
 //////////////////////////////////////////
@@ -28,7 +28,7 @@ Author: neo <cossu@post.kek.jp>
 #ifndef GRID_MATH_EXP_H
 #define GRID_MATH_EXP_H

-#define DEFAULT_MAT_EXP 12
+#define DEFAULT_MAT_EXP 20

 NAMESPACE_BEGIN(Grid);

@@ -34,6 +34,16 @@ NAMESPACE_BEGIN(Grid);
 // outerProduct Scalar x Scalar -> Scalar
 //              Vector x Vector -> Matrix
 ///////////////////////////////////////////////////////////////////////////////////////
+template<class CC,IfComplex<CC> = 0>
+accelerator_inline CC outerProduct(const CC &l, const CC& r)
+{ // Complex scalar case: l multiplied by the conjugate of r.
+  return l*conj(r);
+} // NOTE(review): presumably defined ahead of the tensor overloads so their decltype(outerProduct(...)) return types can find it - confirm
+template<class RR,IfReal<RR> = 0>
+accelerator_inline RR outerProduct(const RR &l, const RR& r)
+{ // Real scalar case: plain product, no conjugation involved.
+  return l*r;
+}

 template<class l,class r,int N> accelerator_inline
 auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
@@ -57,17 +67,6 @@ auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<declt
  return ret;
 }

-template<class CC,IfComplex<CC> = 0>
-accelerator_inline CC outerProduct(const CC &l, const CC& r)
-{
-  return l*conj(r);
-}
-template<class RR,IfReal<RR> = 0>
-accelerator_inline RR outerProduct(const RR &l, const RR& r)
-{
-  return l*r;
-}
-
 NAMESPACE_END(Grid);

 #endif
@@ -1,6 +1,7 @@
 #include <Grid/GridCore.h>

 NAMESPACE_BEGIN(Grid);
+int      acceleratorAbortOnGpuError=1;
 uint32_t accelerator_threads=2;
 uint32_t acceleratorThreads(void)       {return accelerator_threads;};
 void     acceleratorThreads(uint32_t t) {accelerator_threads = t;};
@@ -52,7 +53,6 @@ void acceleratorInit(void)
    prop = gpu_props[i];
    totalDeviceMem = prop.totalGlobalMem;
    if ( world_rank == 0) {
-#ifndef GRID_DEFAULT_GPU
      if ( i==rank ) {
 	printf("AcceleratorCudaInit[%d]: ========================\n",rank);
 	printf("AcceleratorCudaInit[%d]: Device Number    : %d\n", rank,i);
@@ -66,8 +66,8 @@ void acceleratorInit(void)
 	GPU_PROP(warpSize);
 	GPU_PROP(pciBusID);
 	GPU_PROP(pciDeviceID);
+ 	printf("AcceleratorCudaInit[%d]: maxGridSize (%d,%d,%d)\n",rank,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]);
      }
-#endif
      //      GPU_PROP(unifiedAddressing);
      //      GPU_PROP(l2CacheSize);
      //      GPU_PROP(singleToDoublePrecisionPerfRatio);
@@ -100,9 +100,11 @@ void     acceleratorInit(void);
 #define accelerator        __host__ __device__
 #define accelerator_inline __host__ __device__ inline

+extern int acceleratorAbortOnGpuError;
+
 accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #ifdef GRID_SIMT
-  return threadIdx.z; 
+  return threadIdx.x; 
 #else
  return 0;
 #endif
@@ -110,36 +112,77 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
   {									\
+    int nt=acceleratorThreads(); /* read once: sizes both cu_threads and cu_blocks */ \
     typedef uint64_t Iterator;						\
     auto lambda = [=] accelerator					\
       (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
       __VA_ARGS__;							\
     };									\
-    int nt=acceleratorThreads();					\
-    dim3 cu_threads(acceleratorThreads(),1,nsimd);			\
+    dim3 cu_threads(nsimd,nt,1);					\
     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
     LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda);	\
   }

+#define accelerator_for6dNB(iter1, num1,				\
+                            iter2, num2,				\
+                            iter3, num3,				\
+                            iter4, num4,				\
+                            iter5, num5,				\
+			    iter6, num6, ... )				\
+  { /* launches Lambda6Apply with grid dims (num1,num2,num3) and block dims (num4,num5,num6) */ \
+    typedef uint64_t Iterator;						\
+    auto lambda = [=] accelerator					\
+      (Iterator iter1,Iterator iter2,					\
+       Iterator iter3,Iterator iter4,					\
+       Iterator iter5,Iterator iter6) mutable {				\
+      __VA_ARGS__;							\
+    };									\
+    dim3 cu_blocks (num1,num2,num3);					\
+    dim3 cu_threads(num4,num5,num6);					\
+    Lambda6Apply<<<cu_blocks,cu_threads>>>(num1,num2,num3,num4,num5,num6,lambda); \
+  }
+
 template<typename lambda>  __global__
 void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
 {
-  uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
-  uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
-  uint64_t z = threadIdx.z;
+  // Weird permute is to make lane coalesce for large blocks
+  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; // first index, bounded by num1
+  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; // second index, bounded by num2
+  uint64_t z = threadIdx.x;                         // third index, bounded by num3; passed to Lambda as its last argument
   if ( (x < num1) && (y<num2) && (z<num3) ) {
     Lambda(x,y,z);
   }
 }

+template<typename lambda>  __global__
+void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
+		  uint64_t num4, uint64_t num5, uint64_t num6,
+		  lambda Lambda)
+{
+  // Six-index kernel body: the three block indices supply the first three
+  // iterators and the three thread indices supply the last three.
+  uint64_t outer1 = blockIdx.x,  outer2 = blockIdx.y,  outer3 = blockIdx.z;
+  uint64_t inner1 = threadIdx.x, inner2 = threadIdx.y, inner3 = threadIdx.z;
+
+  // Leave without calling Lambda when any index is at or beyond its extent.
+  bool outside = (outer1 >= num1) || (outer2 >= num2) || (outer3 >= num3)
+              || (inner1 >= num4) || (inner2 >= num5) || (inner3 >= num6);
+  if ( outside ) {
+    return;
+  }
+  Lambda(outer1,outer2,outer3,inner1,inner2,inner3);
+}
+
 #define accelerator_barrier(dummy)					\
   {									\
     cudaDeviceSynchronize();						\
     cudaError err = cudaGetLastError();					\
     if ( cudaSuccess != err ) {						\
-      printf("Cuda error %s \n", cudaGetErrorString( err ));		\
-      puts(__FILE__);							\
-      printf("Line %d\n",__LINE__);					\
+      printf("accelerator_barrier(): Cuda error %s \n",			\
+	     cudaGetErrorString( err ));				\
+      printf("File %s Line %d\n",__FILE__,__LINE__);			\
+      fflush(stdout);							\
+      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); /* fires only when the flag is non-zero; assert is a no-op under NDEBUG */ \
     }									\
   }

@@ -218,7 +261,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
      cgh.parallel_for<class dslash>(					\
      cl::sycl::nd_range<3>(global,local), \
-      [=] (cl::sycl::nd_item<3> item) mutable {       \
+      [=] (cl::sycl::nd_item<3> item) /*mutable*/ {   \
      auto iter1    = item.get_global_id(0);	      \
      auto iter2    = item.get_global_id(1);	      \
      auto lane     = item.get_global_id(2);	      \
@@ -414,7 +457,7 @@ accelerator_inline void acceleratorSynchronise(void)
  __syncwarp();
 #endif
 #ifdef GRID_SYCL
-  // No barrier call on SYCL??  // Option get __spir:: stuff to do warp barrier
+  //cl::sycl::detail::workGroupBarrier();
 #endif
 #ifdef GRID_HIP
  __syncthreads();
@@ -1,5 +1,16 @@
 #pragma once 

+#if defined(__NVCC__)
+// Refuse to build with NVCC 11.0 or 11.1; the #error text below gives the reason.
+#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 0)
+#error "NVCC version 11.0 breaks on Ampere, see Github issue 346"
+#endif
+#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 1)
+#error "NVCC version 11.1 breaks on Ampere, see Github issue 346"
+#endif
+
+#endif // defined(__NVCC__)
+
 #if defined(__clang__)

  #if __clang_major__ < 3
@@ -140,7 +140,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec)
 }

 template<class VectorInt>
-void GridCmdOptionIntVector(std::string &str,VectorInt & vec)
+void GridCmdOptionIntVector(const std::string &str,VectorInt & vec)
 {
  vec.resize(0);
  std::stringstream ss(str);
@@ -153,6 +153,9 @@ void GridCmdOptionIntVector(std::string &str,VectorInt & vec)
  return;
 }

+template void GridCmdOptionIntVector(const std::string &str,std::vector<int> & vec);
+template void GridCmdOptionIntVector(const std::string &str,Coordinate & vec);
+
 void GridCmdOptionInt(std::string &str,int & val)
 {
  std::stringstream ss(str);
@@ -55,7 +55,7 @@ template<class VectorInt>
 std::string GridCmdVectorIntToString(const VectorInt & vec);
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
-void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
+void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);


@@ -56,12 +56,12 @@ int main(int argc, char **argv) {
  MD.trajL   = 1.0;

  HMCparameters HMCparams;
-  HMCparams.StartTrajectory  = 30;
+  HMCparams.StartTrajectory  = 0;
  HMCparams.Trajectories     = 200;
  HMCparams.NoMetropolisUntil=  0;
  // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
-  //  HMCparams.StartingType     =std::string("ColdStart");
-  HMCparams.StartingType     =std::string("CheckpointStart");
+  HMCparams.StartingType     =std::string("ColdStart");
+  //  HMCparams.StartingType     =std::string("CheckpointStart");
  HMCparams.MD = MD;
  HMCWrapper TheHMC(HMCparams);

@@ -1,4 +1,4 @@
-# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) 

 **Data parallel C++ mathematical object library.**

@@ -149,7 +149,6 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option**
 - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
 - `--disable-timers`: disable system dependent high-resolution timers.
@@ -1,3 +1,6 @@
+-- comms threads issue??
+-- Part done: Staggered kernel performance on GPU
+
 =========================================================
 General
 =========================================================
@@ -5,28 +8,18 @@ General
 - Make representations code take Gimpl
 - Simplify the HMCand remove modules
 - Lattice_arith - are the mult, mac etc.. still needed after ET engine?
- Lattice_rng
- Lattice_transfer.h
- accelerate A2Autils -- off critical path for HMC
+- Lattice_rng - faster local only loop in init
+- Audit: accelerate A2Autils -- off critical path for HMC

 =========================================================
-GPU branch code item work list
+GPU  work list
 =========================================================

-* sum_cpu promote to double during summation for increased precisoin.
+* sum_cpu promote to double during summation for increased precision.
 * Introduce sumD & ReduceD 
 * GPU sum is probably better currently.
-
 * Accelerate the cshift & benchmark

-* 0) Single GPU
- 128 bit integer table load in GPU code.
-  - ImprovedStaggered accelerate & measure perf
-  - Gianluca's changes to Cayley into gpu-port
-  - Mobius kernel fusion.                     -- Gianluca?
-  - Lebesque order reintroduction. StencilView should have pointer to it
-  - Lebesgue reorder in all kernels
-
 * 3) Comms/NVlink
 - OpenMP tasks to run comms threads. Experiment with it 
 - Remove explicit openMP in staggered. 
@@ -35,14 +28,6 @@ GPU branch code item work list
 - Stencil gather ??
 - SIMD dirs in stencil

-* 4) ET enhancements
- eval -> scalar ops in ET engine
- coalescedRead, coalescedWrite in expressions.
-
-* 5) Misc
- Conserved current clean up.
- multLinkProp eliminate
-
 8) Merge develop and test HMC

 9) Gamma tables on GPU; check this. Appear to work, but no idea why. Are these done on CPU?
@@ -52,7 +37,7 @@ GPU branch code item work list
 -     Audit NAMESPACE CHANGES
 -     Audit changes

-----
+---------
 Gianluca's changes
 - Performance impact of construct in aligned allocator???
 ---------
@@ -62,6 +47,33 @@ Gianluca's changes
 -----------------------------
 DONE:
 -----------------------------
+=====
+-- Done: Remez X^-1/2 X^-1/2 X = 1 test.
+         Feed in MdagM^2 as a test and take its sqrt.
+         Automated test that MdagM invsqrt(MdagM)invsqrt(MdagM) = 1 in HMC for bounds satisfaction.
+
+-- Done: Sycl Kernels into develop. Compare to existing unroll and just use.
+-- Done: sRNG into refresh functions
+-- Done: Tuned decomposition on CUDA into develop
+-- Done: Sycl friend accessor. Const view attempt via typedef??
+
+
+* Done 5) Misc
+- Conserved current clean up.
+- multLinkProp eliminate
+
+* Done 0) Single GPU
+- 128 bit integer table load in GPU code.
+  - ImprovedStaggered accelerate & measure perf
+  - Gianluca's changes to Cayley into gpu-port
+  - Mobius kernel fusion.                     -- Gianluca?
+  - Lebesgue order reintroduction. StencilView should have pointer to it
+  - Lebesgue reorder in all kernels
+
+* 4) ET enhancements
+- Done eval -> scalar ops in ET engine
+- Done coalescedRead, coalescedWrite in expressions.
+
 =============================================================================================
 AUDIT ContractWWVV with respect to develop    -- DONE
 - GPU accelerate EOFA                                                  -- DONE
@@ -125,23 +137,6 @@ AUDIT ContractWWVV with respect to develop    -- DONE
 - -      (4) omp parallel for collapse(n)
 - - Only (1) has a natural mirror in accelerator_loop
 - - Nested loop macros get cumbersome made a generic interface for N deep
- - Don't like thread_region and thread_loop_in_region
- - Could replace with 
-
-    thread_nested(1, 
-      for {
-
-      }
-    );
-    thread_nested(2,
-      for (){
-        for (){
-
-	}
-      }
-    );
-
-    and same "in_region".


 -----------------------------
@@ -53,7 +53,7 @@ int main (int argc, char ** argv)
  int threads = GridThread::GetThreads();

  Coordinate latt4 = GridDefaultLatt();
-  int Ls=8;
+  int Ls=16;
  for(int i=0;i<argc;i++)
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
@@ -7,7 +7,12 @@ AM_INIT_AUTOMAKE([subdir-objects 1.13])
 AM_EXTRA_RECURSIVE_TARGETS([tests bench])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([Grid/Grid.h])
-AC_CONFIG_HEADERS([Grid/Config.h],[sed -i 's|PACKAGE_|GRID_|' Grid/Config.h])
+AC_CONFIG_HEADERS([Grid/Config.h],[[$SED_INPLACE -e 's|PACKAGE_|GRID_|' -e 's|[[:space:]]PACKAGE[[:space:]]| GRID_PACKAGE |' -e 's|[[:space:]]VERSION[[:space:]]| GRID_PACKAGE_VERSION |' Grid/Config.h]],
+    [if test x"$host_os" = x"${host_os#darwin}" ; then]
+        [SED_INPLACE="sed -i"]
+    [else]
+        [SED_INPLACE="sed -i .bak"]
+    [fi])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])

 ################ Get git info
@@ -125,7 +130,7 @@ esac

 ############### fermions
 AC_ARG_ENABLE([fermion-reps],
-     [AC_HELP_STRING([--fermion-reps=yes|no], [enable extra fermion representation support])],
+     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
     [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])

 AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" == "yesX" ])
@@ -135,12 +140,23 @@ AC_ARG_ENABLE([gparity],
     [ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes])

 AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" == "yesX" ])
+
+AC_ARG_ENABLE([zmobius],
+     [AC_HELP_STRING([--enable-zmobius=yes|no], [enable Zmobius support])],
+     [ac_ZMOBIUS=${enable_zmobius}], [ac_ZMOBIUS=yes])
+
+AM_CONDITIONAL(BUILD_ZMOBIUS, [ test "${ac_ZMOBIUS}X" == "yesX" ])
+
+
 case ${ac_FERMION_REPS} in
   yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);;
 esac
 case ${ac_GPARITY} in
   yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);;
 esac
+case ${ac_ZMOBIUS} in
+   yes) AC_DEFINE([ENABLE_ZMOBIUS],[1],[Zmobius fermion actions]);;
+esac
 ############### Nc
 AC_ARG_ENABLE([Nc],
    [AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])],
@@ -428,7 +444,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        SIMD_FLAGS='-mavx2 -mfma -mf16c';;
      AVX512)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
-        SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
+        SIMD_FLAGS='-mavx512f -mavx512cd';;
      SKL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon])
        SIMD_FLAGS='-march=skylake-avx512';;
@@ -481,6 +497,9 @@ case ${ax_cv_cxx_compiler_vendor} in
        AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
        SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
      AVX512)
+        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        SIMD_FLAGS='-xcommon-avx512';;
+      SKL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        SIMD_FLAGS='-xcore-avx512';;
      KNC)
@@ -231,6 +231,20 @@ int main(int argc, char **argv) {
      scalar = localInnerProduct(cVec, cVec);
      scalar = localNorm2(cVec);

+      std::cout << "Testing maxLocalNorm2" <<std::endl;
+      
+      LatticeComplex rand_scalar(&Fine);
+      random(FineRNG, rand_scalar);  //uniform [0,1]
+      for(Integer gsite=0;gsite<Fine.gSites();gsite++){ //check on every site independently
+	scalar = rand_scalar;
+	TComplex big(10.0);
+	Coordinate coor;
+	Fine.GlobalIndexToGlobalCoor(gsite,coor);
+        pokeSite(big,scalar,coor);
+	
+	RealD Linfty = maxLocalNorm2(scalar);
+	assert(Linfty == 100.0);
+      }
      //     -=,+=,*=,()
      //     add,+,sub,-,mult,mac,*
      //     adj,conjugate
@@ -549,7 +563,8 @@ int main(int argc, char **argv) {

                  std::vector<int> shiftcoor = coor;
                  shiftcoor[dir] = (shiftcoor[dir] + shift + latt_size[dir]) %
-                                   (latt_size[dir] / mpi_layout[dir]);
+                                   (latt_size[dir]);
+		  //                                   (latt_size[dir] / mpi_layout[dir]);

                  std::vector<int> rl(4);
                  for (int dd = 0; dd < 4; dd++) {
@@ -40,9 +40,9 @@ int main (int argc, char ** argv)

  int N=16;
  
-  std::vector<int> latt_size  ({N,4,4});
-  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1});
-  std::vector<int> mpi_layout ({1,1,1});
+  std::vector<int> latt_size  ({N,N,N,N});
+  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1,1});
+  std::vector<int> mpi_layout ({1,1,1,1});

  int vol = 1;
  int nd  = latt_size.size();
@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
    for(int t=0;t<latt_size[mu];t++){
      LatticeCoordinate(coor,mu);
      sl=where(coor==Integer(t),rn,zz);
-      std::cout <<GridLogMessage<< " sl " << sl<<std::endl;
+      //      std::cout <<GridLogMessage<< " sl " << sl<<std::endl;
      std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
      ns=ns+norm2(sl);
    }
@@ -0,0 +1,143 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_poisson_fft.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+ ;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // For three field types: cut a Gaussian field into slices along each direction and check the slice norms add up to the full norm.
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  int N=16;
+  // Three-dimensional lattice: N sites in the first direction, 4 in each of the others.
+  std::vector<int> latt_size  ({N,4,4});
+  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1});
+  std::vector<int> mpi_layout ({1,1,1});
+
+  int vol = 1;
+  int nd  = latt_size.size();
+  for(int d=0;d<nd;d++){
+    vol = vol * latt_size[d];
+  }
+
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+  GridParallelRNG RNG(&GRID);
+  RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));  
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"== LatticeComplex =="<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  {
+    LatticeComplexD      zz(&GRID);
+    LatticeInteger     coor(&GRID);
+    LatticeComplexD  rn(&GRID);
+    LatticeComplexD  sl(&GRID);
+
+    zz  = ComplexD(0.0,0.0);
+    // zz is the zero field that where() substitutes on sites outside the selected slice.
+    gaussian(RNG,rn);
+    
+    RealD nn=norm2(rn);
+    for(int mu=0;mu<nd;mu++){
+      RealD ns=0.0;
+      for(int t=0;t<latt_size[mu];t++){
+	LatticeCoordinate(coor,mu);
+	sl=where(coor==Integer(t),rn,zz);
+	std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
+	ns=ns+norm2(sl);
+      }
+      std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
+      assert((nn-ns) < 1.0e-10 && (ns-nn) < 1.0e-10); // two-sided |nn-ns| check; unqualified abs() may bind to the integer overload
+    }
+  }
+
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"== LatticeFermion =="<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  {
+    LatticeFermionD      zz(&GRID);
+    LatticeInteger     coor(&GRID);
+    LatticeFermionD  rn(&GRID);
+    LatticeFermionD  sl(&GRID);
+
+    zz  = ComplexD(0.0,0.0);
+    // zz is the zero field that where() substitutes on sites outside the selected slice.
+    gaussian(RNG,rn);
+    
+    RealD nn=norm2(rn);
+    for(int mu=0;mu<nd;mu++){
+      RealD ns=0.0;
+      for(int t=0;t<latt_size[mu];t++){
+	LatticeCoordinate(coor,mu);
+	sl=where(coor==Integer(t),rn,zz);
+	std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
+	ns=ns+norm2(sl);
+      }
+      std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
+      assert((nn-ns) < 1.0e-10 && (ns-nn) < 1.0e-10); // two-sided |nn-ns| check; unqualified abs() may bind to the integer overload
+    }
+  }
+
+
+
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+  std::cout<<GridLogMessage<<"== LatticePropagator =="<<std::endl;
+  std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
+
+  {
+    LatticePropagatorD      zz(&GRID);
+    LatticeInteger     coor(&GRID);
+    LatticePropagatorD  rn(&GRID);
+    LatticePropagatorD  sl(&GRID);
+
+    zz  = ComplexD(0.0,0.0);
+    // zz is the zero field that where() substitutes on sites outside the selected slice.
+    gaussian(RNG,rn);
+    
+    RealD nn=norm2(rn);
+    for(int mu=0;mu<nd;mu++){
+      RealD ns=0.0;
+      for(int t=0;t<latt_size[mu];t++){
+	LatticeCoordinate(coor,mu);
+	sl=where(coor==Integer(t),rn,zz);
+	std::cout <<GridLogMessage<<" slice "<<t<<" " << norm2(sl)<<std::endl;
+	ns=ns+norm2(sl);
+      }
+      std::cout <<GridLogMessage <<" sliceNorm" <<mu<<" "<< nn <<" "<<ns<<" err " << nn-ns<<std::endl;
+      assert((nn-ns) < 1.0e-10 && (ns-nn) < 1.0e-10); // two-sided |nn-ns| check; unqualified abs() may bind to the integer overload
+    }
+  }
+
+  Grid_finalize();
+}
@@ -33,13 +33,14 @@ using namespace Grid;


 template<class What> 
-void  TestConserved(What & Ddwf, What & Ddwfrev, 
+void  TestConserved(What & Ddwf,
 		    LatticeGaugeField &Umu,
 		    GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
 		    GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		    RealD mass, RealD M5,
 		    GridParallelRNG *RNG4,
-		    GridParallelRNG *RNG5);
+		    GridParallelRNG *RNG5,
+                    What *Ddwfrev=nullptr);

  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
@@ -102,10 +103,11 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);


-  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          RNG4(UGrid);
+  std::vector<int> seeds4({1,2,3,4}); RNG4.SeedFixedIntegers(seeds4);
+  //const std::string seeds4{ "test-gauge-3000" }; RNG4.SeedUniqueString( seeds4 );

  LatticeGaugeField Umu(UGrid);
  if( argc > 1 && argv[1][0] != '-' )
@@ -116,9 +118,9 @@ int main (int argc, char ** argv)
  }
  else
  {
-    std::cout<<GridLogMessage <<"Using cold configuration"<<std::endl;
-    SU<Nc>::ColdConfiguration(Umu);
-    //  SU<Nc>::HotConfiguration(RNG4,Umu);
+    std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
+    // SU<Nc>::ColdConfiguration(Umu);
+    SU<Nc>::HotConfiguration(RNG4,Umu);
  }

  RealD mass=0.3;
@@ -127,7 +129,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  TestConserved<DomainWallFermionR>(Ddwf,Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<DomainWallFermionR>(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);

  RealD b=1.5;// Scale factor b+c=2, b-c=1
  RealD c=0.5;
@@ -137,13 +139,13 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage <<"MobiusFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  MobiusFermionR Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
-  TestConserved<MobiusFermionR>(Dmob,Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<MobiusFermionR>(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);

  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"ScaledShamirFermion test"<<std::endl;
  std::cout<<GridLogMessage <<"======================"<<std::endl;
  ScaledShamirFermionR Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0);
-  TestConserved<ScaledShamirFermionR>(Dsham,Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<ScaledShamirFermionR>(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);

  std::cout<<GridLogMessage <<"======================"<<std::endl;
  std::cout<<GridLogMessage <<"ZMobiusFermion test"<<std::endl;
@@ -152,8 +154,7 @@ int main (int argc, char ** argv)
  //  for(int s=0;s<Ls;s++) omegasrev[s]=omegas[Ls-1-s];
  ZMobiusFermionR ZDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,omegas,b,c);
  ZMobiusFermionR ZDmobrev(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,omegasrev,b,c);
-
-  TestConserved<ZMobiusFermionR>(ZDmob,ZDmobrev,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
+  TestConserved<ZMobiusFermionR>(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5,&ZDmobrev);

  Grid_finalize();
 }
@@ -161,22 +162,17 @@ int main (int argc, char ** argv)


 template<class Action> 
-void  TestConserved(Action & Ddwf, 
-		    Action & Ddwfrev, 
+void  TestConserved(Action & Ddwf,
 		    LatticeGaugeField &Umu,
 		    GridCartesian         * FGrid,	       GridRedBlackCartesian * FrbGrid,
 		    GridCartesian         * UGrid,	       GridRedBlackCartesian * UrbGrid,
 		    RealD mass, RealD M5,
 		    GridParallelRNG *RNG4,
-		    GridParallelRNG *RNG5)
+		    GridParallelRNG *RNG5,
+                    Action * Ddwfrev)
 {
-  int Ls=Ddwf.Ls;
-
-  LatticePropagator phys_src(UGrid); 
-
-  std::vector<LatticeColourMatrix> U(4,UGrid);
-  
-  LatticePropagator seqsrc(FGrid); 
+  LatticePropagator phys_src(UGrid);
+  LatticePropagator seqsrc(FGrid);
  LatticePropagator prop5(FGrid); 
  LatticePropagator prop5rev(FGrid); 
  LatticePropagator prop4(UGrid); 
@@ -194,9 +190,9 @@ void  TestConserved(Action & Ddwf,
  phys_src=Zero();
  pokeSite(kronecker,phys_src,coor);
  
-  MdagMLinearOperator<Action,LatticeFermion> HermOp(Ddwf);
-  MdagMLinearOperator<Action,LatticeFermion> HermOprev(Ddwfrev);
  ConjugateGradient<LatticeFermion> CG(1.0e-16,100000);
+  SchurRedBlackDiagTwoSolve<LatticeFermion> schur(CG);
+  ZeroGuesser<LatticeFermion> zpg;
  for(int s=0;s<Nd;s++){
    for(int c=0;c<Nc;c++){
      LatticeFermion src4  (UGrid); 
@@ -206,20 +202,20 @@ void  TestConserved(Action & Ddwf,
      Ddwf.ImportPhysicalFermionSource(src4,src5);

      LatticeFermion result5(FGrid); result5=Zero();
-
-      // CGNE
-      LatticeFermion Mdagsrc5  (FGrid); 
-      Ddwf.Mdag(src5,Mdagsrc5);
-      CG(HermOp,Mdagsrc5,result5);
+      schur(Ddwf,src5,result5,zpg);
+      std::cout<<GridLogMessage<<"spin "<<s<<" color "<<c<<" norm2(sourc5d) "<<norm2(src5)
+               <<" norm2(result5d) "<<norm2(result5)<<std::endl;
      FermToProp<Action>(prop5,result5,s,c);

      LatticeFermion result4(UGrid);
      Ddwf.ExportPhysicalFermionSolution(result5,result4);
      FermToProp<Action>(prop4,result4,s,c);

-      Ddwfrev.ImportPhysicalFermionSource(src4,src5);
-      Ddwfrev.Mdag(src5,Mdagsrc5);
-      CG(HermOprev,Mdagsrc5,result5);
+      if( Ddwfrev ) {
+        Ddwfrev->ImportPhysicalFermionSource(src4,src5);
+        result5 = Zero();
+        schur(*Ddwfrev,src5,result5,zpg);
+      }
      FermToProp<Action>(prop5rev,result5,s,c);
    }
  }
@@ -251,11 +247,7 @@ void  TestConserved(Action & Ddwf,
      PropToFerm<Action>(src5,seqsrc,s,c);

      LatticeFermion result5(FGrid); result5=Zero();
-
-      // CGNE
-      LatticeFermion Mdagsrc5  (FGrid); 
-      Ddwf.Mdag(src5,Mdagsrc5);
-      CG(HermOp,Mdagsrc5,result5);
+      schur(Ddwf,src5,result5,zpg);

      LatticeFermion result4(UGrid);
      Ddwf.ExportPhysicalFermionSolution(result5,result4);
@@ -276,10 +268,10 @@ void  TestConserved(Action & Ddwf,
  Ddwf.ContractConservedCurrent(prop5rev,prop5,Vector_mu,phys_src,Current::Vector,Tdir);
  Ddwf.ContractJ5q(prop5,PJ5q);
  
-  PA       = trace(g5*Axial_mu);
-  SV       = trace(Vector_mu);
-  VV       = trace(gT*Vector_mu);
-  PP       = trace(adj(prop4)*prop4);
+  PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current
+  SV       = trace(Vector_mu);        // Scalar-Vector conserved current
+  VV       = trace(gT*Vector_mu);     // (local) Vector-Vector conserved current
+  PP       = trace(adj(prop4)*prop4); // Pseudoscalar density
  
  // Spatial sum
  sliceSum(PA,sumPA,Tdir);
@@ -288,15 +280,17 @@ void  TestConserved(Action & Ddwf,
  sliceSum(PP,sumPP,Tdir);
  sliceSum(PJ5q,sumPJ5q,Tdir);

-  int Nt=sumPA.size();
+  const int Nt{static_cast<int>(sumPA.size())};
+  std::cout<<GridLogMessage<<"Vector Ward identity by timeslice (~ 0)"<<std::endl;
  for(int t=0;t<Nt;t++){
-    std::cout <<" SV "<<real(TensorRemove(sumSV[t]));
-    std::cout <<" VV "<<real(TensorRemove(sumVV[t]))<<std::endl;
+    std::cout<<GridLogMessage <<" t "<<t<<" SV "<<real(TensorRemove(sumSV[t]))<<" VV "<<real(TensorRemove(sumVV[t]))<<std::endl;
  }
+  std::cout<<GridLogMessage<<"Axial Ward identity by timeslice (defect ~ 0)"<<std::endl;
  for(int t=0;t<Nt;t++){
-    std::cout <<" PAc "<<real(TensorRemove(sumPA[t]));
-    std::cout <<" PJ5q "<<real(TensorRemove(sumPJ5q[t]));
-    std::cout <<" Ward Identity defect " <<real(TensorRemove(sumPA[t]-sumPA[(t-1+Nt)%Nt] - 2.0*(Ddwf.mass*sumPP[t] + sumPJ5q[t]) ))<<"\n";
+    const RealD DmuPAmu{real(TensorRemove(sumPA[t]-sumPA[(t-1+Nt)%Nt]))};
+    std::cout<<GridLogMessage<<" t "<<t<<" DmuPAmu "<<DmuPAmu
+             <<" PP "<<real(TensorRemove(sumPP[t]))<<" PJ5q "<<real(TensorRemove(sumPJ5q[t]))
+             <<" Ward Identity defect " <<(DmuPAmu - 2.*real(TensorRemove(Ddwf.mass*sumPP[t] + sumPJ5q[t])))<<std::endl;
  }
  
  ///////////////////////////////
@@ -66,7 +66,9 @@ int main(int argc, char** argv)
  // Set up RNGs
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
+  GridSerialRNG sRNG;
  GridParallelRNG RNG5(FGrid);
+  sRNG.SeedFixedIntegers(seeds5);
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
@@ -84,7 +86,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -94,7 +96,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -74,6 +74,9 @@ int main(int argc, char** argv)
  RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
+  GridSerialRNG sRNG;
+  RNG4.SeedFixedIntegers(seeds4);
+  sRNG.SeedFixedIntegers(seeds5);

  // Random gauge field
  LatticeGaugeField Umu(UGrid);
@@ -90,7 +93,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -100,7 +103,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu,sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -68,8 +68,10 @@ int main(int argc, char** argv)
  // Set up RNGs
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
+  GridSerialRNG sRNG;
  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(seeds5);
+  sRNG.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);

@@ -86,7 +88,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG,RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -96,7 +98,7 @@ int main(int argc, char** argv)
    ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG,RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -73,7 +73,9 @@ int main(int argc, char** argv)
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
  GridParallelRNG RNG5(FGrid);
+  GridSerialRNG   sRNG;
  RNG5.SeedFixedIntegers(seeds5);
+  sRNG.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);

@@ -91,7 +93,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -101,7 +103,7 @@ int main(int argc, char** argv)
    ConjugateGradient<FermionField> CG(1.0e-12, 5000);
    ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);

-    Meofa.refresh(Umu, RNG5);
+    Meofa.refresh(Umu, sRNG, RNG5);
    printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
  }

@@ -86,7 +86,9 @@ int main (int argc, char** argv)
  ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
  ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true);

-  Meofa.refresh(U, RNG5);
+  GridSerialRNG  sRNG; sRNG.SeedFixedIntegers(seeds4);
+  Meofa.refresh(U, sRNG, RNG5 );
+
  RealD S = Meofa.S(U); // pdag M p

  // get the deriv of phidag M phi with respect to "U"
@@ -84,6 +84,13 @@ int main (int argc, char ** argv)
  GparityDomainWallFermionR::ImplParams params;
  params.twists = twists;

+  /*
+  params.boundary_phases[0] = 1.0;
+  params.boundary_phases[1] = 1.0;
+  params.boundary_phases[2] = 1.0;
+  params.boundary_phases[3] =- 1.0;
+  */
+  
  GparityDomainWallFermionR Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);

  Dw.M   (phi,Mphi);
@@ -96,6 +103,16 @@ int main (int argc, char ** argv)

  Dw.MDeriv(tmp , Mphi,  phi,DaggerNo );  UdSdU=tmp;
  Dw.MDeriv(tmp , phi,  Mphi,DaggerYes ); UdSdU=(UdSdU+tmp);
+
+  // *****************************************************************************************
+  // *** There is a funny negative sign in all derivatives. This is - UdSdU.               ***
+  // ***                                                                                   ***
+  // *** Deriv in both Wilson gauge action and the TwoFlavour.h seems to miss a minus sign ***
+  // *** UdSdU is negated relative to what I think - call what is returned mUdSdU,         ***
+  // *** and insert minus sign                                                             ***
+  // *****************************************************************************************
+
+  UdSdU = - UdSdU ; // Follow sign convention of actions in Grid. Seems crazy.
  
  FermionField Ftmp      (FGrid);

@@ -106,7 +123,7 @@ int main (int argc, char ** argv)
  RealD Hmom = 0.0;
  RealD Hmomprime = 0.0;
  LatticeColourMatrix mommu(UGrid); 
-  LatticeColourMatrix forcemu(UGrid); 
+  LatticeColourMatrix mUdSdUmu(UGrid); 
  LatticeGaugeField mom(UGrid); 
  LatticeGaugeField Uprime(UGrid); 

@@ -114,10 +131,20 @@ int main (int argc, char ** argv)

    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg

-    Hmom -= real(sum(trace(mommu*mommu)));
+  // Momentum Hamiltonian is - trace(p^2)/HMC_MOM_DENOMINATOR
+  //
+  // Integrator.h:   RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom                                                                                       //     GaugeImplTypes.h:        Hloc += trace(Pmu * Pmu);
+  //                          Sign comes from a sneaky multiply by "i" in GaussianFundamentalLieAlgebraMatrix
+  //                          P is i P^a_\mu T^a, not Pa Ta
+  // 
+  // Integrator.h: H =  Hmom + sum S(action)
+    Hmom -= real(sum(trace(mommu*mommu)))/ HMC_MOMENTUM_DENOMINATOR;

    PokeIndex<LorentzIndex>(mom,mommu,mu);

+  // -- Drops factor of "i" in the U update: U' = e^{P dt} U   [ _not_ e^{iPdt}U ]. P is anti hermitian already
+  // -- Udot = p U
+
    // fourth order exponential approx
    autoView( mom_v, mom, CpuRead);
    autoView( U_v , U, CpuRead);
@@ -134,8 +161,8 @@ int main (int argc, char ** argv)
 	;
    });
  }
-
  std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <<std::endl;
+
  Dw.ImportGauge(Uprime);
  Dw.M          (phi,MphiPrime);

@@ -145,53 +172,60 @@ int main (int argc, char ** argv)
  // Use derivative to estimate dS
  //////////////////////////////////////////////

-
-  for(int mu=0;mu<Nd;mu++){
-    std::cout << "" <<std::endl;
-    mommu   = PeekIndex<LorentzIndex>(mom,mu);
-    std::cout << GridLogMessage<< " Mommu  " << norm2(mommu)<<std::endl;
-    mommu   = mommu+adj(mommu);
-    std::cout << GridLogMessage<< " Mommu + Mommudag " << norm2(mommu)<<std::endl;
-    mommu   = PeekIndex<LorentzIndex>(UdSdU,mu);
-    std::cout << GridLogMessage<< " dsdumu  " << norm2(mommu)<<std::endl;
-    mommu   = mommu+adj(mommu);
-    std::cout << GridLogMessage<< " dsdumu + dag  " << norm2(mommu)<<std::endl;
-  }
-
+  //
+  // Ta has 1/2([ F - adj(F) ])_traceless and want the UdSdU _and_ UdagdSdUdag terms so 2x. 
+  //
  LatticeComplex dS(UGrid); dS = Zero();
  LatticeComplex dSmom(UGrid); dSmom = Zero();
  LatticeComplex dSmom2(UGrid); dSmom2 = Zero();
  for(int mu=0;mu<Nd;mu++){
    mommu   = PeekIndex<LorentzIndex>(UdSdU,mu);
-    mommu=Ta(mommu)*2.0;
+    mommu=Ta(mommu); // projectForce , GaugeImplTypes.h
    PokeIndex<LorentzIndex>(UdSdU,mommu,mu);
  }

  for(int mu=0;mu<Nd;mu++){
-    mommu   = PeekIndex<LorentzIndex>(mom,mu);
-    std::cout << GridLogMessage<< " Mommu  " << norm2(mommu)<<std::endl;
-    mommu   = mommu+adj(mommu);
-    std::cout << GridLogMessage<< " Mommu + Mommudag " << norm2(mommu)<<std::endl;
-    mommu   = PeekIndex<LorentzIndex>(UdSdU,mu);
-    std::cout << GridLogMessage<< " dsdumu  " << norm2(mommu)<<std::endl;
-    mommu   = mommu+adj(mommu);
-    std::cout << GridLogMessage<< " dsdumu + dag  " << norm2(mommu)<<std::endl;
-  }

-  for(int mu=0;mu<Nd;mu++){
-    forcemu = PeekIndex<LorentzIndex>(UdSdU,mu);
+    mUdSdUmu= PeekIndex<LorentzIndex>(UdSdU,mu);
    mommu   = PeekIndex<LorentzIndex>(mom,mu);

-    // Update PF action density
-    dS = dS+trace(mommu*forcemu)*dt;
+    //
+    // Derive HMC eom:
+    //
+    // Sdot =  - 2 trace( p p^dot ) / D - trace( p [ mUdSdU - h.c. ] ) = 0 
+    //
+    //
+    // Sdot = 0 = - 2 trace( p p^dot ) / D - 2 trace( p Ta( mUdSdU ) ) = 0
+    //
+    // EOM: 
+    //
+    // pdot = - D Ta( mUdSdU ) -- source of sign is the "funny sign" above
+    //
+    // dSqcd_dt  = - 2.0*trace(mommu* Ta(mUdSdU) )*dt -- i.e. mUdSdU with adjoint term -> force has a 2x implicit
+    //
+    // dH_mom/dt = - 2 trace (p pdot)/Denom  
+    //
+    // dH_tot / dt = 0 <= pdot = - Denom * mUdSdU 
+    //
+    // dH_mom/dt = 2 trace (p mUdSdU ) 
+    //
+    // True Momentum delta H has a dt^2 piece
+    //
+    // dSmom = [ trace mom*mom - trace ( (mom-Denom*f*dt)(mom-Denom*f*dt) ) ] / Denom
+    //       = 2*trace(mom*f) dt  - Denom*dt*dt * trace(f*f).
+    //       = dSmom + dSmom2
+    //

-    dSmom  = dSmom  - trace(mommu*forcemu) * dt;
-    dSmom2 = dSmom2 - trace(forcemu*forcemu) *(0.25* dt*dt);
+    dS     = dS - 2.0*trace(mommu*mUdSdUmu)*dt;   // U and Udagger derivs hence 2x.

-    // Update mom action density
-    mommu = mommu + forcemu*(dt*0.5);
+    dSmom  = dSmom  + 2.0*trace(mommu*mUdSdUmu) * dt;  // this 2.0 coms from derivative of p^2 
+    
+    dSmom2 = dSmom2 - trace(mUdSdUmu*mUdSdUmu) * dt*dt* HMC_MOMENTUM_DENOMINATOR; // Remnant

-    Hmomprime -= real(sum(trace(mommu*mommu)));
+    // Update mom action density . Verbatim update_P in Integrator.h
+    mommu = mommu - mUdSdUmu * dt* HMC_MOMENTUM_DENOMINATOR;; 
+
+    Hmomprime -= real(sum(trace(mommu*mommu))) / HMC_MOMENTUM_DENOMINATOR;

  }

@@ -199,20 +233,25 @@ int main (int argc, char ** argv)
  ComplexD dSm       = sum(dSmom);
  ComplexD dSm2      = sum(dSmom2);

+  std::cout << GridLogMessage <<"dSm "<< dSm<<std::endl;
+  std::cout << GridLogMessage <<"dSm2 "<< dSm2<<std::endl;

  std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <<std::endl;
  std::cout << GridLogMessage <<"Final   mom hamiltonian is "<< Hmomprime <<std::endl;
-  std::cout << GridLogMessage <<"Delta   mom hamiltonian is "<< Hmomprime-Hmom <<std::endl;

-  std::cout << GridLogMessage << " S      "<<S<<std::endl;
-  std::cout << GridLogMessage << " Sprime "<<Sprime<<std::endl;
-  std::cout << GridLogMessage << "dS      "<<Sprime-S<<std::endl;
-  std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
-  std::cout << GridLogMessage <<"dSm "<< dSm<<std::endl;
-  std::cout << GridLogMessage <<"dSm2"<< dSm2<<std::endl;
+  std::cout << GridLogMessage <<"Delta   mom hamiltonian is "<< Hmomprime-Hmom <<std::endl;
+  std::cout << GridLogMessage <<"predict Delta mom hamiltonian is "<< dSm+dSm2 <<std::endl;
+  
+  std::cout << GridLogMessage << "Initial S      "<<S<<std::endl;
+  std::cout << GridLogMessage << "Final   S      "<<Sprime<<std::endl;
+  std::cout << GridLogMessage << "Delta   S      "<<Sprime-S<<std::endl;
+  std::cout << GridLogMessage << "predict delta S"<< dSpred <<std::endl;
+  std::cout << GridLogMessage << "defect "<< Sprime-S-dSpred <<std::endl;

  std::cout << GridLogMessage << "Total dS    "<< Hmomprime - Hmom + Sprime - S <<std::endl;

+  std::cout << GridLogMessage << "dS - dt^2 term "<< Hmomprime - Hmom + Sprime - S - dSm2 <<std::endl;
+  
  assert( fabs(real(Sprime-S-dSpred)) < 5.0 ) ;

  std::cout<< GridLogMessage << "Done" <<std::endl;
@@ -90,7 +90,8 @@ int main (int argc, char** argv)
  ConjugateGradient<FermionField> CG(1.0e-12, 5000);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true);

-  Meofa.refresh(U, RNG5);
+  GridSerialRNG  sRNG; sRNG.SeedFixedIntegers(seeds4);
+  Meofa.refresh(U, sRNG, RNG5);
  RealD S = Meofa.S(U); // pdag M p

  // get the deriv of phidag M phi with respect to "U"
@@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 using namespace std;
 using namespace Grid;
- ;

 

@@ -59,6 +58,10 @@ int main (int argc, char ** argv)
  double beta = 1.0;
  double c1   = 0.331;

+  const int nu = 1;
+  std::vector<int> twists(Nd,0);
+  twists[nu] = 1;
+  ConjugateGimplD::setDirections(twists);
  ConjugatePlaqPlusRectangleActionR Action(beta,c1);
  //ConjugateWilsonGaugeActionR Action(beta);
  //WilsonGaugeActionR Action(beta);
@@ -46,6 +46,7 @@ int main (int argc, char ** argv)

  std::vector<int> seeds({1,2,3,4});

+  GridSerialRNG sRNG; sRNG.SeedFixedIntegers({4,5,6,7});
  GridParallelRNG          pRNG(&Grid);
  pRNG.SeedFixedIntegers(std::vector<int>({15,91,21,3}));

@@ -67,7 +68,7 @@ int main (int argc, char ** argv)
  LaplacianAdjointField<PeriodicGimplR> Laplacian(&Grid, CG, LapPar, Kappa);
  GeneralisedMomenta<PeriodicGimplR> LaplacianMomenta(&Grid, Laplacian);
  LaplacianMomenta.M.ImportGauge(U);
-  LaplacianMomenta.MomentaDistribution(pRNG);// fills the Momenta with the correct distr
+  LaplacianMomenta.MomentaDistribution(sRNG,pRNG);// fills the Momenta with the correct distr
  

  std::cout << std::setprecision(15);
@@ -69,7 +69,14 @@ int main (int argc, char ** argv)
  RealD M5=1.8; 
  RealD b=0.5;
  RealD c=0.5;
-  MobiusFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+
+  WilsonImplParams p;
+  p.boundary_phases[0] = 1.0;
+  p.boundary_phases[1] = 1.0;
+  p.boundary_phases[2] = 1.0;
+  p.boundary_phases[3] =- 1.0;
+  
+  MobiusFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,p);
  Ddwf.M   (phi,Mphi);

  ComplexD S    = innerProduct(Mphi,Mphi); // pdag MdagM p
@@ -82,24 +89,44 @@ int main (int argc, char ** argv)
  Ddwf.MDeriv(tmp , Mphi,  phi,DaggerNo );  UdSdU=tmp;
  Ddwf.MDeriv(tmp , phi,  Mphi,DaggerYes ); UdSdU=(UdSdU+tmp);  
  
+  // *****************************************************************************************
+  // *** There is a funny negative sign in all derivatives. This is - UdSdU.               ***
+  // ***                                                                                   ***
+  // *** Deriv in both Wilson gauge action and the TwoFlavour.h seems to miss a minus sign ***
+  // *** UdSdU is negated relative to what I think - call what is returned mUdSdU,         ***
+  // *** and insert minus sign                                                             ***
+  // *****************************************************************************************
+
+  UdSdU = - UdSdU ; // Follow sign convention of actions in Grid. Seems crazy.
+
  LatticeFermion Ftmp      (FGrid);

  ////////////////////////////////////
  // Modify the gauge field a little 
  ////////////////////////////////////
-  RealD dt = 0.0001;
+  RealD dt = 0.001;
+  RealD Hmom = 0.0;
+  RealD Hmomprime = 0.0;

  LatticeColourMatrix mommu(UGrid); 
-  LatticeColourMatrix forcemu(UGrid); 
+  LatticeColourMatrix mUdSdUmu(UGrid); 
  LatticeGaugeField mom(UGrid); 
  LatticeGaugeField Uprime(UGrid); 

  for(int mu=0;mu<Nd;mu++){

    SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
-
    PokeIndex<LorentzIndex>(mom,mommu,mu);

+    // Momentum Hamiltonian is - trace(p^2)/HMC_MOM_DENOMINATOR
+    //
+    // Integrator.h:   RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom                                                                                       //     GaugeImplTypes.h:        Hloc += trace(Pmu * Pmu);
+    //                          Sign comes from a sneaky multiply by "i" in GaussianFundamentalLieAlgebraMatrix
+    //                          P is i P^a_\mu T^a, not Pa Ta
+    // 
+    // Integrator.h: H =  Hmom + sum S(action)
+    Hmom -= real(sum(trace(mommu*mommu)))/ HMC_MOMENTUM_DENOMINATOR;
+    
    // fourth order exponential approx
    autoView( U_v , U, CpuRead);
    autoView( mom_v, mom, CpuRead);
@@ -115,6 +142,7 @@ int main (int argc, char ** argv)
 	;
    });
  }
+  std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <<std::endl;
  
  Ddwf.ImportGauge(Uprime);
  Ddwf.M          (phi,MphiPrime);
@@ -125,32 +153,87 @@ int main (int argc, char ** argv)
  // Use derivative to estimate dS
  //////////////////////////////////////////////

-
  LatticeComplex dS(UGrid); dS = Zero();
+  LatticeComplex dSmom(UGrid); dSmom = Zero();
+  LatticeComplex dSmom2(UGrid); dSmom2 = Zero();
+
  for(int mu=0;mu<Nd;mu++){
    mommu   = PeekIndex<LorentzIndex>(UdSdU,mu);
-    mommu=Ta(mommu)*2.0;
+    mommu=Ta(mommu);
    PokeIndex<LorentzIndex>(UdSdU,mommu,mu);
  }

  for(int mu=0;mu<Nd;mu++){
-    forcemu = PeekIndex<LorentzIndex>(UdSdU,mu);
+
+    mUdSdUmu= PeekIndex<LorentzIndex>(UdSdU,mu);
    mommu   = PeekIndex<LorentzIndex>(mom,mu);

-    // Update PF action density
-    dS = dS+trace(mommu*forcemu)*dt;
+  //
+  // Derive HMC eom:
+  //
+  // Sdot =  - 2 trace( p p^dot ) / D - trace( p [ mUdSdU - h.c. ] ) = 0 
+  //
+  //
+  // Sdot = 0 = - 2 trace( p p^dot ) / D - 2 trace( p Ta( mUdSdU ) ) = 0
+  //
+  // EOM: 
+  //
+  // pdot = - D Ta( mUdSdU ) -- source of sign is the "funny sign" above
+  //
+  // dSqcd_dt  = - 2.0*trace(mommu* Ta(mUdSdU) )*dt -- i.e. mUdSdU with adjoint term -> force has a 2x implicit
+  //
+  // dH_mom/dt = - 2 trace (p pdot)/Denom  
+  //
+  // dH_tot / dt = 0 <= pdot = - Denom * mUdSdU 
+  //
+  // dH_mom/dt = 2 trace (p mUdSdU ) 
+  //
+  // True Momentum delta H has a dt^2 piece
+  //
+  // dSmom = [ trace mom*mom - trace ( (mom-Denom*f*dt)(mom-Denom*f*dt) ) ] / Denom
+  //       = 2*trace(mom*f) dt  - Denom*dt*dt * trace(f*f).
+  //       = dSmom + dSmom2
+  //
+    
+    dS     = dS - 2.0*trace(mommu*mUdSdUmu)*dt;   // U and Udagger derivs hence 2x.
+
+    dSmom  = dSmom  + 2.0*trace(mommu*mUdSdUmu) * dt;  // this 2.0 coms from derivative of p^2 
+    
+    dSmom2 = dSmom2 - trace(mUdSdUmu*mUdSdUmu) * dt*dt* HMC_MOMENTUM_DENOMINATOR; // Remnant
+
+    mommu = mommu - mUdSdUmu * dt* HMC_MOMENTUM_DENOMINATOR;; 
+
+    Hmomprime -= real(sum(trace(mommu*mommu))) / HMC_MOMENTUM_DENOMINATOR;

  }

  ComplexD dSpred    = sum(dS);
+  ComplexD dSm       = sum(dSmom);
+  ComplexD dSm2      = sum(dSmom2);

-  std::cout << GridLogMessage << " -- S         "<<S<<std::endl;
-  std::cout << GridLogMessage << " -- Sprime    "<<Sprime<<std::endl;
-  std::cout << GridLogMessage << "dS            "<<Sprime-S<<std::endl;
-  std::cout << GridLogMessage << "predict dS    "<< dSpred <<std::endl;
+  std::cout << GridLogMessage <<"dSm "<< dSm<<std::endl;
+  std::cout << GridLogMessage <<"dSm2 "<< dSm2<<std::endl;
+
+  std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <<std::endl;
+  std::cout << GridLogMessage <<"Final   mom hamiltonian is "<< Hmomprime <<std::endl;
+
+  std::cout << GridLogMessage <<"Delta   mom hamiltonian is "<< Hmomprime-Hmom <<std::endl;
+  std::cout << GridLogMessage <<"predict Delta mom hamiltonian is "<< dSm+dSm2 <<std::endl;
+  
+  std::cout << GridLogMessage << "Initial S      "<<S<<std::endl;
+  std::cout << GridLogMessage << "Final   S      "<<Sprime<<std::endl;
+  std::cout << GridLogMessage << "Delta   S      "<<Sprime-S<<std::endl;
+  std::cout << GridLogMessage << "predict delta S"<< dSpred <<std::endl;
+  std::cout << GridLogMessage << "defect "<< Sprime-S-dSpred <<std::endl;
+
+  std::cout << GridLogMessage << "Total dS    "<< Hmomprime - Hmom + Sprime - S <<std::endl;
+
+  std::cout << GridLogMessage << "dS - dt^2 term "<< Hmomprime - Hmom + Sprime - S - dSm2 <<std::endl;

  assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;

+
+  
  std::cout<< GridLogMessage << "Done" <<std::endl;
  Grid_finalize();
 }
@@ -88,7 +88,8 @@ int main (int argc, char** argv)
  ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
  ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false);

-  Meofa.refresh(U, RNG5);
+  GridSerialRNG  sRNG; sRNG.SeedFixedIntegers(seeds4);
+  Meofa.refresh(U, sRNG, RNG5 );
  RealD S = Meofa.S(U); // pdag M p

  // get the deriv of phidag M phi with respect to "U"
@@ -93,7 +93,8 @@ int main (int argc, char** argv)
  ConjugateGradient<FermionField> CG(1.0e-12, 5000);
  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false);

-  Meofa.refresh(U, RNG5);
+  GridSerialRNG  sRNG; sRNG.SeedFixedIntegers(seeds4);
+  Meofa.refresh(U, sRNG, RNG5 );
  RealD S = Meofa.S(U); // pdag M p

  // get the deriv of phidag M phi with respect to "U"
@@ -0,0 +1,156 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_wilson_force.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+//Get the mu-directed links on the upper boundary and the bulk remainder
+template<typename Field>
+void getLinksBoundaryBulk(Field &bound, Field &bulk,  Field &from, const Coordinate &latt_size){
+  bound = Zero(); bulk = Zero();
+  for(int mu=0;mu<Nd;mu++){
+    LatticeInteger mucoor(bound.Grid());
+    LatticeCoordinate(mucoor, mu);
+
+    bound = where( mucoor == (Integer)(latt_size[mu] - 1), from, bound );
+    bulk = where( mucoor != (Integer)(latt_size[mu] - 1), from, bulk );
+  }
+}
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(&Grid);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> seeds({1,2,3,4});
+
+  GridParallelRNG          pRNG(&Grid);
+  GridSerialRNG            sRNG;
+  pRNG.SeedFixedIntegers(seeds);
+  sRNG.SeedFixedIntegers(seeds);
+
+  typedef PeriodicGimplR Gimpl;
+  typedef WilsonGaugeAction<Gimpl> GaugeAction;
+  typedef NoHirep Representation; //fundamental
+  typedef NoSmearing<Gimpl> Smearing;
+  typedef MinimumNorm2<Gimpl, Smearing> Omelyan;
+  typedef Gimpl::Field Field;
+  typedef MomentumFilterApplyPhase<Field> Filter;
+  Filter filter(&Grid);
+  
+  //Setup a filter that disables link update on links passing through the global lattice boundary
+  typedef Filter::LatticeLorentzScalarType MaskType;
+  typedef Filter::LorentzScalarType MaskSiteType;
+
+  MaskSiteType zero, one;
+  for(int mu=0;mu<Nd;mu++){
+    zero(mu)()() = 0.;
+    one(mu)()() = 1.;
+  }
+  MaskType zeroField(&Grid), oneField(&Grid);
+  zeroField = zero;
+  oneField = one;
+
+  
+  filter.phase = oneField; //make every site 1.0
+
+  //Zero mu-directed links at upper boundary
+  for(int mu=0;mu<Nd;mu++){
+    LatticeInteger mucoor(&Grid);
+    LatticeCoordinate(mucoor, mu);
+    
+    filter.phase = where( mucoor == (Integer)(latt_size[mu] - 1) , zeroField, filter.phase );
+  }
+
+  //Start with a random gauge field
+  Field U(&Grid);
+  SU<Nc>::HotConfiguration(pRNG,U);
+
+  //Get the original links on the bulk and boundary for later use
+  Field Ubnd_orig(&Grid), Ubulk_orig(&Grid);
+  getLinksBoundaryBulk(Ubnd_orig, Ubulk_orig, U, latt_size);
+
+  ActionSet<Field,Representation> actions(1);  
+  double beta=6;
+  GaugeAction gauge_action(beta);
+  actions[0].push_back(&gauge_action);
+
+  Smearing smear;
+  IntegratorParameters params(1,1.); //1 MD step
+  Omelyan integrator(&Grid, params, actions, smear);
+  
+  integrator.setMomentumFilter(filter);
+
+  integrator.refresh(U, sRNG, pRNG); //doesn't actually change the gauge field
+
+  //Check the momentum is zero on the boundary
+  const auto &P = integrator.getMomentum();
+  Field Pbnd(&Grid), Pbulk(&Grid);
+  getLinksBoundaryBulk(Pbnd, Pbulk, const_cast<Field&>(P), latt_size);
+
+  RealD Pbnd_nrm = norm2(Pbnd); //expect zero
+  std::cout << GridLogMessage << "After refresh, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl;
+  RealD Pbulk_nrm = norm2(Pbulk); //expect non-zero
+  std::cout << GridLogMessage << "After refresh, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl;
+
+  //Evolve the gauge field
+  integrator.integrate(U);
+
+  //Check momentum is still zero on boundary
+  getLinksBoundaryBulk(Pbnd, Pbulk, const_cast<Field&>(P), latt_size);
+  
+  Pbnd_nrm = norm2(Pbnd); //expect zero
+  std::cout << GridLogMessage << "After integrate, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl;
+  Pbulk_nrm = norm2(Pbulk); //expect non-zero
+  std::cout << GridLogMessage << "After integrate, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl;
+
+  //Get the new bulk and bound links
+  Field Ubnd_new(&Grid), Ubulk_new(&Grid);
+  getLinksBoundaryBulk(Ubnd_new, Ubulk_new, U, latt_size);
+
+  Field Ubnd_diff = Ubnd_new - Ubnd_orig;
+  Field Ubulk_diff = Ubulk_new - Ubulk_orig;
+
+  RealD Ubnd_change = norm2( Ubnd_diff );
+  RealD Ubulk_change = norm2( Ubulk_diff );
+  std::cout << GridLogMessage << "After integrate, norm2 of change in mu-directed boundary links is : " << Ubnd_change << " (expect 0)" << std::endl;
+  std::cout << GridLogMessage << "After integrate, norm2 of change in bulk links is : " << Ubulk_change << " (expect non-zero)" << std::endl;
+
+  Grid_finalize();
+}
Author	SHA1	Message	Date
portelli	d4290a7434	finer timers in Benchmark_IO	2021-06-17 11:57:02 +01:00
Peter Boyle and GitHub	92def28bd3	Update README.md	2021-06-06 04:52:05 -04:00
portelli	ca10bfa1c7	removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore)	2021-06-04 11:12:22 +01:00
Peter Boyle	0e27e3847d	Remove synch	2021-06-03 04:24:19 +00:00
u61464	8cfc7342cd	staggered hand unroll read coalesce	2021-05-05 14:17:18 -07:00
u61464	15ae317858	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-05-04 08:40:38 -07:00
u61464	834f536b5f	Fastest option on SyCL is now std::complex	2021-05-04 08:40:18 -07:00
Peter Boyle and GitHub	c332d9f08b	Merge pull request #356 from felixerben/bugfix/stoutSmearing Jamie's fix	2021-04-27 14:10:49 -04:00
ferben	cf2923d5dd	Jamie's fix	2021-04-27 16:53:37 +01:00
Peter Boyle and GitHub	0e4413ddde	Merge pull request #355 from felixerben/bugfix/stoutSmearing bugfix 3D stout smearing	2021-04-27 08:01:55 -04:00
ferben	009ccd581e	bugfix 3D stout smearing	2021-04-26 10:36:33 +01:00
Peter Boyle	8cd4263974	Tests compile	2021-04-25 22:20:37 -04:00
Peter Boyle	d45c868656	Change interface	2021-04-25 10:53:34 -04:00
Peter Boyle	955a8113de	Expose label only to reduce number of parameters	2021-04-25 10:36:38 -04:00
Peter Boyle	dbe210dd53	Open the ens_id	2021-04-25 10:25:59 -04:00
Peter Boyle	86e11743ca	set twists	2021-04-20 10:19:11 -04:00
Peter Boyle and GitHub	980e721f6e	Update MetaData.h	2021-04-13 09:33:01 -04:00
Peter Boyle and GitHub	e2a0142d87	Merge pull request #348 from AndrewYongZhenNing/develop Conserved Tadpole Implementation for Shamir Action Only	2021-04-06 10:49:00 -04:00
aznyong	895244ecc3	Merge with upstream; implemented conserved tadpole for Shamir action.	2021-04-06 13:46:33 +01:00
aznyong	addeb621a7	Implemented tadpole operator for Shamir action.	2021-04-06 13:45:37 +01:00
Peter Boyle	a7fb25adf6	Make Cshift fields static to avoid repeated reallocaate overhead	2021-03-29 21:44:14 +02:00
Peter Boyle	e947992957	Improved force terms	2021-03-29 20:04:06 +02:00
Peter Boyle	bb89a82a07	Staggered coalesced read	2021-03-29 20:01:15 +02:00
Peter Boyle	8bdadbadac	Cold start	2021-03-18 15:41:14 -04:00
Peter Boyle	15c50a7442	Explicit instantiate the template function	2021-03-18 15:40:42 -04:00
Peter Boyle	49b0af2c95	Update of tests to compile with the sRNG addition. Audited the code conventions (again) with the CPS momentum denominator and added anti periodic in time to the Test_mobius_force.cc and tested the Test_dwf_gpforce. Promoted these to test full HMC hamiltonian, tr P^2/2 + phidag MdagM phi with the same pdot and Udot as audited in the Integrator.h etc... With full comments and sources for factors.	2021-03-18 09:10:02 -04:00
Peter Boyle	9c2b37218a	sRNG parameter added	2021-03-18 06:24:11 -04:00
Peter Boyle	3c67d626ba	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-12 15:36:55 +01:00
Peter Boyle	51f506553c	Read out the local ID once, and store	2021-03-12 15:33:04 +01:00
Peter Boyle	226be84937	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-12 09:31:50 -05:00
Peter Boyle	001814b442	updated to do list. Start adding DDHMC work items	2021-03-12 09:31:17 -05:00
Peter Boyle	db3ac67506	Update thread issue	2021-03-12 14:55:07 +01:00
Peter Boyle	da91a884ef	NVCC versions found buggy added as guard	2021-03-11 23:54:53 +01:00
Peter Boyle	a71e6755e3	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-11 22:43:06 +01:00
Peter Boyle	cd5891eecd	Test that fails on Cuda 11.0	2021-03-11 22:34:28 +01:00
Peter BoyleandGitHub	5bb7336f27	Merge pull request #347 from pjgeorg/fix-autotools-avx512 Fix inconsistent SIMD option AVX512 Thanks	2021-03-11 16:29:07 -05:00
Peter Boyle	ce1fc1f48a	Possible fallback plan for Fionn's compiler bug in nvcc	2021-03-11 22:20:53 +01:00
Peter Georg	82402c6a7c	Add simd option SKL for ICC	2021-03-11 13:08:40 +01:00
Peter Georg	d9c4afe5b7	Fix inconsistent configure option AVX512 Before this change AVX512 enabled different instruction sets depending on the compiler: For Intel C++ Compiler Classic (ICC): AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL i.e. Intel Xeon Skylake and newer For Intel ICX, gcc, clang: AVX512F, AVX512CD, AVX512ER, AVX512PF i.e. Intel Xeon Phi x200/x205 (KNL/KNM) With this commit AVX512 now only enables the common instruction sets supported by all CPUs supporting any AVX-512 instructions set: AVX512F and AVX512CD (called COMMON-AVX512 by icc)	2021-03-11 12:58:49 +01:00
Peter Boyle	f786ff8d69	Extend test from Fionn, fails on A100 apparently	2021-03-10 14:32:06 -05:00
u61464	a651caed5f	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-10 06:23:51 -08:00
u61464	0e21adb3f6	Gives 200GF/s on SyCL/DG1 8^4, doesn't uglify develop for other platforms too badly. Easy to revert to clean more C++ stylistic code. There's a SYCL_HACK macro I will clean up later once dpcpp evolves a central nervous system.	2021-03-10 05:40:51 -08:00
Peter Boyle	58bf9b9e6d	Clean up test	2021-03-10 02:45:22 +01:00
Peter Boyle	2146eebb65	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-09 04:31:46 +01:00
Peter Boyle	6a429ee6d3	2d loop hits Nvidia 16bit limit on large local vols	2021-03-09 04:31:10 +01:00
Peter Boyle	4d1ea15c79	More verbosity. The 16bit limit on Grid.y, Grid.z is annoying	2021-03-09 04:29:37 +01:00
Peter BoyleandGitHub	a76cb005e0	Update Tensor_exp.h	2021-03-08 13:37:57 -05:00
Peter BoyleandGitHub	a9604367c1	Merge pull request #336 from lehner/feature/gpt Make ShmDims configurable; adjust GRID_MAX_SIMD to allow for 128 byte width on GPUs	2021-03-05 13:17:19 -05:00
Peter BoyleandGitHub	d7065023cc	Merge pull request #332 from mmphys/feature/mres_schur Optional changes to Test_cayley_mres e.g. Schur solver	2021-03-05 12:47:07 -05:00
Peter BoyleandGitHub	89d299ceec	Merge pull request #333 from mmphys/bugfix/LatTransfer Fix convertType for GPU in Lattice_transfer.h	2021-03-05 12:46:33 -05:00
Peter BoyleandGitHub	e34eda66df	Merge pull request #344 from felixerben/feature/XiToSigma Feature/xi to sigma	2021-03-05 12:45:44 -05:00
Christoph LehnerandGitHub	b24181aa4f	Update Coordinate.h Revert GRID_MAX_SIMD change	2021-03-05 16:56:58 +01:00
Peter BoyleandGitHub	aa173e2998	Update README.md	2021-03-05 10:25:33 -05:00
ferben	7a19432e0b	whitespace	2021-03-05 10:57:09 +00:00
ferben	9b15704290	tested and consistent	2021-03-05 10:42:32 +00:00
Michael Marshall	017f955b2d	Merge branch 'develop' into feature/mres_schur * develop: Pass serial RNG around Sycl happier	2021-03-04 20:42:02 +00:00
Michael Marshall	f252d69eef	Merge branch 'develop' into bugfix/LatTransfer * develop: Pass serial RNG around Sycl happier	2021-03-04 20:41:30 +00:00
ferben	3b06e4655e	Merge branch 'develop' into feature/XiToSigma	2021-03-04 20:06:16 +00:00
ferben	d4b4de8f42	changes	2021-03-04 20:01:24 +00:00
Peter Boyle	c90beee774	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-03-03 23:50:29 +01:00
Peter Boyle	1eea9d73b9	Pass serial RNG around	2021-03-03 23:50:01 +01:00
u61464	679d1d22f7	Sycl happier	2021-03-03 11:21:43 -08:00
Michael Marshall	b2b5e0b98c	Merge branch 'develop' into feature/mres_schur * develop: Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path. Better SIMD usage/coalescence	2021-03-03 16:15:12 +00:00
Michael Marshall	03e54722c1	Merge branch 'develop' into bugfix/LatTransfer * develop: Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.	2021-03-03 16:13:23 +00:00
Peter Boyle	442336bd96	Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.	2021-03-02 14:50:51 +01:00
Christoph LehnerandGitHub	9c9566b9c9	Merge pull request #23 from paboyle/develop Sync	2021-03-01 12:33:51 +01:00
Michael Marshall	1059a81a3c	Merge branch 'develop' into bugfix/LatTransfer * develop: Better SIMD usage/coalescence	2021-02-27 00:21:36 +00:00
Peter Boyle	2e61556389	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-02-26 17:52:20 +01:00
Peter Boyle	f9b1f240f6	Better SIMD usage/coalescence	2021-02-26 17:51:41 +01:00
Michael Marshall	69f41469dd	Merge branch 'develop' into bugfix/LatTransfer * develop: (26 commits) Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links Correct misleading ac help string Enable performance counting in WilsonFermion like in others changed back A2AUtils warning changed if and accelerator_for - no runtime errors any more Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons. Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error maxLocalNorm2() change back benchmark_ITT prettify Flop cout matches DiRAC-ITT-2020 revert changes merge develop fixes weird bug in 2pt function... revert changes final version, tested on CPU and GPU bugfix ...	2021-02-25 09:19:17 +00:00
Michael Marshall	d620b303ff	Merge branch 'develop' into feature/mres_schur * develop: (26 commits) Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links Correct misleading ac help string Enable performance counting in WilsonFermion like in others changed back A2AUtils warning changed if and accelerator_for - no runtime errors any more Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons. Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error maxLocalNorm2() change back benchmark_ITT prettify Flop cout matches DiRAC-ITT-2020 revert changes merge develop fixes weird bug in 2pt function... revert changes final version, tested on CPU and GPU bugfix ...	2021-02-24 18:07:27 +00:00
Peter BoyleandGitHub	157fd1428d	Merge pull request #342 from paboyle/feature/link-update-mask Feature/link update mask	2021-02-24 11:29:52 -05:00
Christopher Kelly	c791cb2214	Merge branch 'develop' into feature/link-update-mask	2021-02-23 11:51:54 -05:00
Christopher Kelly	d5ab571a89	Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links	2021-02-23 11:49:56 -05:00
ferben	0ed800f6e4	merge develop	2021-02-23 14:54:46 +00:00
Peter BoyleandGitHub	0a32183825	Merge pull request #335 from felixerben/gpu/baryons Gpu/baryons	2021-02-23 09:30:16 -05:00
Peter BoyleandGitHub	2cacfbde2a	Merge pull request #341 from DanielRichtmann/fix/minor-things Minor fixes	2021-02-22 09:28:50 -05:00
Daniel Richtmann	c073e62e0b	Correct misleading ac help string	2021-02-22 15:25:44 +01:00
Daniel Richtmann	e3d019bc2f	Enable performance counting in WilsonFermion like in others	2021-02-22 15:25:40 +01:00
ferben	7ae030f585	changed back A2AUtils warning	2021-02-18 13:24:50 +00:00
ferben	86b58d5aff	changed if and accelerator_for - no runtime errors any more	2021-02-18 12:04:32 +00:00
Peter BoyleandGitHub	26e8b9f4a5	Merge pull request #340 from mmphys/bugfix/config Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu	2021-02-17 11:56:21 -05:00
Michael Marshall	35114c9e62	Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu	2021-02-17 13:24:15 +00:00
Peter BoyleandGitHub	dfd28a85c9	Merge pull request #339 from mmphys/bugfix/config Optional rename PACKAGE_ to GRID_ in Grid/Config.h	2021-02-15 13:53:26 -05:00
Michael Marshall	a503332924	Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.	2021-02-14 21:27:54 +00:00
Peter BoyleandGitHub	1ac13ec3a7	Merge pull request #338 from paboyle/bugfix/maxnorm2 Fixed compile issues with maxLocalNorm2 for non-scalar lattices	2021-02-08 12:08:11 -05:00
Christopher Kelly	55de69a569	Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field	2021-02-08 12:03:16 -05:00
Peter Boyle	eda9ab487b	MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error	2021-02-08 10:47:22 -05:00
Peter Boyle	cd99edcc5f	maxLocalNorm2()	2021-02-04 18:25:49 -05:00
Christoph Lehner	4705aa541d	Allow user to configure ShmDims via environment variables	2021-02-04 14:25:55 +01:00
Michael Marshall	3215d88a91	Simplify syntax with Grid::EnableIf post code review. Updated EnableIf so that ReturnType defaults to void in same way as std::enable_if see https://en.cppreference.com/w/cpp/types/enable_if	2021-02-03 15:17:03 +00:00
ferben	9b9a53f870	...	2021-02-02 13:06:43 +00:00
Christoph Lehner	019ffe17d4	Allow for GPU vector width beyond 64	2021-02-02 11:32:23 +01:00
ferben	bc496dd844	change back benchmark_ITT	2021-01-28 14:29:56 +00:00
ferben	a673b6a54d	prettify	2021-01-28 14:15:09 +00:00
ferben	1bf2e4d187	Merge branch 'develop' into gpu/baryons	2021-01-27 21:17:37 +00:00
Peter Boyleandferben	96dd7a8fbd	Flop cout matches DiRAC-ITT-2020	2021-01-27 21:14:52 +00:00
ferben	7905afa9f5	revert changes	2021-01-27 21:14:52 +00:00
ferben	712bb40650	merge develop	2021-01-27 21:14:52 +00:00
ferben	81d88d9f4d	fixes	2021-01-27 21:09:51 +00:00
Michael Marshall	77063418da	Fix issue for GPU by ensuring accelerator_inline version of convertType is available for Grid::complex<T>. This removes many warnings in Hadrons Simplify the SFINAE syntax and correct convertType for iScalar	2021-01-25 15:09:36 +00:00
Michael Marshall	2983b6fdf6	Optional (superficial) changes to make comparison with Hadrons WardIdentity module easier: use Schur solver; example of Hadrons random gauge init; logging updates; only solve reverse propagator if provided	2021-01-23 12:41:48 +00:00
Peter Boyle	69f1f04f74	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2021-01-21 21:39:59 -05:00
Peter Boyle	11a5fd09d6	Hot config	2021-01-21 21:39:41 -05:00
Peter Boyle	ff1fa98808	Fix for GPU conserved current	2021-01-21 21:38:23 -05:00
ferben	df16202865	weird bug in 2pt function...	2021-01-19 19:25:27 +00:00
ferben	3ff7c2c02a	Merge branch 'develop' into gpu/baryons	2021-01-19 12:34:13 +00:00
ferben	fc6d07897f	revert changes	2021-01-19 12:32:48 +00:00
ferben	f9c8e5c8ef	Merge branch 'develop' of github.com:paboyle/Grid into develop	2021-01-19 12:30:29 +00:00
ferben	8bfa0e74f8	final version, tested on CPU and GPU	2021-01-19 12:27:57 +00:00
ferben	9b73a937e7	bugfix	2021-01-18 18:57:05 +00:00
Peter Boyle	b0339bc5a4	Merge branch 'feature/conjugate-bc-dirs' into develop	2021-01-15 09:28:39 -05:00
ferben	fa12b9a329	bugfix	2021-01-13 10:04:17 +00:00
ferben	45fc7ded3a	test for sum	2021-01-12 09:10:37 +00:00
ferben	74de2d9742	whitespace changes	2021-01-08 18:28:36 +00:00
ferben	e759367d42	tested and working	2021-01-08 18:04:50 +00:00
Christoph LehnerandGitHub	299d0de066	Merge pull request #21 from paboyle/develop Sync	2020-12-22 20:59:15 +01:00
Christoph LehnerandGitHub	b4c1317ab4	Merge pull request #22 from DanielRichtmann/feature/clover-access-specifier Clover access specifier	2020-12-18 16:20:19 +01:00
ferben	f36d6f3923	compiles on GPU. 3pt still wrong!!!!	2020-12-17 17:04:08 +00:00
ferben	808f1e0e8c	merge develop	2020-12-15 16:33:29 +00:00
Daniel Richtmann	c438118fd7	Change access specifier of clover fields in order to allow deriving classes to access these	2020-12-08 14:42:11 +01:00
Christoph LehnerandGitHub	17ec9c5545	Merge pull request #20 from paboyle/develop Sync	2020-11-24 12:20:43 +01:00
ferben	3594ce877b	speedup in Sigma-to-nucleon	2020-11-03 20:04:30 +00:00
ferben	9bae6b889a	speedup in Sigma-to-nucleon	2020-11-03 20:03:09 +00:00
ferben	4014dfd5b9	first tested version	2020-11-03 16:13:08 +00:00
ferben	67023c334b	bugfix	2020-11-03 13:07:37 +00:00
ferben	a3de7026c8	bugfix	2020-11-03 12:51:50 +00:00
ferben	ee11678b1f	added Xi-to-Sigma rare decays	2020-11-03 12:41:35 +00:00