From 9296299b61508e8fb4a44fa32f656b7ab1857d9e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 4 Oct 2022 11:10:34 -0700
Subject: [PATCH 01/23] Better commenting

---
 Grid/qcd/action/fermion/WilsonCompressor.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index e0e08c1c..5c3351a6 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -117,19 +117,19 @@ public:
     typedef decltype(coalescedRead(*in))    sobj;
     typedef decltype(coalescedRead(*out0)) hsobj;
 
-    unsigned int Nsimd = vobj::Nsimd();
+    constexpr unsigned int Nsimd = vobj::Nsimd();
     unsigned int mask = Nsimd >> (type + 1);
     int lane = acceleratorSIMTlane(Nsimd);
     int j0 = lane &(~mask); // inner coor zero
     int j1 = lane |(mask) ; // inner coor one
-    const vobj *vp0 = &in[k];
-    const vobj *vp1 = &in[m];
-    const vobj *vp = (lane&mask) ? vp1:vp0;
-    auto sa = coalescedRead(*vp,j0);
-    auto sb = coalescedRead(*vp,j1);
+    const vobj *vp0 = &in[k];  // out0[j] = merge low bit of type from in[k] and in[m] 
+    const vobj *vp1 = &in[m];  // out1[j] = merge hi  bit of type from in[k] and in[m]
+    const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
+    auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
+    auto sb = coalescedRead(*vp,j1); // lane to read for out 1
     hsobj psa, psb;
-    projector::Proj(psa,sa,mu,dag);
-    projector::Proj(psb,sb,mu,dag);
+    projector::Proj(psa,sa,mu,dag);  // spin project the result0
+    projector::Proj(psb,sb,mu,dag);  // spin project the result1
     coalescedWrite(out0[j],psa);
     coalescedWrite(out1[j],psb);
 #else

From e1e5c7502308e61b3649c87df56e1f3f31c7b90f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 4 Oct 2022 11:11:10 -0700
Subject: [PATCH 02/23] Stencil gather improvements - SVM was running slow and
 used for a pointer array that wasn't needed to be in SVM

---
 Grid/stencil/Stencil.h | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index c2bc8dab..65d878cb 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
 ///////////////////////////////////////////////////////////////////
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-				 commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
+				 commVector<cobj *> pointers,
+				 int dimension,int plane,
+				 int cbmask,compressor &compress,int type) __attribute__((noinline));
 
 template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				 Vector<cobj *> pointers,int dimension,int plane,int cbmask,
+void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
+				 const Lattice<vobj> &rhs,
+				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
 				 compressor &compress,int type)
 {
   assert( (table.size()&0x1)==0);
@@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
 
   auto rhs_v = rhs.View(AcceleratorRead);
+  auto rhs_p = &rhs_v[0];
   auto p0=&pointers[0][0];
   auto p1=&pointers[1][0];
   auto tp=&table[0];
   accelerator_forNB(j, num, vobj::Nsimd(), {
-      compress.CompressExchange(p0,p1, &rhs_v[0], j,
-			      so+tp[2*j  ].second,
-			      so+tp[2*j+1].second,
-			      type);
+      compress.CompressExchange(p0,p1, rhs_p, j,
+				so+tp[2*j  ].second,
+				so+tp[2*j+1].second,
+				type);
   });
   rhs_v.ViewClose();
 }
@@ -230,8 +234,8 @@ public:
   };
   struct Merge {
     cobj * mpointer;
-    Vector<scalar_object *> rpointers;
-    Vector<cobj *> vpointers;
+    //    std::vector<scalar_object *> rpointers;
+    std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
   };
@@ -406,6 +410,7 @@ public:
       comms_bytes+=bytes;
       shm_bytes  +=2*Packets[i].bytes-bytes;
     }
+    _grid->StencilBarrier();// Synch shared memory on a single nodes
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@@ -420,7 +425,7 @@ public:
   ////////////////////////////////////////////////////////////////////////
   void Communicate(void)
   {
-    if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
+    if ( 0 ){
       thread_region {
 	// must be called in parallel region
 	int mythread  = thread_num();
@@ -569,7 +574,7 @@ public:
     d.buffer_size = buffer_size;
     dv.push_back(d);
   }
-  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
+  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
     m.type     = type;
     m.mpointer = merge_p;
@@ -582,6 +587,7 @@ public:
   }
   template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
     mpi3synctime-=usecond();
+    accelerator_barrier();
     _grid->StencilBarrier();// Synch shared memory on a single nodes
     mpi3synctime+=usecond();
     shmmergetime-=usecond();
@@ -1114,8 +1120,8 @@ public:
     int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
     assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
 
-    Vector<cobj *> rpointers(maxl);
-    Vector<cobj *> spointers(maxl);
+    std::vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> spointers(maxl);
 
     ///////////////////////////////////////////
     // Work out what to send where

From 03508448f88c08cfa146e5b0082b930d5de503d7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 4 Oct 2022 11:12:15 -0700
Subject: [PATCH 03/23] Remove verbose

---
 Grid/threads/Accelerator.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index d2cf8a01..e17e85d1 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -298,14 +298,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      });	   			              \
     });
 
-#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n");  theGridAccelerator->wait(); }
+#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 
-inline void acceleratorCopySynchronise(void) {  printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}

From 413312f9a9a98634363f5c5206be19e97ec55fca Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 4 Oct 2022 11:12:59 -0700
Subject: [PATCH 04/23] Benchmark the halo construction. THe bye counts are out
 and should be doubled for SIMD directions

---
 benchmarks/Benchmark_halo.cc | 131 +++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 benchmarks/Benchmark_halo.cc

diff --git a/benchmarks/Benchmark_halo.cc b/benchmarks/Benchmark_halo.cc
new file mode 100644
index 00000000..43138e67
--- /dev/null
+++ b/benchmarks/Benchmark_halo.cc
@@ -0,0 +1,131 @@
+ /*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+    Source file: ./benchmarks/Benchmark_dwf.cc
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::Algebra Gmu [] = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt4= GridDefaultLatt();
+  Coordinate mpi  = GridDefaultMpi();
+  Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd());
+
+  GridLogLayout();
+
+  int Ls=16;
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+  
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi);
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  LatticeFermionF src   (FGrid); random(RNG5,src);
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  LatticeGaugeFieldF Umu(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  const int ncall = 500;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::HaloGatherOpt         "<<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  {
+    typename DomainWallFermionF::Compressor compressor(0);
+    FGrid->Barrier();
+    Dw.Stencil.HaloExchangeOptGather(src,compressor);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Stencil.HaloExchangeOptGather(src,compressor);
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double bytes=0.0;
+    if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3];
+    if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3];
+    if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3];
+    if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2];
+    bytes = bytes * Ls * 8.* (24.+12.)* 2.0;
+
+    std::cout<<GridLogMessage << "Gather us /call =   "<< (t1-t0)/ncall<<std::endl;
+    std::cout<<GridLogMessage << "Gather MBs /call =   "<< bytes*ncall/(t1-t0)<<std::endl;
+
+  }
+
+  Grid_finalize();
+  exit(0);
+}

From 0d5639f7075902182c59b9458a0e242f7ad3b7eb Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 4 Oct 2022 11:13:41 -0700
Subject: [PATCH 05/23] Run script update

---
 systems/PVC/benchmarks/run-2tile-mpi.sh | 11 +++++++++--
 systems/PVC/benchmarks/wrap.sh          |  4 ++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh
index 9db0b66b..34deac2c 100755
--- a/systems/PVC/benchmarks/run-2tile-mpi.sh
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 ##SBATCH -p PVC-SPR-QZEH 
 ##SBATCH -p PVC-ICX-QZNW
+
 #SBATCH -p QZ1J-ICX-PVC
 
 source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
@@ -19,8 +20,14 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
 
-mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > dw.2tile.1x2.log
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > dw.2tile.2x1.log
+
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log
 
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log
 
diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh
index a352fff9..bb7b517d 100755
--- a/systems/PVC/benchmarks/wrap.sh
+++ b/systems/PVC/benchmarks/wrap.sh
@@ -7,8 +7,8 @@ echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
 
 if [ $MPI_LOCALRANKID = "0" ] 
 then
-#  ~psteinbr/build_pti/ze_tracer -c $@
-  onetrace --chrome-kernel-timeline $@
+#  ~psteinbr/build_pti/ze_tracer -h $@
+  onetrace --chrome-device-timeline $@
 else
   $@
 fi

From 9e4835a3e3769e3b310941b26a58e7a239e4fb43 Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Tue, 25 Oct 2022 15:19:43 +0100
Subject: [PATCH 06/23] feat: changed CompactWilsonExpClover exponentiation to
 Taylor expansion with Horner scheme.

---
 Grid/qcd/action/fermion/CloverHelpers.h       | 203 ++++--------------
 ...CompactWilsonCloverFermionImplementation.h |  11 +-
 2 files changed, 50 insertions(+), 164 deletions(-)

diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h
index 57e71998..c1df81fb 100644
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -242,6 +242,19 @@ public:
     // mass term is multiplied to exp(Clover) below
   }
 
+  // Can this be avoided?
+  static void IdentityTimesC(const CloverField& in, RealD c) {
+    int DimRep = Impl::Dimension;
+
+    autoView(in_v, in, AcceleratorWrite);
+
+    accelerator_for(ss, in.Grid()->oSites(), 1, {
+      for (int sa=0; sa<Ns; sa++)
+        for (int ca=0; ca<DimRep; ca++)
+          in_v[ss]()(sa,sa)(ca,ca) = c;
+    });
+  }
+
   static int getNMAX(RealD prec, RealD R) {
     /* compute stop condition for exponential */
     int NMAX=1;
@@ -254,176 +267,46 @@ public:
     return NMAX;
   }
 
-  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
-  static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
+  static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
+  static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
 
-  static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
+  static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
 
-  	  typedef iMatrix<ComplexD,6> mat;
+    GridBase* grid = Clover.Grid();
+    CloverField ExpClover(grid);
 
-  	  RealD qn[6];
-  	  RealD qnold[6];
-  	  RealD p[5];
-  	  RealD trA2, trA3, trA4;
+    int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
 
-  	  mat A2, A3, A4, A5;
-  	  A2 = alpha * alpha * arg * arg;
-  	  A3 = alpha * arg * A2;
-  	  A4 = A2 * A2;
-  	  A5 = A2 * A3;
+    Clover *= (1.0/diag_mass);
 
-  	  trA2 = toReal( trace(A2) );
-  	  trA3 = toReal( trace(A3) );
-  	  trA4 = toReal( trace(A4));
-
-  	  p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
-  	  p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
-  	  p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
-  	  p[3] = trA3 / 3.0;
-  	  p[4] = 0.5 * trA2;
-
-  	  qnold[0] = cN[Niter];
-  	  qnold[1] = 0.0;
-  	  qnold[2] = 0.0;
-  	  qnold[3] = 0.0;
-  	  qnold[4] = 0.0;
-  	  qnold[5] = 0.0;
-
-  	  for(int i = Niter-1; i >= 0; i--)
-  	  {
-  	   qn[0] = p[0] * qnold[5] + cN[i];
-  	   qn[1] = p[1] * qnold[5] + qnold[0];
-  	   qn[2] = p[2] * qnold[5] + qnold[1];
-  	   qn[3] = p[3] * qnold[5] + qnold[2];
-  	   qn[4] = p[4] * qnold[5] + qnold[3];
-  	   qn[5] = qnold[4];
-
-  	   qnold[0] = qn[0];
-  	   qnold[1] = qn[1];
-  	   qnold[2] = qn[2];
-  	   qnold[3] = qn[3];
-  	   qnold[4] = qn[4];
-  	   qnold[5] = qn[5];
-  	  }
-
-  	  mat unit(1.0);
-
-  	  dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
-
-    }
-
-  static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
-
-    GridBase* grid = Diagonal.Grid();
-    int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
-
-    //
-    // Implementation completely in Daniel's layout
-    //
-
-    // Taylor expansion with Cayley-Hamilton recursion
-    // underlying Horner scheme as above
+    // Taylor expansion, slow but generic
+    // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
+    // qN = cN
+    // qn = cn + qn+1 X
     std::vector<RealD> cn(NMAX+1);
     cn[0] = 1.0;
-    for (int i=1; i<=NMAX; i++){
+    for (int i=1; i<=NMAX; i++)
       cn[i] = cn[i-1] / RealD(i);
+
+    ExpClover = Zero();
+    IdentityTimesC(ExpClover, cn[NMAX]);
+    for (int i=NMAX-1; i>=0; i--)
+      ExpClover = ExpClover * Clover + cn[i];
+
+    // prepare inverse
+    CloverInv = (-1.0)*Clover;
+
+    Clover = ExpClover * diag_mass;
+
+    ExpClover = Zero();
+    IdentityTimesC(ExpClover, cn[NMAX]);
+    for (int i=NMAX-1; i>=0; i--)
+      ExpClover = ExpClover * CloverInv + cn[i];
+
+    CloverInv = ExpClover * (1.0/diag_mass);
+
     }
 
-      // Taken over from Daniel's implementation
-      conformable(Diagonal, Triangle);
-
-      long lsites = grid->lSites();
-    {
-      typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
-      typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
-      typedef iMatrix<ComplexD,6> mat;
-
-      autoView(diagonal_v,  Diagonal,  CpuRead);
-      autoView(triangle_v,  Triangle,  CpuRead);
-      autoView(diagonalExp_v, Diagonal, CpuWrite);
-      autoView(triangleExp_v, Triangle, CpuWrite);
-
-      thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
-
-    	  mat srcCloverOpUL(0.0); // upper left block
-    	  mat srcCloverOpLR(0.0); // lower right block
-    	  mat ExpCloverOp;
-
-        scalar_object_diagonal diagonal_tmp     = Zero();
-        scalar_object_diagonal diagonal_exp_tmp = Zero();
-        scalar_object_triangle triangle_tmp     = Zero();
-        scalar_object_triangle triangle_exp_tmp = Zero();
-
-        Coordinate lcoor;
-        grid->LocalIndexToLocalCoor(site, lcoor);
-
-        peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
-        peekLocalSite(triangle_tmp, triangle_v, lcoor);
-
-        int block;
-        block = 0;
-        for(int i = 0; i < 6; i++){
-        	for(int j = 0; j < 6; j++){
-        		if (i == j){
-        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
-        		}
-        		else{
-        			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
-        		}
-        	}
-        }
-        block = 1;
-        for(int i = 0; i < 6; i++){
-          	for(int j = 0; j < 6; j++){
-           		if (i == j){
-           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
-           		}
-           		else{
-           			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
-           		}
-            }
-        }
-
-        // exp(Clover)
-
-        ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
-
-        block = 0;
-        for(int i = 0; i < 6; i++){
-        	for(int j = 0; j < 6; j++){
-            	if (i == j){
-            		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
-            	}
-            	else if(i < j){
-            		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
-            	}
-           	}
-        }
-
-        ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
-
-        block = 1;
-        for(int i = 0; i < 6; i++){
-        	for(int j = 0; j < 6; j++){
-              	if (i == j){
-              		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
-               	}
-               	else if(i < j){
-               		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
-               	}
-            }
-        }
-
-        pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
-        pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
-      });
-    }
-
-    Diagonal *= diag_mass;
-    Triangle *= diag_mass;
-  }
-
-
   static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
     assert(0);
     return lambda;
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index e4864730..e042b985 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   GridBase* grid = _Umu.Grid();
   typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
   CloverField TmpOriginal(grid);
+  CloverField TmpInverse(grid);
 
   // Compute the field strength terms mu>nu
   double t2 = usecond();
@@ -326,14 +327,14 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
   // Handle mass term based on clover policy
   CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
-  
+
   // Convert the data layout of the clover term
   double t4 = usecond();
-  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
+  CloverHelpers::Exponentiate_Clover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
 
   // Exponentiate the clover (nothing happens in case of the standard clover)
   double t5 = usecond();
-  CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
+  CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
 
   // Possible modify the boundary values
   double t6 = usecond();
@@ -341,7 +342,9 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
 
   // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
   double t7 = usecond();
-  CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
+  //CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
+  CompactHelpers::ConvertLayout(TmpInverse, DiagonalInv, TriangleInv);
+  //if(open_boundaries) handle differently!
 
   // Fill the remaining clover fields
   double t8 = usecond();

From 513d797ea63245fdffdafd5ea1ea66bb2e357168 Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Tue, 25 Oct 2022 16:17:22 +0100
Subject: [PATCH 07/23] fix: signature of
 CompactWilsonCloverHelpers::Exponentiate fixed.

---
 Grid/qcd/action/fermion/CloverHelpers.h                      | 5 +----
 .../CompactWilsonCloverFermionImplementation.h               | 4 ++--
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h
index c1df81fb..c2bb0d8f 100644
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -208,10 +208,7 @@ public:
     Clover += diag_mass;
   }
 
-  static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
-                          CloverTriangleField& Triangle,
-                          RealD csw_t, RealD diag_mass) {
-
+  static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
     // Do nothing
   }
 
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index e042b985..e45a531f 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -365,8 +365,8 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
   std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
   std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "exponentiation =             " << (t5 - t4) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "convert =                    " << (t6 - t5) / 1e6 << std::endl;
   std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
   std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
   std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;

From b36442e263e117ad771b4416dac469dcb6aba2fa Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Tue, 25 Oct 2022 16:57:01 +0100
Subject: [PATCH 08/23] feat: CloverHelpers::InvertClover implemented which
 handles the inversion of the Clover term depending on clover type and the
 boundary conditions.

---
 Grid/qcd/action/fermion/CloverHelpers.h       | 31 +++++++++++++++++--
 ...CompactWilsonCloverFermionImplementation.h |  6 ++--
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h
index c2bb0d8f..e8c45888 100644
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -208,10 +208,20 @@ public:
     Clover += diag_mass;
   }
 
-  static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
+  static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
     // Do nothing
   }
 
+  static void InvertClover(CloverField& InvClover,
+                            const CloverDiagonalField& diagonal,
+                            const CloverTriangleField& triangle,
+                            CloverDiagonalField&       diagonalInv,
+                            CloverTriangleField&       triangleInv,
+                            bool open_boundaries) {
+
+    CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
+  }
+
   // TODO: implement Cmunu for better performances with compact layout, but don't do it
   // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
   static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
@@ -267,7 +277,7 @@ public:
   static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
   static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
 
-  static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
+  static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
 
     GridBase* grid = Clover.Grid();
     CloverField ExpClover(grid);
@@ -302,7 +312,24 @@ public:
 
     CloverInv = ExpClover * (1.0/diag_mass);
 
+  }
+
+  static void InvertClover(CloverField& InvClover,
+                            const CloverDiagonalField& diagonal,
+                            const CloverTriangleField& triangle,
+                            CloverDiagonalField&       diagonalInv,
+                            CloverTriangleField&       triangleInv,
+                            bool open_boundaries) {
+
+    if (open_boundaries)
+    {
+      CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
     }
+    else
+    {
+      CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
+    }
+  }
 
   static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
     assert(0);
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index e45a531f..6ee22113 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -330,7 +330,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
 
   // Convert the data layout of the clover term
   double t4 = usecond();
-  CloverHelpers::Exponentiate_Clover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
+  CloverHelpers::ExponentiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
 
   // Exponentiate the clover (nothing happens in case of the standard clover)
   double t5 = usecond();
@@ -342,9 +342,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
 
   // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
   double t7 = usecond();
-  //CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
-  CompactHelpers::ConvertLayout(TmpInverse, DiagonalInv, TriangleInv);
-  //if(open_boundaries) handle differently!
+  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, open_boundaries);
 
   // Fill the remaining clover fields
   double t8 = usecond();

From 86075fdd45a1f88108615a6dada2df2bee5af8ad Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Tue, 25 Oct 2022 17:05:34 +0100
Subject: [PATCH 09/23] feat: MassTerm and ExponentiateClover merged into
 InstantiateClover

---
 Grid/qcd/action/fermion/CloverHelpers.h             | 13 ++-----------
 .../CompactWilsonCloverFermionImplementation.h      | 12 ++++++------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h
index e8c45888..b8b4fa59 100644
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -204,14 +204,10 @@ public:
   typedef WilsonCloverHelpers<Impl> Helpers;
   typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
 
-  static void MassTerm(CloverField& Clover, RealD diag_mass) {
+  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
     Clover += diag_mass;
   }
 
-  static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
-    // Do nothing
-  }
-
   static void InvertClover(CloverField& InvClover,
                             const CloverDiagonalField& diagonal,
                             const CloverTriangleField& triangle,
@@ -244,11 +240,6 @@ public:
   template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
   typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
 
-  static void MassTerm(CloverField& Clover, RealD diag_mass) {
-    // do nothing!
-    // mass term is multiplied to exp(Clover) below
-  }
-
   // Can this be avoided?
   static void IdentityTimesC(const CloverField& in, RealD c) {
     int DimRep = Impl::Dimension;
@@ -277,7 +268,7 @@ public:
   static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
   static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
 
-  static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
+  static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
 
     GridBase* grid = Clover.Grid();
     CloverField ExpClover(grid);
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index 6ee22113..0828f5fc 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -325,14 +325,14 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
   TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
   TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
-  // Handle mass term based on clover policy
-  CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
+
+  // Instantiate the clover term
+  // - In case of the standard clover the mass term is added
+  // - In case of the exponential clover the clover term is exponentiated
+  double t4 = usecond();
+  CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
 
   // Convert the data layout of the clover term
-  double t4 = usecond();
-  CloverHelpers::ExponentiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
-
-  // Exponentiate the clover (nothing happens in case of the standard clover)
   double t5 = usecond();
   CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
 

From 9317d893b201a54c7dfa411fd0aaf6a32aa6f61b Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Tue, 25 Oct 2022 17:10:06 +0100
Subject: [PATCH 10/23] docs: details about inversion of CompactClover term
 added.

---
 .../CompactWilsonCloverFermionImplementation.h               | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index 0828f5fc..40bdfb43 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -340,7 +340,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   double t6 = usecond();
   if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
 
-  // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
+  // Invert the Clover term
+  // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
+  // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
+  // TODO: For now this inversion is explictly done on the CPU
   double t7 = usecond();
   CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, open_boundaries);
 

From a2a879b668d008ad5f70f77e7160c8e855c444a0 Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Tue, 25 Oct 2022 17:20:42 +0100
Subject: [PATCH 11/23] docs: CompactClover Debug Info improved.

---
 .../CompactWilsonCloverFermionImplementation.h         | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index 40bdfb43..16f89725 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -336,7 +336,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   double t5 = usecond();
   CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
 
-  // Possible modify the boundary values
+  // Modify the clover term at the temporal boundaries in case of open boundary conditions
   double t6 = usecond();
   if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
 
@@ -366,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
   std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
   std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "exponentiation =             " << (t5 - t4) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "convert =                    " << (t6 - t5) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "exponentiate clover =        " << (t5 - t4) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;
   std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl;
   std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl;
 }

From 5fa6a8b96d6d7d82dfb2ab2c0ebfb844871bcc6b Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Wed, 26 Oct 2022 12:40:28 +0100
Subject: [PATCH 12/23] docs: CompactClover debug info generalized.

---
 .../implementation/CompactWilsonCloverFermionImplementation.h   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index 16f89725..c8d1ca30 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -366,7 +366,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
   std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl;
   std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl;
   std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl;
-  std::cout << GridLogDebug << "exponentiate clover =        " << (t5 - t4) / 1e6 << std::endl;
+  std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl;
   std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl;
   std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl;
   std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl;

From 184adeedb83c716b879d2026d1af902aa2457d02 Mon Sep 17 00:00:00 2001
From: Fabian Joswig <fabian.joswig@ed.ac.uk>
Date: Wed, 26 Oct 2022 12:53:46 +0100
Subject: [PATCH 13/23] feat: renamed open_boundaries to fixedBoundaries

---
 Grid/qcd/action/fermion/CloverHelpers.h       |  6 ++--
 .../fermion/CompactWilsonCloverFermion.h      |  2 +-
 ...CompactWilsonCloverFermionImplementation.h | 32 +++++++++----------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h
index b8b4fa59..cd469ea7 100644
--- a/Grid/qcd/action/fermion/CloverHelpers.h
+++ b/Grid/qcd/action/fermion/CloverHelpers.h
@@ -213,7 +213,7 @@ public:
                             const CloverTriangleField& triangle,
                             CloverDiagonalField&       diagonalInv,
                             CloverTriangleField&       triangleInv,
-                            bool open_boundaries) {
+                            bool fixedBoundaries) {
 
     CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
   }
@@ -310,9 +310,9 @@ public:
                             const CloverTriangleField& triangle,
                             CloverDiagonalField&       diagonalInv,
                             CloverTriangleField&       triangleInv,
-                            bool open_boundaries) {
+                            bool fixedBoundaries) {
 
-    if (open_boundaries)
+    if (fixedBoundaries)
     {
       CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
     }
diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
index 249b20bd..d79b34d4 100644
--- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h
@@ -225,7 +225,7 @@ public:
   RealD csw_t;
   RealD cF;
 
-  bool open_boundaries;
+  bool fixedBoundaries;
 
   CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd;
   CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
index c8d1ca30..7e3b7f00 100644
--- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h
@@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
   , csw_r(_csw_r)
   , csw_t(_csw_t)
   , cF(_cF)
-  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
   , Diagonal(&Fgrid),        Triangle(&Fgrid)
   , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid)
   , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid)
@@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
     csw_r /= clover_anisotropy.xi_0;
 
   ImportGauge(_Umu);
-  if (open_boundaries) {
+  if (fixedBoundaries) {
     this->BoundaryMaskEven.Checkerboard() = Even;
     this->BoundaryMaskOdd.Checkerboard() = Odd;
     CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
@@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
   WilsonBase::Dhop(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
   WilsonBase::DhopOE(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
   WilsonBase::DhopEO(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
   WilsonBase::DhopDir(in, out, dir, disp);
-  if(this->open_boundaries) ApplyBoundaryMask(out);
+  if(this->fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
   WilsonBase::DhopDirAll(in, out);
-  if(this->open_boundaries) {
+  if(this->fixedBoundaries) {
     for(auto& o : out) ApplyBoundaryMask(o);
   }
 }
@@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
   WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
   Mooee(in, Tmp);
   axpy(out, 1.0, out, Tmp);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
@@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
   WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc
   MooeeDag(in, Tmp);
   axpy(out, 1.0, out, Tmp);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
   WilsonBase::Meooe(in, out);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
   WilsonBase::MeooeDag(in, out);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
@@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
   } else {
     MooeeInternal(in, out, Diagonal, Triangle);
   }
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
@@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
   } else {
     MooeeInternal(in, out, DiagonalInv, TriangleInv);
   }
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 
 template<class Impl, class CloverHelpers>
@@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
 
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
-  assert(!open_boundaries); // TODO check for changes required for open bc
+  assert(!fixedBoundaries); // TODO check for changes required for open bc
 
   // NOTE: code copied from original clover term
   conformable(X.Grid(), Y.Grid());
@@ -338,14 +338,14 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
 
   // Modify the clover term at the temporal boundaries in case of open boundary conditions
   double t6 = usecond();
-  if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
+  if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
 
   // Invert the Clover term
   // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
   // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
   // TODO: For now this inversion is explictly done on the CPU
   double t7 = usecond();
-  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, open_boundaries);
+  CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
 
   // Fill the remaining clover fields
   double t8 = usecond();

From 82e959f66cac74c3863eaa0b42f3c8b42ae874d8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 8 Nov 2022 12:45:25 -0800
Subject: [PATCH 14/23] SYCL reduction

---
 Grid/lattice/Lattice_reduction.h | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 0ddac437..fb6a258c 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
+#if defined(GRID_SYCL)
+#include <Grid/lattice/Lattice_reduction_sycl.h>
+#endif
 
 NAMESPACE_BEGIN(Grid);
 
@@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   return sum_gpu(arg,osites);
 #else
   return sum_cpu(arg,osites);
@@ -136,7 +139,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   return sumD_gpu(arg,osites);
 #else
   return sumD_cpu(arg,osites);
@@ -145,7 +148,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   return sumD_gpu_large(arg,osites);
 #else
   return sumD_cpu(arg,osites);
@@ -155,13 +158,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
-  autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+  typename vobj::scalar_object ssum;
+  autoView( arg_v, arg, AcceleratorRead);
+  ssum= sum_gpu(&arg_v[0],osites);
 #else
   autoView(arg_v, arg, CpuRead);
-  Integer osites = arg.Grid()->oSites();
   auto ssum= sum_cpu(&arg_v[0],osites);
 #endif  
   arg.Grid()->GlobalSum(ssum);
@@ -171,7 +174,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
   auto ssum= sum_gpu_large(&arg_v[0],osites);
@@ -235,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   typedef decltype(innerProductD(vobj(),vobj())) inner_t;
   Vector<inner_t> inner_tmp(sites);
   auto inner_tmp_v = &inner_tmp[0];
-    
   {
     autoView( left_v , left, AcceleratorRead);
     autoView( right_v,right, AcceleratorRead);
-
+    // This code could read coalesce
     // GPU - SIMT lane compliance...
     accelerator_for( ss, sites, 1,{
 	auto x_l = left_v[ss];

From bd68861b28b24ce85adcbcbe5a7c50b383446b3f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 8 Nov 2022 12:49:26 -0800
Subject: [PATCH 15/23] SYCL sum

---
 Grid/lattice/Lattice_reduction_sycl.h | 125 ++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 Grid/lattice/Lattice_reduction_sycl.h

diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h
new file mode 100644
index 00000000..90980c4c
--- /dev/null
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -0,0 +1,125 @@
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Possibly promote to double and sum
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_objectD sobjD;
+  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
+  sobj identity; zeroit(identity);
+  sobj ret ; 
+
+  Integer nsimd= vobj::Nsimd();
+  
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
+     cgh.parallel_for(cl::sycl::range<1>{osites},
+		      Reduction,
+		      [=] (cl::sycl::id<1> item, auto &sum) {
+      auto osite   = item[0];
+      sum +=Reduce(lat[osite]);
+     });
+   });
+  theGridAccelerator->wait();
+  ret = mysum[0];
+  free(mysum,*theGridAccelerator);
+  sobjD dret; convertType(dret,ret);
+  return dret;
+}
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
+{
+  return sumD_gpu_tensor(lat,osites);
+}
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
+{
+  return sumD_gpu_large(lat,osites);
+}
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  return sumD_gpu_large(lat,osites);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Return as same precision as input performing reduction in double precision though
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) 
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu(lat,osites);
+  return result;
+}
+
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu_large(lat,osites);
+  return result;
+}
+
+NAMESPACE_END(Grid);
+
+/*
+template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+{
+  Double sumResult; zeroit(sumResult);
+  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
+  Double identity;  zeroit(identity);
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+     auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+     cgh.parallel_for(cl::sycl::range<1>{L},
+		      Reduction,
+		      [=] (cl::sycl::id<1> index, auto &sum) {
+	 sum +=vec[index];
+     });
+   });
+  theGridAccelerator->wait();
+  Double ret = d_sum[0];
+  free(d_sum,*theGridAccelerator);
+  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
+  return ret;
+}
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type  vector;
+  typedef typename vobj::scalar_type  scalar;
+
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobjD;
+
+  sobjD ret;
+  scalarD *ret_p = (scalarD *)&ret;
+  
+  const int nsimd = vobj::Nsimd();
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Vector<scalar> buffer(osites*nsimd);
+  scalar *buf = &buffer[0];
+  vector *dat = (vector *)lat;
+
+  for(int w=0;w<words;w++) {
+
+    accelerator_for(ss,osites,nsimd,{
+	int lane = acceleratorSIMTlane(nsimd);
+	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
+    });
+    //Precision change at this point is to late to gain precision
+    ret_p[w] = svm_reduce(buf,nsimd*osites);
+  }
+  return ret;
+}
+*/

From 63fd1dfa62b86a2fae01f31ae6619c3ac6713119 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 8 Nov 2022 13:22:09 -0800
Subject: [PATCH 16/23] Config on PVC

---
 systems/PVC/config-command | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/systems/PVC/config-command b/systems/PVC/config-command
index 3f5b5993..858edda7 100644
--- a/systems/PVC/config-command
+++ b/systems/PVC/config-command
@@ -2,14 +2,15 @@ INSTALL=/nfs/site/home/azusayax/install
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
-	--enable-comms=mpi \
+	--enable-comms=mpi-auto \
 	--disable-accelerator-cshift \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-unified=yes \
-	CXX=mpicxx \
+	--enable-unified=no \
+	MPICXX=mpicxx \
+	CXX=dpcpp \
 	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
-	CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
+	CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
 

From 187310136226348c0cd4cf507a79f76eab7fed0b Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 8 Nov 2022 13:22:45 -0800
Subject: [PATCH 17/23] PVC

---
 systems/PVC/benchmarks/run-1tile.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 systems/PVC/benchmarks/run-1tile.sh

diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh
old mode 100644
new mode 100755
index 923afd84..0fe80247
--- a/systems/PVC/benchmarks/run-1tile.sh
+++ b/systems/PVC/benchmarks/run-1tile.sh
@@ -6,7 +6,7 @@
 
 source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 
-export NT=16
+export NT=8
 
 export I_MPI_OFFLOAD=1
 export I_MPI_OFFLOAD_TOPOLIB=level_zero
@@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
 export ZE_AFFINITY_MASK=0.0
-mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
+mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768
 
 export ZE_AFFINITY_MASK=0
 export I_MPI_OFFLOAD_CELL=device

From 5c75aa50089bc53bc8a2a18dd16e8d6aab5041c4 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 8 Nov 2022 13:22:57 -0800
Subject: [PATCH 18/23] Device mem

---
 systems/PVC/benchmarks/run-2tile-mpi.sh | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh
index 34deac2c..cefab776 100755
--- a/systems/PVC/benchmarks/run-2tile-mpi.sh
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -8,7 +8,6 @@ source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
 
 export NT=16
 
-
 # export IGC_EnableLSCFenceUGMBeforeEOT=0
 # export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
 #export IGC_ShaderDumpEnable=1 
@@ -20,14 +19,16 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
 
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > dw.2tile.1x2.log
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > dw.2tile.2x1.log
-
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log
+for i in 0 
+do
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768
+done
+#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log
+#mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log
 
 

From 0655dab46659e21dfb7eaa4a4c947296e315fa5e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 8 Nov 2022 13:38:54 -0800
Subject: [PATCH 19/23] Open MP on host enabled

---
 systems/PVC/config-command | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/systems/PVC/config-command b/systems/PVC/config-command
index 858edda7..cd7bba1d 100644
--- a/systems/PVC/config-command
+++ b/systems/PVC/config-command
@@ -12,5 +12,5 @@ INSTALL=/nfs/site/home/azusayax/install
 	MPICXX=mpicxx \
 	CXX=dpcpp \
 	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
-	CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
+	CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"
 

From e13930c8b228380b6556541c334a0ae40a97afae Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 30 Nov 2022 15:11:29 -0500
Subject: [PATCH 20/23] Faster fermtoprop case

---
 Grid/qcd/QCD.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h
index 858aead7..b292e560 100644
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -451,9 +451,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
 // Fermion <-> propagator assignements
 //////////////////////////////////////////////
 //template <class Prop, class Ferm>
+#define FAST_FERM_TO_PROP
 template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
+#ifdef FAST_FERM_TO_PROP
+  autoView(p_v,p,AcceleratorWrite);
+  autoView(f_v,f,AcceleratorRead);
+  accelerator_for(idx,p_v.oSites(),1,{
+      for(int ss = 0; ss < Ns; ++ss) {
+      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
+	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
+      }}
+  );
+#else
   for(int j = 0; j < Ns; ++j)
     {
       auto pjs = peekSpin(p, j, s);
@@ -465,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
 	}
       pokeSpin(p, pjs, j, s);
     }
+#endif
 }
     
 //template <class Prop, class Ferm>
 template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
+#ifdef FAST_FERM_TO_PROP
+  autoView(p_v,p,AcceleratorWrite);
+  autoView(f_v,f,AcceleratorRead);
+  accelerator_for(idx,p_v.oSites(),1,{
+      for(int ss = 0; ss < Ns; ++ss) {
+      for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
+	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
+      }}
+  );
+#else
   for(int j = 0; j < Ns; ++j)
     {
       auto pjs = peekSpin(p, j, s);
@@ -482,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
 	}
       pokeSpin(f, fj, j);
     }
+#endif
 }
     
 //////////////////////////////////////////////

From 97a098636d816970631b120877b85e13691dd8ff Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 30 Nov 2022 15:36:35 -0500
Subject: [PATCH 21/23] FermToProp

---
 Grid/qcd/QCD.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h
index b292e560..9c56c12c 100644
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -463,7 +463,7 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
       for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
       }}
-  );
+    });
 #else
   for(int j = 0; j < Ns; ++j)
     {
@@ -491,7 +491,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
       for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
       }}
-  );
+    });
 #else
   for(int j = 0; j < Ns; ++j)
     {

From d49694f38f1b07676c4200892922c9a89bca99ad Mon Sep 17 00:00:00 2001
From: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
Date: Tue, 6 Dec 2022 15:48:54 +0000
Subject: [PATCH 22/23] PropToFerm fix

---
 Grid/qcd/QCD.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h
index 9c56c12c..39f45ada 100644
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -484,8 +484,8 @@ template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,AcceleratorWrite);
-  autoView(f_v,f,AcceleratorRead);
+  autoView(p_v,p,AcceleratorRead);
+  autoView(f_v,f,AcceleratorWrite);
   accelerator_for(idx,p_v.oSites(),1,{
       for(int ss = 0; ss < Ns; ++ss) {
       for(int cc = 0; cc < Fimpl::Dimension; ++cc) {

From 40234f531ff83172cebbb7401916641d0df7ddd2 Mon Sep 17 00:00:00 2001
From: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
Date: Tue, 6 Dec 2022 17:34:51 +0000
Subject: [PATCH 23/23] FermToProp accelerator_for -> thread_for

---
 Grid/qcd/QCD.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h
index 39f45ada..6dd6d5dc 100644
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -456,9 +456,9 @@ template <class Fimpl>
 void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,AcceleratorWrite);
-  autoView(f_v,f,AcceleratorRead);
-  accelerator_for(idx,p_v.oSites(),1,{
+  autoView(p_v,p,CpuWrite);
+  autoView(f_v,f,CpuRead);
+  thread_for(idx,p_v.oSites(),{
       for(int ss = 0; ss < Ns; ++ss) {
       for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
@@ -484,9 +484,9 @@ template <class Fimpl>
 void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
 {
 #ifdef FAST_FERM_TO_PROP
-  autoView(p_v,p,AcceleratorRead);
-  autoView(f_v,f,AcceleratorWrite);
-  accelerator_for(idx,p_v.oSites(),1,{
+  autoView(p_v,p,CpuRead);
+  autoView(f_v,f,CpuWrite);
+  thread_for(idx,p_v.oSites(),{
       for(int ss = 0; ss < Ns; ++ss) {
       for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
 	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index