From 4f85672674a018f1213af057e6e3b09ada600347 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 17 Dec 2022 20:16:11 -0500
Subject: [PATCH 1/9] Simpler test for PETSc

---
 tests/core/Test_fft_matt.cc | 160 ++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 tests/core/Test_fft_matt.cc
diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc
new file mode 100644
index 00000000..d4455a7e
--- /dev/null
+++ b/tests/core/Test_fft_matt.cc
@@ -0,0 +1,160 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/core/Test_fft_matt.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace Grid;
+ ;
+// Free-field Wilson checks: compares Dw.FreePropagator with Dw.M and with a CGNE solve, printing norm2 of each difference.
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+  // NOTE(review): vol is accumulated from latt_size here but never read again in this function.
+  int vol = 1;
+  for(int d=0;d<latt_size.size();d++){
+    vol = vol * latt_size[d];
+  }
+  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian RBGRID(&GRID);
+  // NOTE(review): one, zz, C, Ctilde, Cref, Csav, coor, S, Stilde, p, ci and sRNG are set up below but not used by either test block.
+  LatticeComplexD     one(&GRID);
+  LatticeComplexD      zz(&GRID);
+  LatticeComplexD       C(&GRID);
+  LatticeComplexD  Ctilde(&GRID);
+  LatticeComplexD  Cref  (&GRID);
+  LatticeComplexD  Csav  (&GRID);
+  LatticeComplexD    coor(&GRID);
+
+  LatticeSpinMatrixD    S(&GRID);
+  LatticeSpinMatrixD    Stilde(&GRID);
+  
+  Coordinate p({1,3,2,3});
+
+  one = ComplexD(1.0,0.0);
+  zz  = ComplexD(0.0,0.0);
+
+  ComplexD ci(0.0,1.0);
+
+  std::vector<int> seeds({1,2,3,4});
+  GridSerialRNG          sRNG;  sRNG.SeedFixedIntegers(seeds); // naughty seeding
+  GridParallelRNG          pRNG(&GRID);
+  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeFieldD Umu(&GRID);
+
+  SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
+
+  ////////////////////////////////////////////////////
+  // Wilson test
+  ////////////////////////////////////////////////////
+  {
+    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    
+    RealD mass=0.01;
+    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
+    // Apply the operator to the gaussian source: tmp receives the result of Dw.M on src.
+    Dw.M(src,tmp);
+
+    std::cout << "Dw src = " <<norm2(src)<<std::endl;
+    std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
+    // Feed tmp through the free propagator with the same mass; ref is compared with src below.
+    Dw.FreePropagator(tmp,ref,mass);
+
+    std::cout << "Dw ref = " <<norm2(ref)<<std::endl;
+    
+    ref = ref - src;
+    
+    std::cout << "Dw ref-src = " <<norm2(ref)<<std::endl;
+  }
+
+
+  ////////////////////////////////////////////////////
+  // Wilson prop
+  ////////////////////////////////////////////////////
+  {
+    std::cout<<"****************************************"<<std::endl;
+    std::cout << "Wilson Mom space 4d propagator \n";
+    std::cout<<"****************************************"<<std::endl;
+
+    LatticeFermionD    src(&GRID); gaussian(pRNG,src);
+    LatticeFermionD    tmp(&GRID);
+    LatticeFermionD    ref(&GRID);
+    LatticeFermionD    diff(&GRID);
+    // The gaussian fill above is discarded: src becomes a point source with component ()(0)(0) = 1 at `point`.
+    src=Zero();
+    Coordinate point(4,0); // 0,0,0,0
+    SpinColourVectorD ferm;
+    ferm=Zero();
+    ferm()(0)(0) = ComplexD(1.0);
+    pokeSite(ferm,src,point);
+
+    RealD mass=0.01;
+    WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
+
+    // Momentum space prop
+    std::cout << " Solving by FFT and Feynman rules" <<std::endl;
+    Dw.FreePropagator(src,ref,mass) ;
+    // NOTE(review): G5 and sdir below are declared but not used in this block.
+    Gamma G5(Gamma::Algebra::Gamma5);
+
+    LatticeFermionD    result(&GRID); 
+    const int sdir=0;
+    
+    ////////////////////////////////////////////////////////////////////////
+    // Conjugate gradient on normal equations system
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
+    Dw.Mdag(src,tmp);
+    src=tmp;
+    MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
+    ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
+    CG(HermOp,src,result);
+    
+    ////////////////////////////////////////////////////////////////////////
+    std::cout << " Taking difference" <<std::endl;
+    std::cout << "Dw result "<<norm2(result)<<std::endl;
+    std::cout << "Dw ref     "<<norm2(ref)<<std::endl;
+    // Compare the FreePropagator output (ref) with the CG output (result).
+    diff = ref - result;
+    std::cout << "result - ref     "<<norm2(diff)<<std::endl;
+
+    DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
+  }
+
+  
+  Grid_finalize();
+}

From 87697eb07ebaf510a776f79bc409efebc7860a23 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Mar 2023 09:07:36 -0700
Subject: [PATCH 2/9] Shared compile

---
 Grid/allocator/MemoryManagerShared.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc
index 2434ad47..e291ef89 100644
--- a/Grid/allocator/MemoryManagerShared.cc
+++ b/Grid/allocator/MemoryManagerShared.cc
@@ -13,7 +13,7 @@ uint64_t  MemoryManager::DeviceToHostBytes;
 uint64_t  MemoryManager::HostToDeviceXfer;
 uint64_t  MemoryManager::DeviceToHostXfer;
 
-void  MemoryManager::Audit(void){};
+void  MemoryManager::Audit(std::string s){};
 void  MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int   MemoryManager::isOpen   (void* CpuPtr) { return 0;}

From cad5b187ddcbce278bdca06b42dfb4ac6bbc59d0 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Mar 2023 09:08:16 -0700
Subject: [PATCH 3/9] Cleanup

---
 Grid/qcd/action/fermion/WilsonKernels.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h
index 68422f28..2d868c27 100644
--- a/Grid/qcd/action/fermion/WilsonKernels.h
+++ b/Grid/qcd/action/fermion/WilsonKernels.h
@@ -52,13 +52,6 @@ public:
   typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
 public:
 
-#ifdef GRID_SYCL
-#define SYCL_HACK
-#endif  
-#ifdef SYCL_HACK
-  static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor  *buf,
-			       int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
-#endif
   
   static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
 			 int Ls, int Nsite, const FermionField &in, FermionField &out,

From eeb6e0a6e371407ec4d445f7d0459e6a951c2449 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Mar 2023 09:10:27 -0700
Subject: [PATCH 4/9] Re-enable cache blocking and efficient UPI type SHM comms

---
 .../WilsonFermion5DImplementation.h           |   4 +
 .../WilsonFermionImplementation.h             |   3 +
 .../WilsonKernelsImplementation.h             |   3 +-
 Grid/stencil/Stencil.h                        | 104 ++++++++++++++++--
 configure.ac                                  |   8 ++
 5 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
index 4ca24789..1ddb30ba 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   _tmp(&FiveDimRedBlackGrid),
   Dirichlet(0)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
+  
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);
diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
index 2833fdc4..1a262533 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
       _tmp(&Hgrid),
       anisotropyCoeff(anis)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
   // Allocate the required comms buffer
   ImportGauge(_Umu);
   if  (anisotropyCoeff.isAnisotropic){
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index bdba7cb2..b307fad4 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -433,7 +433,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
     });									
 
 #define ASM_CALL(A)							\
-  thread_for( ss, Nsite, {						\
+  thread_for( sss, Nsite, {						\
+    int ss = st.lo->Reorder(sss);					\
     int sU = ss;							\
     int sF = ss*Ls;							\
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index cffede12..a74b720d 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -290,9 +290,9 @@ public:
 
 protected:
   GridBase *                        _grid;
-
 public:
   GridBase *Grid(void) const { return _grid; }
+  LebesgueOrder *lo;
 
   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
@@ -337,6 +337,7 @@ public:
   ////////////////////////////////////////
   // Stencil query
   ////////////////////////////////////////
+#ifdef SHM_FAST_PATH
   inline int SameNode(int point) {
 
     int dimension    = this->_directions[point];
@@ -356,7 +357,40 @@ public:
     if ( displacement == 0 ) return 1;
     return 0;
   }
+#else
+  // 
+  inline int SameNode(int point) {
+
+    int dimension    = this->_directions[point];
+    int displacement = this->_distances[point];
+    // Returns 1 if _processors[dimension] <= 1, or if ShmBufferTranslate finds a mapping for the rank we receive from; otherwise 0.
+    int pd              = _grid->_processors[dimension];
+    int fd              = _grid->_fdimensions[dimension];
+    int ld              = _grid->_ldimensions[dimension];
+    int rd              = _grid->_rdimensions[dimension];
+    int simd_layout     = _grid->_simd_layout[dimension];
+    int comm_dim        = _grid->_processors[dimension] >1 ;
+    // NOTE(review): fd, ld, rd and simd_layout are loaded above but not used in this function.
+    int recv_from_rank;
+    int xmit_to_rank;
+
+    if ( ! comm_dim ) return 1;
+
+    int nbr_proc;
+    if (displacement>0) nbr_proc = 1;
+    else                 nbr_proc = pd-1;
+
+    // FIXME  this logic needs to be sorted for three link term
+    //    assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
+
+    if ( (shm==NULL) ) return 0;
+    return 1;
+  }
+#endif
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   // Use OpenMP Tasks for cleaner ???
@@ -1056,7 +1090,7 @@ public:
     int comms_recv   = this->_comms_recv[point];
     int comms_partial_send   = this->_comms_partial_send[point] ;
     int comms_partial_recv   = this->_comms_partial_recv[point] ;
-
+    
     assert(rhs.Grid()==_grid);
     //	  conformable(_grid,rhs.Grid());
 
@@ -1127,11 +1161,32 @@ public:
 	  recv_buf=this->u_recv_buf_p;
 	}
 
+	// potential SHM fast path for intranode
+	int shm_send=0;
+	int shm_recv=0;
+#ifdef SHM_FAST_PATH
+	// Put directly in place if we can
+	send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
+	if ( (send_buf==NULL) ) {
+	  shm_send=0;
+	  send_buf = this->u_send_buf_p;
+	} else {
+	  shm_send=1;
+	}
+	void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
+	if ( test_ptr != NULL ) shm_recv = 1;
+	//	static int printed;
+	//	if (!printed){
+	  //	  std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+	//	  printed = 1;
+	//	}
+#else
 	////////////////////////////////////////////////////////
 	// Gather locally
 	////////////////////////////////////////////////////////
 	send_buf = this->u_send_buf_p; // Gather locally, must send
 	assert(send_buf!=NULL);
+#endif
 
 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
 	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
@@ -1143,10 +1198,13 @@ public:
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
+	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	  int do_recv = (comms_recv|comms_partial_recv) && (!shm_recv );
+	  // The receive is gated by the recv flags; the send flags only gate the send. Either is dropped when its shm_* flag is set.
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
-		    xmit_to_rank, comms_send|comms_partial_send,
-		    recv_from_rank, comms_recv|comms_partial_recv,
+		    xmit_to_rank, do_send,
+		    recv_from_rank, do_recv,
 		    xbytes,rbytes);
 	}
 
@@ -1288,19 +1346,47 @@ public:
 
 	    int recv_from_rank;
 	    int xmit_to_rank;
-
+	    int shm_send=0;
+	    int shm_recv=0;
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
-
+#ifdef SHM_FAST_PATH
+  #warning STENCIL SHM FAST PATH SELECTED
+	    // shm == receive pointer         if offnode
+	    // shm == Translate[send pointer] if on node -- my view of his send pointer
+	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+	    if (shm==NULL) {
+	      shm = rp;
+	      // we found a packet that comes from MPI and contributes to this shift.
+	      // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
+	      // Kernel will add the exterior_terms except if is_same_node.
+	      // leg of stencil
+	      shm_recv=0;
+	    } else {
+	      shm_recv=1;
+	    }
+	    rpointers[i] = shm;
+	    // Test send side
+	    void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
+	    if ( test_ptr != NULL ) shm_send = 1;
+	    //	    static int printed;
+	    //	    if (!printed){
+	    //	      std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+	    //	      printed = 1;
+	    //	    }
+#else
 	    rpointers[i] = rp;
+#endif
 	    
 	    int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
 	    if ( !duplicate  ) { 
 	      if ( (bytes != rbytes) && (rbytes!=0) ){
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
+	      int do_send = (comms_send|comms_partial_send) && (!shm_send ); // no send when shm_send is set
+	      int do_recv = (comms_recv|comms_partial_recv) && (!shm_recv ); // receive is gated by the recv flags, not the send flags
 	      AddPacket((void *)sp,(void *)rp,
-			xmit_to_rank,comms_send|comms_partial_send,
-			recv_from_rank,comms_recv|comms_partial_recv,
+			xmit_to_rank,do_send,
+			recv_from_rank,do_recv,
 			xbytes,rbytes);
 	    }
 
@@ -1310,7 +1396,7 @@ public:
 
 	  }
 	}
-
+	// rpointer may be doing a remote read in the gather over SHM
 	if ( comms_recv|comms_partial_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}
diff --git a/configure.ac b/configure.ac
index 2e6199c7..fedca3fe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in
       ;;
      *) ;;
 esac
+############### SHM fast path: allow kernels to remote copy over intranode
+AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no])
+case ${ac_SHM_FAST_PATH} in
+     yes)
+        AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] )
+      ;;
+     *) ;;
+esac
 
 ############### communication type selection
 AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])

From f36b87deb5451a3c1d47c4a25502319395478c3e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Mar 2023 12:09:00 -0700
Subject: [PATCH 5/9] syscall fix

---
 Grid/communicator/SharedMemoryMPI.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 4993a02e..9a273dc4 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -29,6 +29,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 
 #include <Grid/GridCore.h>
 #include <pwd.h>
+#include <syscall.h>
 
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>

From 14cc142a1424d02883887572cef47d1677a91fb1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Mar 2023 12:09:26 -0700
Subject: [PATCH 6/9] Warning remove

---
 Grid/stencil/Stencil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index a74b720d..29aa876f 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -387,7 +387,7 @@ public:
 
     void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
 
-    if ( (shm==NULL) ) return 0;
+    if ( shm==NULL ) return 0;
     return 1;
   }
 #endif

From 861e5d7f4c0bb56cbb5de550cb2a14e5c1ad15f7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Mar 2023 12:10:02 -0700
Subject: [PATCH 7/9] SYCL version update. Why do they keep making incompatible
 changes

---
 Grid/threads/Accelerator.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 5ac36d15..04ae885b 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -248,17 +248,23 @@ inline int  acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////
-#ifdef GRID_SYCL
-NAMESPACE_END(Grid);
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
 
+#ifdef GRID_SYCL
 #define GRID_SYCL_LEVEL_ZERO_IPC
 
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
+NAMESPACE_END(Grid);
+#if 0
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
+#else
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
+
 NAMESPACE_BEGIN(Grid);
 
 extern cl::sycl::queue *theGridAccelerator;

From a997d24743abb33c2ec1778bafd805793d1919f7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Mar 2023 12:10:31 -0700
Subject: [PATCH 8/9] Remove nofma

---
 systems/PVC/config-command | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/systems/PVC/config-command b/systems/PVC/config-command
index 3f5b5993..dc6b222c 100644
--- a/systems/PVC/config-command
+++ b/systems/PVC/config-command
@@ -11,5 +11,5 @@ INSTALL=/nfs/site/home/azusayax/install
 	--enable-unified=yes \
 	CXX=mpicxx \
 	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
-	CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
+	CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare"
 

From e1c326558a54a38e138e9eeaead1aaabaab57440 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 21 Mar 2023 08:53:56 -0700
Subject: [PATCH 9/9] Comms improvements

---
 Grid/qcd/action/fermion/WilsonCompressor.h    | 34 ++++++++++---------
 .../WilsonKernelsImplementation.h             | 11 ++++++
 Grid/stencil/Stencil.h                        |  4 +--
 Grid/threads/Accelerator.h                    |  4 +--
 systems/PVC/benchmarks/run-1tile.sh           |  2 +-
 systems/PVC/benchmarks/run-2tile-mpi.sh       |  4 +--
 systems/PVC/benchmarks/wrap.sh                | 10 +++---
 systems/PVC/config-command                    |  6 ++--
 systems/PVC/setup.sh                          |  3 +-
 9 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index fd1bbe89..5523ae8a 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -484,24 +484,26 @@ public:
 
     int dag = compress.dag;
     int face_idx=0;
+#define vet_same_node(a,b) \
+      { auto tmp = b;  }
     if ( dag ) { 
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
     } else {
-      assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
-      assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
-      assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
-      assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
-      assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
-      assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
-      assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
-      assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
+      vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
+      vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
+      vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
+      vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
+      vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
+      vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
+      vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
+      vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
     }
     this->face_table_computed=1;
     assert(this->u_comm_offset==this->_unified_buffer_size);
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index b307fad4..fcf1f1f3 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -439,6 +439,17 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
     int sF = ss*Ls;							\
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
   });
+#define ASM_CALL_SLICE(A)						\
+  auto grid = in.Grid() ;						\
+  int nt = grid->LocalDimensions()[4];					\
+  int nxyz = Nsite/nt ;							\
+  for(int t=0;t<nt;t++){						\
+  thread_for( sss, nxyz, {						\
+    int ss = t*nxyz+sss;						\
+    int sU = ss;							\
+    int sF = ss*Ls;							\
+    WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\
+    });}
 
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 29aa876f..c8703b9f 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -358,7 +358,7 @@ public:
     return 0;
   }
 #else
-  // 
+  // fancy calculation for shm code
   inline int SameNode(int point) {
 
     int dimension    = this->_directions[point];
@@ -378,7 +378,7 @@ public:
 
     int nbr_proc;
     if (displacement>0) nbr_proc = 1;
-    else                 nbr_proc = pd-1;
+    else                nbr_proc = pd-1;
 
     // FIXME  this logic needs to be sorted for three link term
     //    assert( (displacement==1) || (displacement==-1));
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 04ae885b..2dde1433 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -305,14 +305,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      });	   			              \
     });
 
-#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n");  theGridAccelerator->wait(); }
+#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
 
-inline void acceleratorCopySynchronise(void) {  printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);}
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh
index 923afd84..3c594ab6 100644
--- a/systems/PVC/benchmarks/run-1tile.sh
+++ b/systems/PVC/benchmarks/run-1tile.sh
@@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
 export ZE_AFFINITY_MASK=0.0
-mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
+mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
 
 export ZE_AFFINITY_MASK=0
 export I_MPI_OFFLOAD_CELL=device
diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh
index 9db0b66b..fa56d5ec 100755
--- a/systems/PVC/benchmarks/run-2tile-mpi.sh
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -20,7 +20,7 @@ export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
 
-mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
+#mpiexec -launcher ssh -n 1 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
 
-mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log
+mpiexec -launcher ssh -n 2 -host localhost  ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
 
diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh
index a352fff9..0e48625b 100755
--- a/systems/PVC/benchmarks/wrap.sh
+++ b/systems/PVC/benchmarks/wrap.sh
@@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
 echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
 
 
-if [ $MPI_LOCALRANKID = "0" ] 
-then
+#if [ $MPI_LOCALRANKID = "0" ] 
+#then
 #  ~psteinbr/build_pti/ze_tracer -c $@
-  onetrace --chrome-kernel-timeline $@
-else
+#  onetrace --chrome-kernel-timeline $@
+#else
   $@
-fi
+#fi
diff --git a/systems/PVC/config-command b/systems/PVC/config-command
index dc6b222c..7549f2b4 100644
--- a/systems/PVC/config-command
+++ b/systems/PVC/config-command
@@ -1,4 +1,4 @@
-INSTALL=/nfs/site/home/azusayax/install
+INSTALL=/nfs/site/home/paboylx/prereqs/
 ../../configure \
 	--enable-simd=GPU \
 	--enable-gen-simd-width=64 \
@@ -8,8 +8,8 @@ INSTALL=/nfs/site/home/azusayax/install
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-unified=yes \
+	--enable-unified=no \
 	CXX=mpicxx \
 	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
-	CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare"
+	CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -Wno-tautological-constant-compare -I$INSTALL/include"
 
diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh
index 2a6f920b..9b515a62 100644
--- a/systems/PVC/setup.sh
+++ b/systems/PVC/setup.sh
@@ -1,5 +1,6 @@
 export https_proxy=http://proxy-chain.intel.com:911
-export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH
 
 module load intel-release
 source /opt/intel/oneapi/PVC_setup.sh