Mirror of https://github.com/paboyle/Grid.git

Compare commits


7 Commits

8 changed files with 39 additions and 65 deletions

View File

@@ -401,8 +401,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
-// std::cout << "Copy Synchronised\n"<<std::endl;
-acceleratorCopySynchronise();
int nreq=list.size();
if (nreq==0) return;
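
Read together with the CommunicateComplete() hunk further down: the copy-stream wait moves out of the per-direction completion call, so all asynchronous intranode copies are drained once before the per-direction MPI requests are completed. A minimal compilable sketch of the resulting ordering; the helper names are invented stand-ins for acceleratorCopySynchronise() and StencilSendToRecvFromComplete(), not Grid's API:

    #include <iostream>

    // Stand-in for acceleratorCopySynchronise(): one drain of the copy stream.
    void drainCopyStreamOnce() { std::cout << "drain copy stream once\n"; }
    // Stand-in for per-direction StencilSendToRecvFromComplete(reqs[dir],dir).
    void completeMPI(int dir)  { std::cout << "complete MPI, dir " << dir << "\n"; }

    int main() {
      const int Ndirs = 8;     // illustrative direction count
      drainCopyStreamOnce();   // previously ran inside every per-direction call
      for (int dir = 0; dir < Ndirs; dir++) completeMPI(dir);
      return 0;
    }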

View File

@@ -36,10 +36,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
-#ifdef GRID_SYCl
+#ifdef GRID_SYCL
+#define GRID_SYCL_LEVEL_ZERO_IPC
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/

View File

@@ -459,11 +459,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifdef SYCL_HACK
-if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
-#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
-#endif
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif
@@ -474,6 +470,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
+acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
#ifndef GRID_CUDA
@@ -498,10 +495,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
-acceleratorFenceComputeStream();
} else if( interior ) {
-if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
+if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
+if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
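
The interior Dag kernels switch from KERNEL_CALL to KERNEL_CALLNB, leaving the interior Dslash in flight so it can overlap with halo communication, while the relocated acceleratorFenceComputeStream() supplies the ordering point before exterior sites are processed. A rough compilable illustration of blocking versus non-blocking launches, with std::async standing in for the accelerator compute stream; the my_* names are invented and Grid's real macros are backend-specific:

    #include <future>
    #include <iostream>

    // Non-blocking "launch": enqueue the kernel and return immediately.
    std::future<void> my_launchNB(void (*kernel)()) {
      return std::async(std::launch::async, kernel);
    }
    // Blocking "launch": the same, followed by a wait (a barrier).
    void my_launch(void (*kernel)()) { my_launchNB(kernel).wait(); }

    void interiorKernel() { std::cout << "interior sites\n"; }
    void exteriorKernel() { std::cout << "exterior sites\n"; }

    int main() {
      auto inflight = my_launchNB(&interiorKernel); // overlaps with comms
      // ... halo exchange would progress here ...
      inflight.wait();            // plays the role of the compute-stream fence
      my_launch(&exteriorKernel); // exterior runs once halo data is visible
      return 0;
    }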

View File

@@ -398,6 +398,8 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
+// Buffers are gathered AND synchronised
+// Copies are MPI ISend OR asynch copy on copy stream
reqs.resize(Packets.size());
commtime-=usecond();
for(int i=0;i<Packets.size();i++){
@@ -410,14 +412,18 @@
comms_bytes+=bytes;
shm_bytes +=2*Packets[i].bytes-bytes;
}
+_grid->StencilBarrier();// Synch shared memory on a single nodes
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
+// complete intranode
+acceleratorCopySynchronise();
+// complete MPI
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromComplete(reqs[i],i);
}
+// Everyone agrees we are all done
+_grid->StencilBarrier();
commtime+=usecond();
}
////////////////////////////////////////////////////////////////////////
@@ -425,34 +431,10 @@ public:
////////////////////////////////////////////////////////////////////////
void Communicate(void)
{
-if ( 0 ){
-thread_region {
-// must be called in parallel region
-int mythread = thread_num();
-int maxthreads= thread_max();
-int nthreads = CartesianCommunicator::nCommThreads;
-assert(nthreads <= maxthreads);
-if (nthreads == -1) nthreads = 1;
-if (mythread < nthreads) {
-for (int i = mythread; i < Packets.size(); i += nthreads) {
-double start = usecond();
-uint64_t bytes= _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-Packets[i].to_rank,
-Packets[i].recv_buf,
-Packets[i].from_rank,
-Packets[i].bytes,i);
-comm_bytes_thr[mythread] += bytes;
-shm_bytes_thr[mythread] += Packets[i].bytes - bytes;
-comm_time_thr[mythread] += usecond() - start;
-}
-}
-}
-} else { // Concurrent and non-threaded asynch calls to MPI
std::vector<std::vector<CommsRequest_t> > reqs;
this->CommunicateBegin(reqs);
this->CommunicateComplete(reqs);
-}
}
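
The dead if ( 0 ) threaded path is deleted outright, leaving Communicate() as CommunicateBegin() followed immediately by CommunicateComplete(). Keeping the two halves separate is what lets an overlapped Dslash put interior compute between them. A compilable sketch of that calling pattern with trivial stand-in bodies; only the names CommunicateBegin, CommunicateComplete and CommsRequest_t come from the diff:

    #include <iostream>
    #include <vector>

    struct CommsRequest_t {};   // stand-in for Grid's request type

    void CommunicateBegin(std::vector<std::vector<CommsRequest_t>>&) {
      std::cout << "post sends/recvs and asynchronous copies\n";
    }
    void CommunicateComplete(std::vector<std::vector<CommsRequest_t>>&) {
      std::cout << "drain copy stream, complete MPI, barrier\n";
    }

    int main() {
      std::vector<std::vector<CommsRequest_t>> reqs;
      CommunicateBegin(reqs);                       // halo traffic in flight
      std::cout << "interior compute overlaps here\n";
      CommunicateComplete(reqs);                    // halo data now usable
      return 0;
    }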
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
@@ -527,7 +509,6 @@ public:
_grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime_g+=usecond();
-// conformable(source.Grid(),_grid);
assert(source.Grid()==_grid);
halogtime-=usecond();
@@ -586,13 +567,8 @@ public:
CommsMerge(decompress,Mergers,Decompressions);
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
-mpi3synctime-=usecond();
-accelerator_barrier();
-_grid->StencilBarrier();// Synch shared memory on a single nodes
-mpi3synctime+=usecond();
-shmmergetime-=usecond();
-CommsMerge(decompress,MergersSHM,DecompressionsSHM);
-shmmergetime+=usecond();
+assert(MergersSHM.size()==0);
+assert(DecompressionsSHM.size()==0);
}
template<class decompressor>
@@ -609,6 +585,7 @@ public:
decompress.Exchange(mp,vp0,vp1,type,o);
});
}
+if ( mm.size() ) acceleratorFenceComputeStream();
mergetime+=usecond();
decompresstime-=usecond();
@@ -619,7 +596,9 @@ public:
decompress.Decompress(kp,mp,o);
});
}
+if ( dd.size() ) acceleratorFenceComputeStream();
decompresstime+=usecond();
}
////////////////////////////////////////
// Set up routines
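
The two guarded fences added to CommsMerge make halo data merged or decompressed on the compute stream visible to the kernels queued after it, while skipping the synchronisation entirely when no work was enqueued. A small compilable sketch of the guard-then-fence idiom; the type and function names are invented for illustration:

    #include <iostream>
    #include <vector>

    struct Merge { int offset; };   // stand-in merge record

    void fenceComputeStream() { std::cout << "fence compute stream\n"; }

    void commsMergeSketch(const std::vector<Merge>& mm) {
      for (const auto& m : mm)
        std::cout << "merge halo block at " << m.offset << "\n";
      if (!mm.empty()) fenceComputeStream(); // fence only if work was queued
    }

    int main() {
      commsMergeSketch({});                  // empty: no fence needed
      commsMergeSketch({{0}, {16}});         // queued work, then fence
      return 0;
    }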

View File

@@ -249,14 +249,16 @@ inline int acceleratorIsCommunicable(void *ptr)
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
+#if 0
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
-#define GRID_SYCL_LEVEL_ZERO_IPC
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
#include <level_zero/ze_api.h>
#include <CL/sycl/backend/level_zero.hpp>
+#else
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
+#endif
NAMESPACE_BEGIN(Grid);

View File

@@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
##SBATCH -p QZ1J-SPR-PVC-2C
-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=8

View File

@@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
@@ -19,16 +19,14 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
-export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
-for i in 0
+for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
do
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i
done
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log

View File

@@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
-if [ $MPI_LOCALRANKID = "0" ]
-then
+#if [ $MPI_LOCALRANKID = "0" ]
+#then
# ~psteinbr/build_pti/ze_tracer -h $@
-onetrace --chrome-device-timeline $@
-else
+# onetrace --chrome-device-timeline $@
+#else
$@
-fi
+#fi