Mirror of https://github.com/paboyle/Grid.git, synced 2024-11-09 23:45:36 +00:00
Compare commits: b8a7004365 ... 80359e0d49

2 commits:
  80359e0d49
  3d437c5cc4
@@ -604,8 +604,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

-  auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
-  auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
+  auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+  auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());

   ze_ipc_mem_handle_t ihandle;
   clone_mem_t handle;
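
The substance of this hunk is a backend enumerator rename: newer oneAPI DPC++ releases moved the Level Zero backend tag from cl::sycl::backend::level_zero to cl::sycl::backend::ext_oneapi_level_zero. A minimal standalone sketch of the same get_native call pattern, assuming a DPC++ compiler recent enough to ship <sycl/sycl.hpp> and the renamed enumerator (the queue q is illustrative, not Grid's theGridAccelerator):

    // Sketch: fetch native Level Zero handles from SYCL objects using the
    // renamed backend tag. Compile with: icpx -fsycl
    #include <sycl/sycl.hpp>
    #include <level_zero/ze_api.h>

    int main() {
      sycl::queue q{sycl::gpu_selector_v};
      // get_native returns the backend-native handle types
      // (ze_device_handle_t / ze_context_handle_t for Level Zero).
      auto zeDevice  = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
      auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
      (void)zeDevice; (void)zeContext;
      return 0;
    }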
@@ -90,10 +90,12 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(c
 template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
 accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
 {
+  typedef typename iMatrix<vtype,N>::scalar_type scalar;
   // need a check for the group type?
   iMatrix<vtype,N> ret(arg);
   vtype nrm;
   vtype inner;
+  scalar one(1.0);
   for(int c1=0;c1<N;c1++){

     // Normalises row c1
@@ -102,7 +104,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
       inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);

     nrm = sqrt(inner);
-    nrm = 1.0/nrm;
+    nrm = one/nrm;
     for(int c2=0;c2<N;c2++)
       ret._internal[c1][c2]*= nrm;

@@ -127,7 +129,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
       inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);

     nrm = sqrt(inner);
-    nrm = 1.0/nrm;
+    nrm = one/nrm;
     for(int c2=0;c2<N;c2++)
       ret._internal[c1][c2]*= nrm;
   }
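
These hunks replace the raw literal in nrm = 1.0/nrm with a unit constructed in the tensor's scalar type, so the reciprocal is formed from the element type rather than a bare double. One plausible motivation, sketched outside Grid (the helper name reciprocal is illustrative, not Grid API):

    // Sketch: for complex or otherwise non-trivial element types, a raw
    // double literal can make the division mixed-mode or ill-formed;
    // constructing the unit in the element type keeps it homogeneous.
    #include <complex>

    template<class scalar>
    scalar reciprocal(const scalar &nrm) {
      scalar one(1.0);   // unit in the element type, as in the commit
      return one / nrm;
    }

    int main() {
      std::complex<float> n(2.0f, 0.0f);
      auto r = reciprocal(n);  // complex<float>/complex<float> is defined,
                               // whereas 1.0 / n (double vs float) does not
                               // even compile in standard C++.
      return (r.real() == 0.5f) ? 0 : 1;
    }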
systems/OEM/README (new file)
@@ -0,0 +1,53 @@

1. Prerequisites:
===================
Make sure you have the latest Intel icpx compiler release loaded (via modules or similar).
Make sure you have a SYCL-aware MPICH or Intel MPI loaded (assumed available as mpicxx).

2. Obtain Grid:
===================

bash$
git clone https://github.com/paboyle/Grid
cd Grid
./bootstrap.sh
cd systems/OEM

3. Build Grid:
===================

The configure command is stored in the file config-command:

bash$
../../configure \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-comms=mpi-auto \
	--enable-accelerator-cshift \
	--disable-gparity \
	--disable-fermion-reps \
	--enable-shm=nvlink \
	--enable-accelerator=sycl \
	--enable-unified=no \
	MPICXX=mpicxx \
	CXX=icpx \
	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "

make all

4. Run a benchmark:
===================

*** Assumes interactive access to the node. ***

Run Benchmark_dwf_fp32 using benchmarks/bench.sh:

bash$
cd benchmarks
./bench.sh
systems/OEM/benchmarks/bench.sh (new executable file)
@@ -0,0 +1,18 @@

#!/bin/bash

export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1

mpiexec -launcher ssh -n 1 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 1tile.log
mpiexec -launcher ssh -n 2 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 2tile.log

#mpiexec -launcher ssh -n 4 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 16.16.64.64 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 4tile.log
#mpiexec -launcher ssh -n 8 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.4 --grid 16.16.64.128 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 8tile.log
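
A note on the run lines (an observation, not part of the commit): Grid decomposes the global --grid volume by the --mpi layout, so --mpi 1.1.1.2 with --grid 32.32.32.64 gives each of the 2 ranks a 32.32.32.32 local volume, matching the 1-rank run. Also, gpu_id and tile_id are not set in this script, so the ZE_AFFINITY_MASK exported here is empty; the per-rank mask that actually takes effect is the one exported by select_gpu.sh below.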
systems/OEM/benchmarks/select_gpu.sh (new executable file)
@@ -0,0 +1,13 @@

#!/bin/bash

num_tile=2

gpu_id=$(( MPI_LOCALRANKID % num_tile ))   # device index
tile_id=$(( MPI_LOCALRANKID / num_tile ))  # tile (sub-device) index

export ZE_AFFINITY_MASK=$gpu_id.$tile_id

echo "local rank $MPI_LOCALRANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"

"$@"
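
Worked example of the mapping above (num_tile=2): local rank 0 gets ZE_AFFINITY_MASK=0.0, rank 1 gets 1.0, rank 2 gets 0.1, and rank 3 gets 1.1. The mask format is device.tile, with the device index cycling fastest; MPI_LOCALRANKID is the node-local rank as set by Intel MPI's launcher.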
systems/OEM/config-command (new file)
@@ -0,0 +1,15 @@

../../configure \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-comms=mpi-auto \
	--enable-accelerator-cshift \
	--disable-gparity \
	--disable-fermion-reps \
	--enable-shm=nvlink \
	--enable-accelerator=sycl \
	--enable-unified=no \
	MPICXX=mpicxx \
	CXX=icpx \
	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
systems/OEM/setup.sh (new file)
@@ -0,0 +1,3 @@

export https_proxy=http://proxy-chain.intel.com:911
module load intel-release
module load intel/mpich