mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

Compare commits


2 Commits

Author       SHA1        Message             Date
Peter Boyle  80359e0d49  Bland SYCL compile  2023-09-26 13:20:27 -07:00
Peter Boyle  3d437c5cc4  Making SYCL happy   2023-09-26 13:19:42 -07:00
7 changed files with 108 additions and 4 deletions


@@ -604,8 +604,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
-  auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
-  auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
+  auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+  auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
   ze_ipc_mem_handle_t ihandle;
   clone_mem_t handle;
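
The replaced lines track the rename of the SYCL Level Zero backend enumerator from backend::level_zero to backend::ext_oneapi_level_zero in newer oneAPI releases. As a stand-alone sketch (not part of the patch; header paths assume a recent oneAPI DPC++ toolchain, and a plain sycl::queue stands in for Grid's theGridAccelerator), the same native-handle query looks like:

  // Obtain native Level Zero handles through the renamed backend enumerator.
  #include <level_zero/ze_api.h>
  #include <sycl/sycl.hpp>
  #include <sycl/ext/oneapi/backend/level_zero.hpp>   // interop header; exact path may vary by oneAPI release

  ze_device_handle_t nativeDevice(sycl::queue &q) {
    return sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
  }
  ze_context_handle_t nativeContext(sycl::queue &q) {
    return sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
  }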


@@ -90,10 +90,12 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(c
 template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
 accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
 {
+  typedef typename iMatrix<vtype,N>::scalar_type scalar;
   // need a check for the group type?
   iMatrix<vtype,N> ret(arg);
   vtype nrm;
   vtype inner;
+  scalar one(1.0);
   for(int c1=0;c1<N;c1++){
     // Normalises row c1
@@ -102,7 +104,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
       inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
     nrm = sqrt(inner);
-    nrm = 1.0/nrm;
+    nrm = one/nrm;
     for(int c2=0;c2<N;c2++)
       ret._internal[c1][c2]*= nrm;
@@ -127,7 +129,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
       inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
     nrm = sqrt(inner);
-    nrm = 1.0/nrm;
+    nrm = one/nrm;
     for(int c2=0;c2<N;c2++)
       ret._internal[c1][c2]*= nrm;
 }
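
Replacing the bare 1.0 with a constant constructed in the tensor's own scalar_type is presumably what made the SYCL compile happy: the reciprocal no longer mixes a double literal with the vectorised type. A stand-alone sketch of the same row-normalisation pattern, using plain std::complex in place of Grid's vectorised tensor types (a deliberate simplification for illustration, not Grid code):

  // Normalise each row of an N x N complex matrix, dividing by a typed 'one'
  // rather than a bare double literal, mirroring the change above.
  #include <array>
  #include <cmath>
  #include <complex>
  #include <iostream>

  int main() {
    using scalar = std::complex<double>;
    const int N = 3;
    std::array<std::array<scalar, N>, N> m{};
    m[0][0] = scalar(2, 0); m[1][1] = scalar(3, 0); m[2][2] = scalar(4, 0);

    scalar one(1.0);                         // typed literal
    for (int c1 = 0; c1 < N; c1++) {
      scalar inner(0.0);
      for (int c2 = 0; c2 < N; c2++)
        inner += std::conj(m[c1][c2]) * m[c1][c2];
      scalar nrm = std::sqrt(inner);
      nrm = one / nrm;                       // instead of 1.0 / nrm
      for (int c2 = 0; c2 < N; c2++)
        m[c1][c2] *= nrm;
    }
    for (int c1 = 0; c1 < N; c1++)
      std::cout << m[c1][c1] << "\n";        // surviving diagonal entries now have unit modulus
    return 0;
  }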

systems/OEM/README (new file, 53 lines added)

@@ -0,0 +1,53 @@
1. Prerequisites:
===================
Make sure you have the latest Intel icpx release loaded (via modules or similar).
Make sure you have a SYCL-aware MPICH or Intel MPI loaded (assumed to be invoked as mpicxx).
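A quick way to confirm the toolchain is visible before building (a suggested check, not part of the original instructions):
bash$
icpx --version     # Intel oneAPI C++ compiler is on PATH
mpicxx -show       # MPI wrapper and the compiler it invokes (MPICH-style wrappers)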
2. Obtain Grid:
===================
bash$
git clone https://github.com/paboyle/Grid
cd Grid
./bootstrap.sh
cd systems/OEM
3. Build Grid:
===================
The configure command below is also stored in the file config-command:
bash$
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
make all
4. Run a benchmark:
===================
*** Assumes interactive access to the node. ***
Run Benchmark_dwf_fp32 using benchmarks/bench.sh:
bash$
cd benchmarks
./bench.sh

systems/OEM/benchmarks/bench.sh (new executable file, 18 lines added)

@@ -0,0 +1,18 @@
#!/bin/bash
export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
mpiexec -launcher ssh -n 1 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 1tile.log
mpiexec -launcher ssh -n 2 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 2tile.log
#mpiexec -launcher ssh -n 4 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 16.16.64.64 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 4tile.log
#mpiexec -launcher ssh -n 8 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.4 --grid 16.16.64.128 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 8tile.log
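
Before running, it can help to confirm which Level Zero GPU devices and tiles the SYCL runtime sees under the same device filter the script exports (a suggested check, not part of bench.sh):
bash$
ONEAPI_DEVICE_FILTER=gpu,level_zero sycl-ls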


@@ -0,0 +1,13 @@
#!/bin/bash
num_tile=2
gpu_id=$(( (MPI_LOCAL_RANKID % num_tile ) ))
tile_id=$((MPI_LOCAL_RANKID / num_tile))
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
echo "local rank $MPI_LOCALRANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
"$@"


@@ -0,0 +1,15 @@
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "

systems/OEM/setup.sh (new file, 3 lines added)

@@ -0,0 +1,3 @@
export https_proxy=http://proxy-chain.intel.com:911
module load intel-release
module load intel/mpich