Merge 461cd045c6 into ccf147d6c1

Select the compiler that gives better performance on sunspot
New config command for sunspot
2025-09-18 01:01:04 +01:00 · 2024-05-16 16:40:21 -04:00 · 2024-05-07 18:45:56 +00:00 · 2024-05-07 18:45:40 +00:00 · 2024-05-07 18:45:21 +00:00 · 2024-05-07 18:44:49 +00:00
9 changed files with 188 additions and 122 deletions
--- a/Grid/lattice/Lattice_slicesum_core.h
+++ b/Grid/lattice/Lattice_slicesum_core.h
@@ -1,5 +1,5 @@
 #pragma once
-#include <type_traits>
+
 #if defined(GRID_CUDA)

 #include <cub/cub.cuh>
@@ -90,8 +90,61 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
  

 }
+#endif 

-template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+
+#if defined(GRID_SYCL)
+template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+{
+  size_t subvol_size = e1*e2;
+
+  vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
+  vobj vobj_zero;
+  zeroit(vobj_zero);
+  for (int r = 0; r<rd; r++) { 
+    mysum[r] = vobj_zero; 
+  }
+
+  commVector<vobj> reduction_buffer(rd*subvol_size);    
+
+  auto rb_p = &reduction_buffer[0];
+
+  // autoView(Data_v, Data, AcceleratorRead);
+
+  //prepare reduction buffer 
+  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
+  
+      int n = s / e2;
+      int b = s % e2;
+      int so=r*ostride; // base offset for start of plane 
+      int ss= so+n*stride+b;
+
+      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
+
+  });
+
+  for (int r = 0; r < rd; r++) {
+      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+          auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+          Reduction,
+          [=](cl::sycl::id<1> item, auto &sum) {
+              auto s = item[0];
+              sum += rb_p[r*subvol_size+s];
+          });
+      });
+      
+     
+  }
+  theGridAccelerator->wait();
+  for (int r = 0; r < rd; r++) {
+    lvSum[r] = mysum[r];
+  }
+  free(mysum,*theGridAccelerator);
+}
+#endif
+
+template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
  typedef typename vobj::vector_type vector;
  const int words = sizeof(vobj)/sizeof(vector);
  const int osites = rd*e1*e2;
@@ -106,7 +159,11 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
 	    buf[ss] = dat[ss*words+w];
    });

+    #if defined(GRID_CUDA) || defined(GRID_HIP)
      sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #elif defined(GRID_SYCL)
+      sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
+    #endif

    for (int r = 0; r < rd; r++) {
      lvSum_ptr[w+words*r]=lvSum_small[r];
@@ -117,66 +174,24 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
  
 }

-template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
-  autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
+  autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
    if constexpr (sizeof(vobj) <= 256) { 
+
+      #if defined(GRID_CUDA) || defined(GRID_HIP)
        sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #elif defined (GRID_SYCL)
+        sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      #endif
+
    }
    else {
-      sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
+      sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
    }
 }
-#endif


-#if defined(GRID_SYCL)
-template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
-{
-  typedef typename vobj::scalar_object sobj;
-  size_t subvol_size = e1*e2;
-
-  vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
-  vobj vobj_zero;
-  zeroit(vobj_zero);
-    
-  commVector<vobj> reduction_buffer(rd*subvol_size);    
-
-  auto rb_p = &reduction_buffer[0];
-
-  autoView(Data_v, Data, AcceleratorRead);
-
-  //prepare reduction buffer 
-  accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ 
-  
-      int n = s / e2;
-      int b = s % e2;
-      int so=r*ostride; // base offset for start of plane 
-      int ss= so+n*stride+b;
-
-      coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
-
-  });
-
-  for (int r = 0; r < rd; r++) {
-      mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
-      theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-          auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
-          cgh.parallel_for(cl::sycl::range<1>{subvol_size},
-          Reduction,
-          [=](cl::sycl::id<1> item, auto &sum) {
-              auto s = item[0];
-              sum += rb_p[r*subvol_size+s];
-          });
-      });
-      theGridAccelerator->wait();
-      lvSum[r] = mysum[0];
-  }
-  
-  free(mysum,*theGridAccelerator);
-}
-#endif
-
 template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
  // sum over reduced dimension planes, breaking out orthog dir
@@ -195,13 +210,9 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data

 template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) 
 {
-  #if defined(GRID_CUDA) || defined(GRID_HIP)
+  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
  
-  sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
-  
-  #elif defined(GRID_SYCL)
-  
-  sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
+  sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  
  #else
  sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -405,11 +405,4 @@ NAMESPACE_BEGIN(Grid);
 NAMESPACE_END(Grid);


-#ifdef GRID_SYCL
-template<typename T> struct
-sycl::is_device_copyable<T, typename std::enable_if<
-			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
-			      void>::type>
-  : public std::true_type {};
-#endif

--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -210,8 +210,8 @@ void acceleratorInit(void)
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
  theGridAccelerator = new sycl::queue (selectedDevice);
-  //  theCopyAccelerator = new sycl::queue (selectedDevice);
-  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
+  theCopyAccelerator = new sycl::queue (selectedDevice);
+  //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.

 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
  zeInit(0);
--- a/Grid/util/FlightRecorder.cc
+++ b/Grid/util/FlightRecorder.cc
@@ -247,9 +247,12 @@ void FlightRecorder::ReductionLog(double local,double global)
 }
 void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
 {
+  if(LoggingMode == LoggingModeNone) return;
+
  if ( ChecksumCommsSend ){
  uint64_t *ubuf = (uint64_t *)buf;
  if(LoggingMode == LoggingModeNone) return;
+  
 #ifdef GRID_SYCL
  uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
  if(LoggingMode == LoggingModePrint) {
@@ -285,12 +288,6 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
    XmitLoggingCounter++;
  }
 #endif
-  } else {
-    uint64_t word = 1;
-    deviceVector<uint64_t> dev(1);
-    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
-    acceleratorCopySynchronise();
-    MPI_Barrier(MPI_COMM_WORLD);
  }
 }
 void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
--- a/systems/Aurora/benchmarks/bench1.pbs
+++ b/systems/Aurora/benchmarks/bench1.pbs
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+#PBS -q debug
+#PBS -l select=1
+#PBS -l walltime=00:20:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+module load pti-gpu
+
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 2 nodes, 24 ranks
+#
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
+#$CMD | tee 1node.comms
+
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
+#$CMD | tee 1tile.dwf
+
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 1node.32.32.32.48.dwf
+
+
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.64.32.96.dwf
+
+CMD="mpiexec -np 12 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+#$CMD | tee 1node.64.32.32.48.dwf
+
--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@@ -1,10 +1,8 @@
 #!/bin/bash

-## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
-
-#PBS -q EarlyAppAccess
+#PBS -q workq
 #PBS -l select=2
-#PBS -l walltime=01:00:00
+#PBS -l walltime=00:20:00
 #PBS -A LatticeQCD_aesp_CNDA

 #export OMP_PROC_BIND=spread
@@ -13,11 +11,13 @@
 cd $PBS_O_WORKDIR

 source ../sourceme.sh
+module load pti-gpu

-export OMP_NUM_THREADS=3
+#cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=4
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

-
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -31,30 +31,25 @@ export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU

+# 12 ppn, 2 nodes, 24 ranks
+#
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
-	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" 
+$CMD | tee 2node.comms

-#$CMD 

 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.32.32.64.48.dwf

-#$CMD 

-CMD="mpiexec -np 1 -ppn 1  -envall \
+CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
+		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
+$CMD | tee 2node.64.64.64.96.dwf

-$CMD 
-
-CMD="mpiexec -np 1 -ppn 1  -envall \
-	     ./gpu_tile_compact.sh \
-	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
-		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
-
-$CMD 
--- a/systems/Aurora/benchmarks/gpu_tile_compact.sh
+++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh
@@ -1,33 +1,34 @@
 #!/bin/bash

-export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
-#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
-export  NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 )
-export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
-export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
+#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
+#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
+#export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
+
+export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
+export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )

 export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
-export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
-export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
 export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
-export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
-  
-#export GRID_MPICH_NIC_BIND=$NIC
-#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
  
 unset EnableWalkerPartition
 export EnableImplicitScaling=0
-export ZE_AFFINITY_MASK=$gpu_id.$tile_id
+export ZE_AFFINITY_MASK=$gpu_id
 export ONEAPI_DEVICE_FILTER=gpu,level_zero

-#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
 #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
-#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
+export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

-#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
+echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "

-numactl -m $NUMA -N $NUMAP  "$@"
+if [ $PALS_RANKID = "0" ]
+then
+#    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@"
+#    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
+    numactl -m $NUMA -N $NUMA  "$@"
+else 
+    numactl -m $NUMA -N $NUMA  "$@"
+fi
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -7,7 +7,7 @@
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
 	--enable-accelerator=sycl \
-	--enable-accelerator-aware-mpi=no\
+	--enable-accelerator-aware-mpi=yes\
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -1,7 +1,9 @@
 #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0

-module use /soft/modulefiles
-module load intel_compute_runtime/release/agama-devel-682.22
+module load oneapi/release/2023.12.15.001
+
+#module use /soft/modulefiles
+#module load intel_compute_runtime/release/agama-devel-682.22

 export FI_CXI_DEFAULT_CQ_SIZE=131072
 export FI_CXI_CQ_FILL_PERCENT=20
Author	SHA1	Message	Date
dbollweg	7f9d06f339	Merge `461cd045c6` into `ccf147d6c1`	2024-05-16 16:40:21 -04:00
Peter Boyle	ccf147d6c1	Select the compiler that gives better performance on sunspot	2024-05-07 18:45:56 +00:00
Peter Boyle	7aa12b446f	New config command for sunspot	2024-05-07 18:45:40 +00:00
Peter Boyle	c293228102	layout control	2024-05-07 18:45:21 +00:00
Peter Boyle	5c4c9f721a	Remove pbs file and replace with bench1 and bench2 for 1 and 2 nodes	2024-05-07 18:44:49 +00:00
Peter Boyle	057f86c1de	2 queues works ok in performance	2024-05-07 18:42:50 +00:00
Peter Boyle	cd52e3cbc2	Jobs on subspot	2024-05-07 18:38:15 +00:00
Peter Boyle	24602e1259	Accidental synchronise	2024-05-07 17:28:38 +00:00
Peter Boyle	8a098889fc	Update FlightRecorder.cc	2024-04-30 21:15:08 +01:00
Peter Boyle	ff2ea5de18	Update Tensor_traits.h	2024-04-11 14:25:45 -04:00
dbollweg	461cd045c6	sliceSum cleanup	2024-03-13 18:18:44 -04:00
dbollweg	fee65d7a75	Merge branch 'paboyle:develop' into sycl_slicesum_update	2024-03-13 18:06:17 -04:00
dbollweg	31f9971dbf	avoid PI_ERROR_OUT_OF_RESOURCES in sycl sliceSum	2024-03-13 13:39:26 -04:00
dbollweg	d87296f3e8	Merge branch 'develop' of https://github.com/dbollweg/Grid into develop	2024-03-06 16:54:22 -05:00
dbollweg	be94cf1c6f	Fewer wait-calls in sycl slicesum	2024-03-06 16:53:13 -05:00