From 22b43b86cb8737cd74f322f401f65e8549798a25 Mon Sep 17 00:00:00 2001
From: Christoph Lehner <christoph@lhnr.de>
Date: Wed, 28 Feb 2024 12:57:17 +0100
Subject: [PATCH] Make GPT test suite work with SYCL

---
 Grid/lattice/Lattice_basis.h    |  2 +-
 Grid/lattice/Lattice_transfer.h |  4 +--
 Grid/threads/Accelerator.h      | 48 +++++++++++++++++++++------------
 3 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h
index 9415bd4f..03a869fb 100644
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
     basis_v.push_back(basis[k].View(AcceleratorWrite));
   }
 
-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP)) )
+#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
   int max_threads = thread_max();
   Vector < vobj > Bt(Nm * max_threads);
   thread_region
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 668ef4b4..a936b1c0 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
   Coordinate fine_rdimensions = fine->_rdimensions;
   Coordinate coarse_rdimensions = coarse->_rdimensions;
 
-  vobj zz = Zero();
-  
   accelerator_for(sc,coarse->oSites(),1,{
 
       // One thread per sub block
       Coordinate coor_c(_ndimension);
       Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
 
-      vobj cd = zz;
+      vobj cd = Zero();
       
       for(int sb=0;sb<blockVol;sb++){
 
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index ff5ccd7a..b4df0924 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -287,23 +287,37 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-      unsigned long nt=acceleratorThreads();				\
-      unsigned long unum1 = num1;					\
-      unsigned long unum2 = num2;					\
-      if(nt < 8)nt=8;							\
-      cl::sycl::range<3> local {nt,1,nsimd};				\
-      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
-      cgh.parallel_for(					\
-      cl::sycl::nd_range<3>(global,local), \
-      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
-      [[intel::reqd_sub_group_size(16)]]	      \
-      {						      \
-      auto iter1    = item.get_global_id(0);	      \
-      auto iter2    = item.get_global_id(1);	      \
-      auto lane     = item.get_global_id(2);	      \
-      { __VA_ARGS__ };				      \
-     });	   			              \
-    });
+    unsigned long nt=acceleratorThreads();				\
+    if(nt < 8)nt=8;							\
+    unsigned long unum1 = num1;						\
+    unsigned long unum2 = num2;						\
+    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
+    cl::sycl::range<3> local {nt,1,nsimd};				\
+    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
+    if (unum1_divisible_by_nt != unum1) {				\
+      cgh.parallel_for(							\
+		       cl::sycl::nd_range<3>(global,local),		\
+		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		       [[intel::reqd_sub_group_size(16)]]		\
+		       {						\
+			 auto iter1    = item.get_global_id(0);		\
+			 auto iter2    = item.get_global_id(1);		\
+			 auto lane     = item.get_global_id(2);		\
+			 { if (iter1 < unum1){ __VA_ARGS__ } };		\
+		       });						\
+    } else {								\
+      cgh.parallel_for(							\
+		       cl::sycl::nd_range<3>(global,local),		\
+		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		       [[intel::reqd_sub_group_size(16)]]		\
+		       {						\
+			 auto iter1    = item.get_global_id(0);		\
+			 auto iter2    = item.get_global_id(1);		\
+			 auto lane     = item.get_global_id(2);		\
+			 { __VA_ARGS__ };				\
+		       });						\
+    }									\
+  });
 
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }