Make GPT test suite work with SYCL

2026-02-28 09:36:13 +00:00 · 2024-02-28 12:57:17 +01:00
parent 66391f84f2
commit 22b43b86cb
3 changed files with 33 additions and 21 deletions
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }

-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP)) )
+#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;

-  vobj zz = Zero();
-  
  accelerator_for(sc,coarse->oSites(),1,{

      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate

-      vobj cd = zz;
+      vobj cd = Zero();
      
      for(int sb=0;sb<blockVol;sb++){

--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -287,23 +287,37 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {

 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-      unsigned long nt=acceleratorThreads();				\
-      unsigned long unum1 = num1;					\
-      unsigned long unum2 = num2;					\
-      if(nt < 8)nt=8;							\
-      cl::sycl::range<3> local {nt,1,nsimd};				\
-      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
-      cgh.parallel_for(					\
-      cl::sycl::nd_range<3>(global,local), \
-      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
-      [[intel::reqd_sub_group_size(16)]]	      \
-      {						      \
-      auto iter1    = item.get_global_id(0);	      \
-      auto iter2    = item.get_global_id(1);	      \
-      auto lane     = item.get_global_id(2);	      \
-      { __VA_ARGS__ };				      \
-     });	   			              \
-    });
+    unsigned long nt=acceleratorThreads();				\
+    if(nt < 8)nt=8;							\
+    unsigned long unum1 = num1;						\
+    unsigned long unum2 = num2;						\
+    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
+    cl::sycl::range<3> local {nt,1,nsimd};				\
+    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
+    if (unum1_divisible_by_nt != unum1) {				\
+      cgh.parallel_for(							\
+		       cl::sycl::nd_range<3>(global,local),		\
+		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		       [[intel::reqd_sub_group_size(16)]]		\
+		       {						\
+			 auto iter1    = item.get_global_id(0);		\
+			 auto iter2    = item.get_global_id(1);		\
+			 auto lane     = item.get_global_id(2);		\
+			 { if (iter1 < unum1){ __VA_ARGS__ } };		\
+		       });						\
+    } else {								\
+      cgh.parallel_for(							\
+		       cl::sycl::nd_range<3>(global,local),		\
+		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
+		       [[intel::reqd_sub_group_size(16)]]		\
+		       {						\
+			 auto iter1    = item.get_global_id(0);		\
+			 auto iter2    = item.get_global_id(1);		\
+			 auto lane     = item.get_global_id(2);		\
+			 { __VA_ARGS__ };				\
+		       });						\
+    }									\
+  });

 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }