Make GPT test suite work with SYCL

2026-05-22 10:04:17 +01:00 · 2024-02-28 12:57:17 +01:00
parent 66391f84f2
commit 22b43b86cb
3 changed files with 33 additions and 21 deletions
@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }
-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP)) )
+#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
  int max_threads = thread_max();
  Vector < vobj > Bt(Nm * max_threads);
  thread_region
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  Coordinate fine_rdimensions = fine->_rdimensions;
  Coordinate coarse_rdimensions = coarse->_rdimensions;
  vobj zz = Zero();
  accelerator_for(sc,coarse->oSites(),1,{
      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
-      vobj cd = zz;
+      vobj cd = Zero();
      for(int sb=0;sb<blockVol;sb++){
@@ -287,23 +287,37 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
-      unsigned long nt=acceleratorThreads();				\
+    unsigned long nt=acceleratorThreads();				\
-      unsigned long unum1 = num1;					\
+    if(nt < 8)nt=8;							\
-      unsigned long unum2 = num2;					\
+    unsigned long unum1 = num1;						\
-      if(nt < 8)nt=8;							\
+    unsigned long unum2 = num2;						\
-      cl::sycl::range<3> local {nt,1,nsimd};				\
+    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
-      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
+    cl::sycl::range<3> local {nt,1,nsimd};				\
-      cgh.parallel_for(					\
+    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
-      cl::sycl::nd_range<3>(global,local), \
+    if (unum1_divisible_by_nt != unum1) {				\
-      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
+      cgh.parallel_for(							\
-      [[intel::reqd_sub_group_size(16)]]	      \
+		       cl::sycl::nd_range<3>(global,local),		\
-      {						      \
+		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
-      auto iter1    = item.get_global_id(0);	      \
+		       [[intel::reqd_sub_group_size(16)]]		\
-      auto iter2    = item.get_global_id(1);	      \
+		       {						\
-      auto lane     = item.get_global_id(2);	      \
+			 auto iter1    = item.get_global_id(0);		\
-      { __VA_ARGS__ };				      \
+			 auto iter2    = item.get_global_id(1);		\
-     });	   			              \
+			 auto lane     = item.get_global_id(2);		\
-    });
+			 { if (iter1 < unum1){ __VA_ARGS__ } };		\
 		       });						\
    } else {								\
      cgh.parallel_for(							\
 		       cl::sycl::nd_range<3>(global,local),		\
 		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
 		       [[intel::reqd_sub_group_size(16)]]		\
 		       {						\
 			 auto iter1    = item.get_global_id(0);		\
 			 auto iter2    = item.get_global_id(1);		\
 			 auto lane     = item.get_global_id(2);		\
 			 { __VA_ARGS__ };				\
 		       });						\
    }									\
  });
 #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }