mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Make GPT test suite work with SYCL
This commit is contained in:
		@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
 | 
			
		||||
    basis_v.push_back(basis[k].View(AcceleratorWrite));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP)) )
 | 
			
		||||
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
 | 
			
		||||
  int max_threads = thread_max();
 | 
			
		||||
  Vector < vobj > Bt(Nm * max_threads);
 | 
			
		||||
  thread_region
 | 
			
		||||
 
 | 
			
		||||
@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
			
		||||
  Coordinate fine_rdimensions = fine->_rdimensions;
 | 
			
		||||
  Coordinate coarse_rdimensions = coarse->_rdimensions;
 | 
			
		||||
 | 
			
		||||
  vobj zz = Zero();
 | 
			
		||||
  
 | 
			
		||||
  accelerator_for(sc,coarse->oSites(),1,{
 | 
			
		||||
 | 
			
		||||
      // One thread per sub block
 | 
			
		||||
      Coordinate coor_c(_ndimension);
 | 
			
		||||
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
 | 
			
		||||
 | 
			
		||||
      vobj cd = zz;
 | 
			
		||||
      vobj cd = Zero();
 | 
			
		||||
      
 | 
			
		||||
      for(int sb=0;sb<blockVol;sb++){
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -287,23 +287,37 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
 | 
			
		||||
 | 
			
		||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
 | 
			
		||||
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\
 | 
			
		||||
      unsigned long nt=acceleratorThreads();				\
 | 
			
		||||
      unsigned long unum1 = num1;					\
 | 
			
		||||
      unsigned long unum2 = num2;					\
 | 
			
		||||
      if(nt < 8)nt=8;							\
 | 
			
		||||
      cl::sycl::range<3> local {nt,1,nsimd};				\
 | 
			
		||||
      cl::sycl::range<3> global{unum1,unum2,nsimd};			\
 | 
			
		||||
      cgh.parallel_for(					\
 | 
			
		||||
      cl::sycl::nd_range<3>(global,local), \
 | 
			
		||||
      [=] (cl::sycl::nd_item<3> item) /*mutable*/     \
 | 
			
		||||
      [[intel::reqd_sub_group_size(16)]]	      \
 | 
			
		||||
      {						      \
 | 
			
		||||
      auto iter1    = item.get_global_id(0);	      \
 | 
			
		||||
      auto iter2    = item.get_global_id(1);	      \
 | 
			
		||||
      auto lane     = item.get_global_id(2);	      \
 | 
			
		||||
      { __VA_ARGS__ };				      \
 | 
			
		||||
     });	   			              \
 | 
			
		||||
    });
 | 
			
		||||
    unsigned long nt=acceleratorThreads();				\
 | 
			
		||||
    if(nt < 8)nt=8;							\
 | 
			
		||||
    unsigned long unum1 = num1;						\
 | 
			
		||||
    unsigned long unum2 = num2;						\
 | 
			
		||||
    unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\
 | 
			
		||||
    cl::sycl::range<3> local {nt,1,nsimd};				\
 | 
			
		||||
    cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\
 | 
			
		||||
    if (unum1_divisible_by_nt != unum1) {				\
 | 
			
		||||
      cgh.parallel_for(							\
 | 
			
		||||
		       cl::sycl::nd_range<3>(global,local),		\
 | 
			
		||||
		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
 | 
			
		||||
		       [[intel::reqd_sub_group_size(16)]]		\
 | 
			
		||||
		       {						\
 | 
			
		||||
			 auto iter1    = item.get_global_id(0);		\
 | 
			
		||||
			 auto iter2    = item.get_global_id(1);		\
 | 
			
		||||
			 auto lane     = item.get_global_id(2);		\
 | 
			
		||||
			 { if (iter1 < unum1){ __VA_ARGS__ } };		\
 | 
			
		||||
		       });						\
 | 
			
		||||
    } else {								\
 | 
			
		||||
      cgh.parallel_for(							\
 | 
			
		||||
		       cl::sycl::nd_range<3>(global,local),		\
 | 
			
		||||
		       [=] (cl::sycl::nd_item<3> item) /*mutable*/	\
 | 
			
		||||
		       [[intel::reqd_sub_group_size(16)]]		\
 | 
			
		||||
		       {						\
 | 
			
		||||
			 auto iter1    = item.get_global_id(0);		\
 | 
			
		||||
			 auto iter2    = item.get_global_id(1);		\
 | 
			
		||||
			 auto lane     = item.get_global_id(2);		\
 | 
			
		||||
			 { __VA_ARGS__ };				\
 | 
			
		||||
		       });						\
 | 
			
		||||
    }									\
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user