From 22b43b86cb8737cd74f322f401f65e8549798a25 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 28 Feb 2024 12:57:17 +0100 Subject: [PATCH] Make GPT test suite work with SYCL --- Grid/lattice/Lattice_basis.h | 2 +- Grid/lattice/Lattice_transfer.h | 4 +-- Grid/threads/Accelerator.h | 48 +++++++++++++++++++++------------ 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 9415bd4f..03a869fb 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( !(defined(GRID_CUDA) || defined(GRID_HIP)) ) +#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) ) int max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 668ef4b4..a936b1c0 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -469,15 +469,13 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) Coordinate fine_rdimensions = fine->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions; - vobj zz = Zero(); - accelerator_for(sc,coarse->oSites(),1,{ // One thread per sub block Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate - vobj cd = zz; + vobj cd = Zero(); for(int sb=0;sbsubmit([&](cl::sycl::handler &cgh) { \ - unsigned long nt=acceleratorThreads(); \ - unsigned long unum1 = num1; \ - unsigned long unum2 = num2; \ - if(nt < 8)nt=8; \ - cl::sycl::range<3> local {nt,1,nsimd}; \ - cl::sycl::range<3> global{unum1,unum2,nsimd}; \ - cgh.parallel_for( \ - cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ - [[intel::reqd_sub_group_size(16)]] \ - { \ - auto iter1 = item.get_global_id(0); \ - auto iter2 = item.get_global_id(1); \ - auto lane = item.get_global_id(2); \ - { __VA_ARGS__ }; \ - }); \ - }); + unsigned long nt=acceleratorThreads(); \ + if(nt < 8)nt=8; \ + unsigned long unum1 = num1; \ + unsigned long unum2 = num2; \ + unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ + cl::sycl::range<3> local {nt,1,nsimd}; \ + cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ + if (unum1_divisible_by_nt != unum1) { \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + [[intel::reqd_sub_group_size(16)]] \ + { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { if (iter1 < unum1){ __VA_ARGS__ } }; \ + }); \ + } else { \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + [[intel::reqd_sub_group_size(16)]] \ + { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { __VA_ARGS__ }; \ + }); \ + } \ + }); #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }