SYCL updates -- operator = giving trouble on Aurora.

SYCL reduction is failing intermittently with SVM interface - returns zero, expect non-zero. Think I need to remove ALL dependence on SVM.
2026-07-08 03:13:29 +01:00 · 2024-09-04 13:53:48 +00:00
parent aa67a5b095
commit 622f78ebea
2 changed files with 47 additions and 6 deletions
@@ -236,10 +236,13 @@ public:
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
    vobj vtmp;
    vtmp = r;
-#if defined(GRID_HIP) || defined(GRID_CUDA) || defined (GRID_SYCL)
+#if 0
+    deviceVector<vobj> vvtmp(1);
+    acceleratorPut(vvtmp[0],vtmp);
+    vobj *vvtmp_p = & vvtmp[0];
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
-	auto stmp=coalescedRead(vtmp);
+	auto stmp=coalescedRead(*vvtmp_p);
 	coalescedWrite(me[ss],stmp);
    });
 #else    
@@ -4,16 +4,36 @@ NAMESPACE_BEGIN(Grid);
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////

+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_objectD sobjD;
+#if 1
+  sobj identity; zeroit(identity);
+  sobj ret; zeroit(ret);
+  Integer nsimd= vobj::Nsimd();
+  { 
+    sycl::buffer<sobj, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
+      cgh.parallel_for(cl::sycl::range<1>{osites},
+                      Reduction,
+                      [=] (cl::sycl::id<1> item, auto &sum) {
+                        auto osite   = item[0];
+                        sum +=Reduce(lat[osite]);
+                      });
+    });
+  }
+  sobjD dret; convertType(dret,ret);
+  return dret;
+#else
  static Vector<sobj> mysum;
  mysum.resize(1);
  sobj *mysum_p = & mysum[0];
  sobj identity; zeroit(identity);
-  mysum[0] = identity;
+  acceleratorPut(mysum[0],identity);
  sobj ret ; 

  Integer nsimd= vobj::Nsimd();
@@ -33,6 +53,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
  //  free(mysum,*theGridAccelerator);
  sobjD dret; convertType(dret,ret);
  return dret;
+#endif
 }

 template <class vobj>
@@ -76,12 +97,28 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite

 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Word xorResult; xorResult = 0;
+#if 1
+  Word identity;  identity=0;
+  Word ret = 0;
+  { 
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(cl::sycl::range<1>{L},
+                      Reduction,
+                      [=] (cl::sycl::id<1> index, auto &sum) {
+                        sum ^=vec[index];
+                      });
+    });
+  }
+  theGridAccelerator->wait();
+  return ret;
+#else
  static Vector<Word> d_sum;
  d_sum.resize(1);
  Word *d_sum_p=&d_sum[0];
  Word identity;  identity=0;
-  d_sum[0] = identity;
+  acceleratorPut(d_sum[0],identity);
  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
    auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
@@ -92,9 +129,10 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
     });
   });
  theGridAccelerator->wait();
-  Word ret = d_sum[0];
+  Word ret = acceleratorGet(d_sum[0]);
  //  free(d_sum,*theGridAccelerator);
  return ret;
+#endif
 }

 NAMESPACE_END(Grid);