From 622f78ebea56fa355afeb69db83ab2cfa2cfa2bc Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ed.ac.uk>
Date: Wed, 4 Sep 2024 13:53:48 +0000
Subject: [PATCH] SYCL updates -- operator = giving trouble on Aurora. SYCL
 reduction is failing intermittently with SVM interface - returns zero, expect
 non-zero. Think I need to remove ALL dependence on SVM.

---
 Grid/lattice/Lattice_base.h           |  7 ++--
 Grid/lattice/Lattice_reduction_sycl.h | 46 ++++++++++++++++++++++++---
 2 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index 9d4d3d5f..515c847f 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -236,10 +236,13 @@ public:
   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
     vobj vtmp;
     vtmp = r;
-#if defined(GRID_HIP) || defined(GRID_CUDA) || defined (GRID_SYCL)
+#if 0
+    deviceVector<vobj> vvtmp(1);
+    acceleratorPut(vvtmp[0],vtmp);
+    vobj *vvtmp_p = & vvtmp[0];
     auto me  = View(AcceleratorWrite);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
-	auto stmp=coalescedRead(vtmp);
+	auto stmp=coalescedRead(*vvtmp_p);
 	coalescedWrite(me[ss],stmp);
     });
 #else    
diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h
index b8dc5378..7dff7939 100644
--- a/Grid/lattice/Lattice_reduction_sycl.h
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -4,16 +4,36 @@ NAMESPACE_BEGIN(Grid);
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) 
 {
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_objectD sobjD;
+#if 1
+  sobj identity; zeroit(identity);
+  sobj ret; zeroit(ret);
+  Integer nsimd= vobj::Nsimd();
+  { 
+    sycl::buffer<sobj, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
+      cgh.parallel_for(cl::sycl::range<1>{osites},
+                      Reduction,
+                      [=] (cl::sycl::id<1> item, auto &sum) {
+                        auto osite   = item[0];
+                        sum +=Reduce(lat[osite]);
+                      });
+    });
+  }
+  sobjD dret; convertType(dret,ret);
+  return dret;
+#else
   static Vector<sobj> mysum;
   mysum.resize(1);
   sobj *mysum_p = & mysum[0];
   sobj identity; zeroit(identity);
-  mysum[0] = identity;
+  acceleratorPut(mysum[0],identity);
   sobj ret ; 
 
   Integer nsimd= vobj::Nsimd();
@@ -33,6 +53,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
   //  free(mysum,*theGridAccelerator);
   sobjD dret; convertType(dret,ret);
   return dret;
+#endif
 }
 
 template <class vobj>
@@ -76,12 +97,28 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
 
 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Word xorResult; xorResult = 0;
+#if 1
+  Word identity;  identity=0;
+  Word ret = 0;
+  { 
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(cl::sycl::range<1>{L},
+                      Reduction,
+                      [=] (cl::sycl::id<1> index, auto &sum) {
+                        sum ^=vec[index];
+                      });
+    });
+  }
+  theGridAccelerator->wait();
+  return ret;
+#else
   static Vector<Word> d_sum;
   d_sum.resize(1);
   Word *d_sum_p=&d_sum[0];
   Word identity;  identity=0;
-  d_sum[0] = identity;
+  acceleratorPut(d_sum[0],identity);
   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
     auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
@@ -92,9 +129,10 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
      });
    });
   theGridAccelerator->wait();
-  Word ret = d_sum[0];
+  Word ret = acceleratorGet(d_sum[0]);
   //  free(d_sum,*theGridAccelerator);
   return ret;
+#endif
 }
 
 NAMESPACE_END(Grid);