Changed batchedInnerProduct for portability

2026-07-07 19:03:30 +01:00 · 2026-03-17 18:54:18 -04:00
parent 24752002fa
commit 09aa843984
1 changed files with 23 additions and 35 deletions
@@ -492,31 +492,17 @@ public:

    typedef typename Field::vector_object         vobj;
    typedef decltype(innerProduct(vobj(), vobj())) inner_t;
-    typedef typename inner_t::scalar_objectD       sobj;
    typedef decltype(basis[0].View(AcceleratorRead)) View;

    GridBase *grid = vec.Grid();
    uint64_t oSites = grid->oSites();
    uint64_t nsimd  = grid->Nsimd();
-    uint64_t size   = oSites * nsimd;

-    // Reduction geometry (same logic as sumD_gpu_small)
-    int numThreads, numBlocks;
-    int size_i = (int)size;
-    getNumBlocksAndThreads(size_i, sizeof(sobj), numThreads, numBlocks);
-    int smemSize = numThreads * sizeof(sobj);
+    // all_ip[j * oSites + ss] = per-site inner product of basis[j] and vec at site ss.
+    // Layout: n contiguous blocks of oSites each.
+    deviceVector<inner_t> all_ip((uint64_t)n * oSites);
+    inner_t *all_ip_p = &all_ip[0];

-    // inner_tmp: reused per-site scratch for each basis vector j.
-    // Safe because the stream serializes (per_site_j → reduce_j → per_site_{j+1}):
-    // reduce_j finishes reading inner_tmp before per_site_{j+1} overwrites it.
-    deviceVector<inner_t> inner_tmp(oSites);
-    inner_t *inner_tmp_p = &inner_tmp[0];
-
-    // buffers[j*numBlocks + 0] will hold the reduction result for basis j.
-    deviceVector<sobj> buffers(n * numBlocks);
-    sobj *buf_p = &buffers[0];
-
-    // Basis views (opened once, closed at end of function)
    hostVector<View>   h_basis_v(n);
    deviceVector<View> d_basis_v(n);
    for (int j = 0; j < n; ++j) {
@@ -525,33 +511,35 @@ public:
    }
    View *basis_vp = &d_basis_v[0];

-    // Queue all 2n kernels to computeStream — no intermediate barriers.
-    // CUDA stream ordering guarantees reduce_j runs after per_site_j.
+    // Queue n per-site kernels to the accelerator stream — no intermediate barriers.
    autoView(vec_v, vec, AcceleratorRead);
    for (int j = 0; j < n; ++j) {
-      // Per-site inner products: inner_tmp[ss] = <basis[j][ss] | vec[ss]>
-      int jj = j;
+      int      jj      = j;
+      uint64_t oSites_ = oSites;
      accelerator_for(ss, oSites, nsimd, {
-        auto x_l = coalescedRead(basis_vp[jj][ss]);
-        auto y_l = coalescedRead(vec_v[ss]);
-        coalescedWrite(inner_tmp_p[ss], innerProduct(x_l, y_l));
+        auto x = coalescedRead(basis_vp[jj][ss]);
+        auto y = coalescedRead(vec_v[ss]);
+        coalescedWrite(all_ip_p[jj * oSites_ + ss], innerProduct(x, y));
      });
-      // Reduce inner_tmp → buf_p + j*numBlocks  (async, no barrier)
-      reduceKernel<<<numBlocks, numThreads, smemSize, computeStream>>>(
-          inner_tmp_p, buf_p + j * numBlocks, size_i);
    }

-    // ONE sync after all 2n kernels are queued
+    // ONE sync after all n kernels
    accelerator_barrier();

-    // Batch copy all n results (result for j sits at buf[j*numBlocks+0])
-    hostVector<sobj> buf_h(n * numBlocks);
-    acceleratorCopyFromDevice(buf_p, &buf_h[0], n * numBlocks * sizeof(sobj));
+    // Copy all per-site results to host
+    hostVector<inner_t> all_ip_h((uint64_t)n * oSites);
+    acceleratorCopyFromDevice(all_ip_p, &all_ip_h[0], (uint64_t)n * oSites * sizeof(inner_t));

-    // Convert sobj → ComplexD and do ONE batched MPI allreduce
+    // Reduce on host: sum over oSites, then collapse SIMD lanes via Reduce(TensorRemove(...))
+    // TensorRemove strips the iSinglet tensor wrapper to expose the SIMD scalar type.
+    // Reduce sums all nsimd lanes and returns a plain scalar (RealD or ComplexD).
    std::vector<ComplexD> raw(n);
-    for (int j = 0; j < n; ++j)
-      raw[j] = TensorRemove(buf_h[j * numBlocks]);
+    for (int j = 0; j < n; ++j) {
+      inner_t sum = Zero();
+      for (uint64_t ss = 0; ss < oSites; ++ss)
+        sum += all_ip_h[(uint64_t)j * oSites + ss];
+      raw[j] = ComplexD(Reduce(TensorRemove(sum)));
+    }
    grid->GlobalSumVector(&raw[0], n);
    for (int j = 0; j < n; ++j) c[j] = raw[j];