1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-03-29 08:16:10 +01:00

Verbosity reduction batched inner product for reorthogonalization

This commit is contained in:
Chulwoo Jung
2026-03-17 13:02:16 -04:00
parent f3223021fd
commit 24752002fa
2 changed files with 157 additions and 15 deletions

View File

@@ -153,7 +153,7 @@ public:
// Step 1: extend from pStart steps to Nm steps // Step 1: extend from pStart steps to Nm steps
// ---------------------------------------------------------------- // ----------------------------------------------------------------
extendBasis(pStart, Nm, betaRestart); extendBasis(pStart, Nm, betaRestart);
verify(); // verify();
// ---------------------------------------------------------------- // ----------------------------------------------------------------
// Step 2: SVD of the Nm x Nm B matrix. // Step 2: SVD of the Nm x Nm B matrix.
@@ -208,7 +208,7 @@ public:
// Step 4: implicit restart — compress to Nk steps // Step 4: implicit restart — compress to Nk steps
// ---------------------------------------------------------------- // ----------------------------------------------------------------
implicitRestart(Nm, Nk, sigma, X, Y, order, betaK, betaRestart); implicitRestart(Nm, Nk, sigma, X, Y, order, betaK, betaRestart);
verify(); // verify();
// Lucky breakdown: exact invariant subspace found; convergence is exact. // Lucky breakdown: exact invariant subspace found; convergence is exact.
// B_p^+ = diag(alpha[0..Nk-1]); extract directly from restart basis. // B_p^+ = diag(alpha[0..Nk-1]); extract directly from restart basis.
@@ -360,7 +360,7 @@ private:
if (restart_col >= 0 && restart_col < m && (int)fvec.size() > 0) { if (restart_col >= 0 && restart_col < m && (int)fvec.size() > 0) {
for (int j = 0; j < restart_col && j < (int)fvec.size(); ++j){ for (int j = 0; j < restart_col && j < (int)fvec.size(); ++j){
B(j, restart_col) = fvec[j]; B(j, restart_col) = fvec[j];
std::cout << GridLogMessage << "buildFullB: B " <<j<<" "<<restart_col<<B(j, restart_col); std::cout << GridLogDebug << "buildFullB: B " <<j<<" "<<restart_col<<B(j, restart_col)<<std::endl;
} }
} }
return B; return B;
@@ -464,20 +464,162 @@ private:
} }
} }
public:
// ------------------------------------------------------------------
// Block reorthogonalization helpers.
// Declared public because CUDA extended lambdas cannot live inside
// private/protected member functions.
//
// batchInnerProducts: computes c[j] = <basis[j], vec> for all j
// in a single GPU pass (one accelerator_barrier instead of n).
// Queues n pairs of (per-site kernel, reduceKernel) to computeStream
// without intermediate CPU syncs, then syncs once at the end.
//
// batchUpdate: computes vec -= sum_j c[j]*basis[j] in one GPU kernel.
//
// reorthogonalize: two-pass Classical Gram-Schmidt (CGS2) using the
// two helpers above. Each pass costs 2 GPU syncs (1 IP + 1 update)
// instead of 2n syncs per pass in the old sequential MGS.
// ------------------------------------------------------------------
// Compute c[j] = <basis[j], vec> for every j in a single batched GPU
// pass: all per-site kernels and block reductions are queued to
// computeStream back-to-back, with ONE accelerator_barrier at the end
// and ONE batched MPI allreduce, instead of n sync+allreduce pairs.
// Subtracts nothing — pure inner products; c is resized to basis.size().
void batchInnerProducts(const Field &vec,
                        const std::vector<Field> &basis,
                        std::vector<ComplexD> &c)
{
  int n = (int)basis.size();
  c.resize(n);
  if (n == 0) return;   // nothing to project against
  typedef typename Field::vector_object vobj;
  typedef decltype(innerProduct(vobj(), vobj())) inner_t;
  typedef typename inner_t::scalar_objectD sobj;   // double-precision scalar accumulator type
  typedef decltype(basis[0].View(AcceleratorRead)) View;
  GridBase *grid = vec.Grid();
  uint64_t oSites = grid->oSites();
  uint64_t nsimd = grid->Nsimd();
  uint64_t size = oSites * nsimd;
  // Reduction geometry (same logic as sumD_gpu_small)
  int numThreads, numBlocks;
  int size_i = (int)size;   // NOTE(review): assumes oSites*nsimd fits in int — matches sumD_gpu_small's contract
  getNumBlocksAndThreads(size_i, sizeof(sobj), numThreads, numBlocks);
  int smemSize = numThreads * sizeof(sobj);
  // inner_tmp: reused per-site scratch for each basis vector j.
  // Safe because the stream serializes (per_site_j → reduce_j → per_site_{j+1}):
  // reduce_j finishes reading inner_tmp before per_site_{j+1} overwrites it.
  deviceVector<inner_t> inner_tmp(oSites);
  inner_t *inner_tmp_p = &inner_tmp[0];
  // buffers[j*numBlocks + 0] will hold the reduction result for basis j.
  deviceVector<sobj> buffers(n * numBlocks);
  sobj *buf_p = &buffers[0];
  // Basis views (opened once here, closed at end of function — must not
  // return early between here and the ViewClose loop below)
  hostVector<View> h_basis_v(n);
  deviceVector<View> d_basis_v(n);
  for (int j = 0; j < n; ++j) {
    h_basis_v[j] = basis[j].View(AcceleratorRead);
    acceleratorPut(d_basis_v[j], h_basis_v[j]);
  }
  View *basis_vp = &d_basis_v[0];
  // Queue all 2n kernels to computeStream — no intermediate barriers.
  // CUDA stream ordering guarantees reduce_j runs after per_site_j.
  autoView(vec_v, vec, AcceleratorRead);
  for (int j = 0; j < n; ++j) {
    // Per-site inner products: inner_tmp[ss] = <basis[j][ss] | vec[ss]>
    int jj = j;   // copy: extended lambda must not capture the loop variable by reference
    accelerator_for(ss, oSites, nsimd, {
      auto x_l = coalescedRead(basis_vp[jj][ss]);
      auto y_l = coalescedRead(vec_v[ss]);
      coalescedWrite(inner_tmp_p[ss], innerProduct(x_l, y_l));
    });
    // Reduce inner_tmp → buf_p + j*numBlocks (async, no barrier)
    reduceKernel<<<numBlocks, numThreads, smemSize, computeStream>>>(
      inner_tmp_p, buf_p + j * numBlocks, size_i);
  }
  // ONE sync after all 2n kernels are queued
  accelerator_barrier();
  // Batch copy all n results (result for j sits at buf[j*numBlocks+0])
  hostVector<sobj> buf_h(n * numBlocks);
  acceleratorCopyFromDevice(buf_p, &buf_h[0], n * numBlocks * sizeof(sobj));
  // Convert sobj → ComplexD and do ONE batched MPI allreduce over all n
  // node-local partial sums.
  std::vector<ComplexD> raw(n);
  for (int j = 0; j < n; ++j)
    raw[j] = TensorRemove(buf_h[j * numBlocks]);
  grid->GlobalSumVector(&raw[0], n);
  for (int j = 0; j < n; ++j) c[j] = raw[j];
  for (int j = 0; j < n; ++j) h_basis_v[j].ViewClose();
}
// Apply vec -= sum_j c[j]*basis[j] with a SINGLE fused GPU kernel,
// replacing n separate axpy-style updates (and their n syncs).
// c must have at least basis.size() entries (as produced by
// batchInnerProducts); basis vectors are read, only vec is written.
void batchUpdate(Field &vec,
                 const std::vector<Field> &basis,
                 const std::vector<ComplexD> &c)
{
  int n = (int)basis.size();
  if (n == 0) return;   // nothing to subtract
  typedef typename Field::vector_object vobj;   // NOTE(review): unused here; kept for symmetry with batchInnerProducts
  typedef decltype(basis[0].View(AcceleratorRead)) View;
  GridBase *grid = vec.Grid();
  uint64_t oSites = grid->oSites();
  uint64_t nsimd = grid->Nsimd();
  // Split complex coefficients into real/imag double arrays on device.
  // Using doubles avoids potential ComplexD-device-code compatibility issues.
  hostVector<double> h_re(n), h_im(n);
  deviceVector<double> d_re(n), d_im(n);
  for (int k = 0; k < n; ++k) {
    h_re[k] = c[k].real();
    h_im[k] = c[k].imag();
  }
  acceleratorCopyToDevice(&h_re[0], &d_re[0], n * sizeof(double));
  acceleratorCopyToDevice(&h_im[0], &d_im[0], n * sizeof(double));
  double *re_p = &d_re[0];
  double *im_p = &d_im[0];
  // Basis views (opened once, closed after the kernel below)
  hostVector<View> h_basis_v(n);
  deviceVector<View> d_basis_v(n);
  for (int k = 0; k < n; ++k) {
    h_basis_v[k] = basis[k].View(AcceleratorRead);
    acceleratorPut(d_basis_v[k], h_basis_v[k]);
  }
  View *basis_vp = &d_basis_v[0];
  // Single kernel: vec[ss] -= sum_k (re[k] + i*im[k]) * basis[k][ss]
  // (the complex multiply is spelled out as re*b + i*(im*b)).
  autoView(vec_v, vec, AcceleratorWrite);
  accelerator_for(ss, oSites, nsimd, {
    auto v = coalescedRead(vec_v[ss]);
    for (int k = 0; k < n; ++k) {
      auto b = coalescedRead(basis_vp[k][ss]);
      v = v - re_p[k] * b - timesI(im_p[k] * b);
    }
    coalescedWrite(vec_v[ss], v);
  });
  for (int k = 0; k < n; ++k) h_basis_v[k].ViewClose();
}
// ------------------------------------------------------------------
// Full reorthogonalization of vec against the vectors in basis via
// two-pass Classical Gram-Schmidt (CGS2). Each pass does one batched
// inner-product sweep plus one batched update — O(1) device syncs per
// pass instead of O(n) for the old sequential MGS loop.
// Subtracts projections only — does NOT normalize vec.
// ------------------------------------------------------------------
void reorthogonalize(Field &vec, const std::vector<Field> &basis)
{
  if (basis.empty()) return;
  std::vector<ComplexD> proj;
  // Pass 1: remove the components of vec along every basis vector.
  batchInnerProducts(vec, basis, proj);
  batchUpdate(vec, basis, proj);
  // Pass 2: repeat to mop up the rounding error left by pass 1
  // ("twice is enough" reorthogonalization).
  batchInnerProducts(vec, basis, proj);
  batchUpdate(vec, basis, proj);
}

View File

@@ -200,8 +200,8 @@ int main(int argc, char** argv) {
RD.close(); RD.close();
} }
// std::vector<Complex> boundary = {1,1,1,-1}; std::vector<Complex> boundary = {1,1,1,-1};
std::vector<Complex> boundary = {1,1,1,1}; // std::vector<Complex> boundary = {1,1,1,1};
FermionOp::ImplParams Params(boundary); FermionOp::ImplParams Params(boundary);