1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-03-29 08:16:10 +01:00

Verbosity reduction batched inner product for reorthogonalization

This commit is contained in:
Chulwoo Jung
2026-03-17 13:02:16 -04:00
parent f3223021fd
commit 24752002fa
2 changed files with 157 additions and 15 deletions

View File

@@ -153,7 +153,7 @@ public:
// Step 1: extend from pStart steps to Nm steps // Step 1: extend from pStart steps to Nm steps
// ---------------------------------------------------------------- // ----------------------------------------------------------------
extendBasis(pStart, Nm, betaRestart); extendBasis(pStart, Nm, betaRestart);
verify(); // verify();
// ---------------------------------------------------------------- // ----------------------------------------------------------------
// Step 2: SVD of the Nm x Nm B matrix. // Step 2: SVD of the Nm x Nm B matrix.
@@ -208,7 +208,7 @@ public:
// Step 4: implicit restart — compress to Nk steps // Step 4: implicit restart — compress to Nk steps
// ---------------------------------------------------------------- // ----------------------------------------------------------------
implicitRestart(Nm, Nk, sigma, X, Y, order, betaK, betaRestart); implicitRestart(Nm, Nk, sigma, X, Y, order, betaK, betaRestart);
verify(); // verify();
// Lucky breakdown: exact invariant subspace found; convergence is exact. // Lucky breakdown: exact invariant subspace found; convergence is exact.
// B_p^+ = diag(alpha[0..Nk-1]); extract directly from restart basis. // B_p^+ = diag(alpha[0..Nk-1]); extract directly from restart basis.
@@ -360,7 +360,7 @@ private:
if (restart_col >= 0 && restart_col < m && (int)fvec.size() > 0) { if (restart_col >= 0 && restart_col < m && (int)fvec.size() > 0) {
for (int j = 0; j < restart_col && j < (int)fvec.size(); ++j){ for (int j = 0; j < restart_col && j < (int)fvec.size(); ++j){
B(j, restart_col) = fvec[j]; B(j, restart_col) = fvec[j];
std::cout << GridLogMessage << "buildFullB: B " <<j<<" "<<restart_col<<B(j, restart_col); std::cout << GridLogDebug << "buildFullB: B " <<j<<" "<<restart_col<<B(j, restart_col)<<std::endl;
} }
} }
return B; return B;
@@ -464,20 +464,162 @@ private:
} }
} }
public:
// ------------------------------------------------------------------
// Block reorthogonalization helpers.
// Declared public because CUDA extended lambdas cannot live inside
// private/protected member functions.
//
// batchInnerProducts: computes c[j] = <basis[j], vec> for all j
// in a single GPU pass (one accelerator_barrier instead of n).
// Queues n pairs of (per-site kernel, reduceKernel) to computeStream
// without intermediate CPU syncs, then syncs once at the end.
//
// batchUpdate: computes vec -= sum_j c[j]*basis[j] in one GPU kernel.
//
// reorthogonalize: two-pass Classical Gram-Schmidt (CGS2) using the
// two helpers above. Each pass costs 2 GPU syncs (1 IP + 1 update)
// instead of 2n syncs per pass in the old sequential MGS.
// ------------------------------------------------------------------
// Compute c[j] = <basis[j], vec> for every j in a single batched GPU
// pass: all per-site kernels and block reductions are queued to
// computeStream back-to-back, with ONE accelerator_barrier at the end
// and ONE batched MPI allreduce, instead of n sync+allreduce pairs.
// Subtracts nothing — pure inner products; c is resized to basis.size().
void batchInnerProducts(const Field &vec,
                        const std::vector<Field> &basis,
                        std::vector<ComplexD> &c)
{
  int n = (int)basis.size();
  c.resize(n);
  if (n == 0) return;   // nothing to project against
  typedef typename Field::vector_object vobj;
  typedef decltype(innerProduct(vobj(), vobj())) inner_t;
  typedef typename inner_t::scalar_objectD sobj;   // double-precision scalar accumulator type
  typedef decltype(basis[0].View(AcceleratorRead)) View;
  GridBase *grid = vec.Grid();
  uint64_t oSites = grid->oSites();
  uint64_t nsimd = grid->Nsimd();
  uint64_t size = oSites * nsimd;
  // Reduction geometry (same logic as sumD_gpu_small)
  int numThreads, numBlocks;
  int size_i = (int)size;   // NOTE(review): assumes oSites*nsimd fits in int — matches sumD_gpu_small's contract
  getNumBlocksAndThreads(size_i, sizeof(sobj), numThreads, numBlocks);
  int smemSize = numThreads * sizeof(sobj);
  // inner_tmp: reused per-site scratch for each basis vector j.
  // Safe because the stream serializes (per_site_j → reduce_j → per_site_{j+1}):
  // reduce_j finishes reading inner_tmp before per_site_{j+1} overwrites it.
  deviceVector<inner_t> inner_tmp(oSites);
  inner_t *inner_tmp_p = &inner_tmp[0];
  // buffers[j*numBlocks + 0] will hold the reduction result for basis j.
  deviceVector<sobj> buffers(n * numBlocks);
  sobj *buf_p = &buffers[0];
  // Basis views (opened once here, closed at end of function — must not
  // return early between here and the ViewClose loop below)
  hostVector<View> h_basis_v(n);
  deviceVector<View> d_basis_v(n);
  for (int j = 0; j < n; ++j) {
    h_basis_v[j] = basis[j].View(AcceleratorRead);
    acceleratorPut(d_basis_v[j], h_basis_v[j]);
  }
  View *basis_vp = &d_basis_v[0];
  // Queue all 2n kernels to computeStream — no intermediate barriers.
  // CUDA stream ordering guarantees reduce_j runs after per_site_j.
  autoView(vec_v, vec, AcceleratorRead);
  for (int j = 0; j < n; ++j) {
    // Per-site inner products: inner_tmp[ss] = <basis[j][ss] | vec[ss]>
    int jj = j;   // copy: extended lambda must not capture the loop variable by reference
    accelerator_for(ss, oSites, nsimd, {
      auto x_l = coalescedRead(basis_vp[jj][ss]);
      auto y_l = coalescedRead(vec_v[ss]);
      coalescedWrite(inner_tmp_p[ss], innerProduct(x_l, y_l));
    });
    // Reduce inner_tmp → buf_p + j*numBlocks (async, no barrier)
    reduceKernel<<<numBlocks, numThreads, smemSize, computeStream>>>(
      inner_tmp_p, buf_p + j * numBlocks, size_i);
  }
  // ONE sync after all 2n kernels are queued
  accelerator_barrier();
  // Batch copy all n results (result for j sits at buf[j*numBlocks+0])
  hostVector<sobj> buf_h(n * numBlocks);
  acceleratorCopyFromDevice(buf_p, &buf_h[0], n * numBlocks * sizeof(sobj));
  // Convert sobj → ComplexD and do ONE batched MPI allreduce over all n
  // node-local partial sums.
  std::vector<ComplexD> raw(n);
  for (int j = 0; j < n; ++j)
    raw[j] = TensorRemove(buf_h[j * numBlocks]);
  grid->GlobalSumVector(&raw[0], n);
  for (int j = 0; j < n; ++j) c[j] = raw[j];
  for (int j = 0; j < n; ++j) h_basis_v[j].ViewClose();
}
// Apply vec -= sum_j c[j]*basis[j] with a SINGLE fused GPU kernel,
// replacing n separate axpy-style updates (and their n syncs).
// c must have at least basis.size() entries (as produced by
// batchInnerProducts); basis vectors are read, only vec is written.
void batchUpdate(Field &vec,
                 const std::vector<Field> &basis,
                 const std::vector<ComplexD> &c)
{
  int n = (int)basis.size();
  if (n == 0) return;   // nothing to subtract
  typedef typename Field::vector_object vobj;   // NOTE(review): unused here; kept for symmetry with batchInnerProducts
  typedef decltype(basis[0].View(AcceleratorRead)) View;
  GridBase *grid = vec.Grid();
  uint64_t oSites = grid->oSites();
  uint64_t nsimd = grid->Nsimd();
  // Split complex coefficients into real/imag double arrays on device.
  // Using doubles avoids potential ComplexD-device-code compatibility issues.
  hostVector<double> h_re(n), h_im(n);
  deviceVector<double> d_re(n), d_im(n);
  for (int k = 0; k < n; ++k) {
    h_re[k] = c[k].real();
    h_im[k] = c[k].imag();
  }
  acceleratorCopyToDevice(&h_re[0], &d_re[0], n * sizeof(double));
  acceleratorCopyToDevice(&h_im[0], &d_im[0], n * sizeof(double));
  double *re_p = &d_re[0];
  double *im_p = &d_im[0];
  // Basis views (opened once, closed after the kernel below)
  hostVector<View> h_basis_v(n);
  deviceVector<View> d_basis_v(n);
  for (int k = 0; k < n; ++k) {
    h_basis_v[k] = basis[k].View(AcceleratorRead);
    acceleratorPut(d_basis_v[k], h_basis_v[k]);
  }
  View *basis_vp = &d_basis_v[0];
  // Single kernel: vec[ss] -= sum_k (re[k] + i*im[k]) * basis[k][ss]
  // (the complex multiply is spelled out as re*b + i*(im*b)).
  autoView(vec_v, vec, AcceleratorWrite);
  accelerator_for(ss, oSites, nsimd, {
    auto v = coalescedRead(vec_v[ss]);
    for (int k = 0; k < n; ++k) {
      auto b = coalescedRead(basis_vp[k][ss]);
      v = v - re_p[k] * b - timesI(im_p[k] * b);
    }
    coalescedWrite(vec_v[ss], v);
  });
  for (int k = 0; k < n; ++k) h_basis_v[k].ViewClose();
}
// ------------------------------------------------------------------
// Full reorthogonalization of vec against the vectors in basis via
// two-pass Classical Gram-Schmidt (CGS2). Each pass does one batched
// inner-product sweep plus one batched update — O(1) device syncs per
// pass instead of O(n) for the old sequential MGS loop.
// Subtracts projections only — does NOT normalize vec.
// ------------------------------------------------------------------
void reorthogonalize(Field &vec, const std::vector<Field> &basis)
{
  if (basis.empty()) return;
  std::vector<ComplexD> proj;
  // Pass 1: remove the components of vec along every basis vector.
  batchInnerProducts(vec, basis, proj);
  batchUpdate(vec, basis, proj);
  // Pass 2: repeat to mop up the rounding error left by pass 1
  // ("twice is enough" reorthogonalization).
  batchInnerProducts(vec, basis, proj);
  batchUpdate(vec, basis, proj);
}

View File

@@ -200,8 +200,8 @@ int main(int argc, char** argv) {
RD.close(); RD.close();
} }
// std::vector<Complex> boundary = {1,1,1,-1}; std::vector<Complex> boundary = {1,1,1,-1};
std::vector<Complex> boundary = {1,1,1,1}; // std::vector<Complex> boundary = {1,1,1,1};
FermionOp::ImplParams Params(boundary); FermionOp::ImplParams Params(boundary);