From d8d16407e9c3c867601bca80a3dc605cd92785b7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 27 May 2026 11:12:29 -0400 Subject: [PATCH] A2ASpatialSum: extended meson field kernel and test Co-Authored-By: Claude Sonnet 4.6 --- Grid/algorithms/Algorithms.h | 1 + Grid/algorithms/blas/A2ASpatialSum.h | 213 ++++++++ tests/Test_extended_meson_field.cc | 790 +++++++++++++++++++++++++++ 3 files changed, 1004 insertions(+) create mode 100644 Grid/algorithms/blas/A2ASpatialSum.h create mode 100644 tests/Test_extended_meson_field.cc diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 6ebb1c58..30ffe0f0 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -53,6 +53,7 @@ NAMESPACE_CHECK(approx); #include // Not really deflation, but useful #include +#include NAMESPACE_CHECK(deflation); #include NAMESPACE_CHECK(ConjGrad); diff --git a/Grid/algorithms/blas/A2ASpatialSum.h b/Grid/algorithms/blas/A2ASpatialSum.h new file mode 100644 index 00000000..924a350d --- /dev/null +++ b/Grid/algorithms/blas/A2ASpatialSum.h @@ -0,0 +1,213 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: Grid/algorithms/blas/A2ASpatialSum.h + + Copyright (C) 2025 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +/* + A2ASpatialSum + + Replaces the scalar spatial accumulation loop in A2A extended meson field + contractions with a batched GEMM over local time slices, enabling GPU offload. + + Given: + leftv[N_i][osite] — conjugated left SpinColourVectors (SIMD-packed) + loopRight[N_j][osite]— type-contracted right SpinColourVectors (SIMD-packed) + + Computes: + EMF[i,j,t] = sum_{x,s,c} leftv[i][x,t,s,c] * loopRight[j][x,t,s,c] + + via batched GEMM over nt local time slices, then GlobalSumVector across MPI. + + Memory layout (all C row-major): + W_buf [nt][N_i][nxyz*Nsc] — W[t][i][x*Nsc+sc] = leftv[i] at (x,t) + LR_buf [nt][N_j][nxyz*Nsc] — LR[t][j][x*Nsc+sc] = loopRight[j] at (x,t) + EMF_buf[nt][N_j][N_i] — column-major result; EMF[i,j,t] = EMF_buf[t][j][i] + + BLAS call (column-major, OP_T on A so A is read as W[i][k]): + C = A^T * B where A=W[N_i×K C-row], B=LR[N_j×K C-row], C=[N_j×N_i C-row] + → C[i,j] = sum_k W[i][k] * LR[j][k] = EMF[i,j] ✓ +*/ +template +class A2ASpatialSum +{ +public: + typedef typename vobj::scalar_type scalar; + typedef typename vobj::scalar_object sobj; + + GridBase *grid; + int N_i, N_j; + int nt, nxyz, Nsc; + + deviceVector W_buf; + deviceVector LR_buf; + deviceVector EMF_buf; + deviceVector W_ptrs; + deviceVector LR_ptrs; + deviceVector EMF_ptrs; + + A2ASpatialSum() : grid(nullptr), N_i(0), N_j(0), nt(0), nxyz(0), Nsc(0) {} + + void Allocate(int _N_i, int _N_j, GridBase *_grid) + { + grid = _grid; + N_i = _N_i; + N_j = _N_j; + Coordinate ldims = grid->LocalDimensions(); + nt = ldims[grid->Nd() - 1]; + nxyz = grid->lSites() / nt; + Nsc = sizeof(sobj) / sizeof(scalar); + + W_buf.resize(nt * N_i * nxyz * Nsc); + LR_buf.resize(nt * N_j * nxyz * Nsc); + EMF_buf.resize(nt * N_j * N_i); + + // Build persistent batch pointer arrays + W_ptrs.resize(nt); + LR_ptrs.resize(nt); + EMF_ptrs.resize(nt); + scalar *Wh = &W_buf[0]; + scalar *LRh = &LR_buf[0]; + scalar *EMFh = &EMF_buf[0]; + int lN_i = N_i, lN_j = N_j, lnxyz = nxyz, lNsc = Nsc; + for (int t = 0; t < nt; t++) { + acceleratorPut(W_ptrs[t], Wh + t * lN_i * lnxyz * lNsc); + acceleratorPut(LR_ptrs[t], LRh + t * lN_j * lnxyz * lNsc); + acceleratorPut(EMF_ptrs[t], EMFh + t * lN_j * lN_i); + } + } + + void PackLeft(const std::vector> &leftv) + { + GRID_ASSERT((int)leftv.size() == N_i); + PackVectors(leftv, &W_buf[0], N_i); + } + + void PackRight(const std::vector> &loopRight) + { + GRID_ASSERT((int)loopRight.size() == N_j); + PackVectors(loopRight, &LR_buf[0], N_j); + } + +private: + // Pack vecs[N] lattice fields into buf[nt][N][nxyz*Nsc], extracting all SIMD lanes. + void PackVectors(const std::vector> &vecs, scalar *buf, int N) + { + int nd = grid->_ndimension; + int osites = grid->oSites(); + int Nsimd = vobj::Nsimd(); + int lN = N; + int lNsc = Nsc; + int lnxyz = nxyz; + Coordinate rdimensions = grid->_rdimensions; + Coordinate ldims = grid->LocalDimensions(); + Coordinate simd = grid->_simd_layout; + + for (int n = 0; n < N; n++) { + autoView(src_v, vecs[n], AcceleratorRead); + accelerator_for(sf, osites, Nsimd, { +#ifdef GRID_SIMT + { + int lane = acceleratorSIMTlane(Nsimd); +#else + for (int lane = 0; lane < Nsimd; lane++) { +#endif + Coordinate icoor(nd), ocoor(nd), lcoor(nd); + Lexicographic::CoorFromIndex(icoor, lane, simd); + Lexicographic::CoorFromIndex(ocoor, sf, rdimensions); + for (int d = 0; d < nd; d++) + lcoor[d] = rdimensions[d] * icoor[d] + ocoor[d]; + + int l_t = lcoor[nd - 1]; + Coordinate xyz_coor = lcoor; + xyz_coor[nd - 1] = 0; + int64_t l_xyz; + Lexicographic::IndexFromCoor(xyz_coor, l_xyz, ldims); + + sobj data = extractLane(lane, src_v[sf]); + scalar *data_s = (scalar *)&data; + + int64_t base = (int64_t)l_t * lN * lnxyz * lNsc + + (int64_t)n * lnxyz * lNsc + + l_xyz * lNsc; + for (int sc = 0; sc < lNsc; sc++) + buf[base + sc] = data_s[sc]; + } + }); + } + } + +public: + + // Batched GEMM + MPI reduction → result[nt_global][N_i][N_j] + // + // BLAS (column-major, OP_T on A): + // C[N_j×N_i] = A^T[N_i×K] * B[N_j×K] with K=nxyz*Nsc + // reading A as C row-major [N_i][K] and B as C row-major [N_j][K] + // → C[i,j] = sum_k W[i,k] * LR[j,k] = EMF[i,j] ✓ + void Sum(Eigen::Tensor &result) + { + GridBLAS BLAS; + + int K = nxyz * Nsc; + BLAS.gemmBatched(GridBLAS_OP_T, GridBLAS_OP_N, + N_i, N_j, K, + scalar(1.0), + W_ptrs, + LR_ptrs, + scalar(0.0), + EMF_ptrs); + BLAS.synchronise(); + + // Copy from device and distribute into global-t layout + int nt_global = result.dimension(0); + int nd = grid->Nd(); + int lt_start = grid->LocalStarts()[nd - 1]; + + std::vector host_emf(nt * N_j * N_i); + acceleratorCopyFromDevice(&EMF_buf[0], host_emf.data(), + nt * N_j * N_i * sizeof(scalar)); + + // EMF_buf[t][j*N_i + i] = EMF[i,j] for local t + std::vector global_emf(nt_global * N_i * N_j, scalar(0.0)); + for (int lt = 0; lt < nt; lt++) { + int gt = lt + lt_start; + for (int i = 0; i < N_i; i++) + for (int j = 0; j < N_j; j++) + global_emf[gt * N_i * N_j + i * N_j + j] = host_emf[lt * N_j * N_i + j * N_i + i]; + } + grid->GlobalSumVector(global_emf.data(), nt_global * N_i * N_j); + + for (int gt = 0; gt < nt_global; gt++) + for (int i = 0; i < N_i; i++) + for (int j = 0; j < N_j; j++) + result(gt, i, j) = global_emf[gt * N_i * N_j + i * N_j + j]; + } +}; + +NAMESPACE_END(Grid); diff --git a/tests/Test_extended_meson_field.cc b/tests/Test_extended_meson_field.cc new file mode 100644 index 00000000..f56e0b4c --- /dev/null +++ b/tests/Test_extended_meson_field.cc @@ -0,0 +1,790 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: tests/Test_extended_meson_field.cc + +Copyright (C) 2015-2025 + +Author: Peter Boyle +Author: Masaaki Tomii (original Hadrons kernels) + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +#include "disable_tests_without_instantiations.h" +#ifdef ENABLE_FERMION_INSTANTIATIONS + +#include +#include + +using namespace Grid; + +typedef WilsonImplD FImpl; +typedef typename FImpl::FermionField FermionField; +typedef typename FImpl::SiteSpinor vobj; +typedef typename vobj::scalar_type scalar_type; +typedef typename vobj::vector_type vector_type; +typedef iSpinColourMatrix SpinColourMatrix_v; +typedef iSpinColourVector SpinColourVector_v; +typedef iSpinMatrix SpinMatrix_v; +typedef iSinglet Scalar_v; +typedef iSinglet Scalar_s; +typedef Lattice PropagatorField; + +// CPU reference + optionally batched GEMM spatial sum, ported from +// Hadrons/Modules/MContraction/A2AExtendedMesonField.hpp +// (M. Tomii, mtomii/Hadrons:local-2025-edits). Hadrons infrastructure removed. +// thread_for / CpuRead / orthogdim=3 preserved. +class A2AExtendedMesonFieldRef +{ +public: + // result is indexed [nt][N_i][N_j]. + // use_blas=true replaces the scalar spatial accumulation with A2ASpatialSum. + static void compute( + Eigen::Tensor &result, + const std::vector &left, + const std::vector &right, + const std::vector &loop1, + const std::vector &loop2, + const std::vector &gamma1, + const std::vector &gamma2, + int type, + bool use_blas = false) + { + GridBase *grid = left[0].Grid(); + + const int orthogdim = 3; + int rd = grid->_rdimensions[orthogdim]; + int ld = grid->_ldimensions[orthogdim]; + int Nd = grid->_ndimension; + int Nsimd = grid->Nsimd(); + + int nt = result.dimension(0); + int N_i = (int)left.size(); + int N_j = (int)right.size(); + + std::string tag = std::string(use_blas ? "[blas" : "[ref ") + " type=" + std::to_string(type) + "]"; + auto Tms = [](double us) { return us * 1e-3; }; + double t0; + + // ------------------------------------------------------------------ + // Loop propagator: sum_k outerProduct(loop1[k], loop2[k]) + // ------------------------------------------------------------------ + t0 = usecond(); + PropagatorField loop(grid); + loop = Zero(); + for (unsigned int k = 0; k < loop1.size(); ++k) + loop += outerProduct(loop1[k], loop2[k]); + std::cout << GridLogMessage << tag << " loop_build: " << Tms(usecond()-t0) << " ms\n"; + + // ------------------------------------------------------------------ + // Pack conjugated left vectors + // ------------------------------------------------------------------ + t0 = usecond(); + std::vector leftv(N_i, grid); + for (int i = 0; i < N_i; i++) + leftv[i] = conjugate(left[i]); + std::cout << GridLogMessage << tag << " pack_left: " << Tms(usecond()-t0) << " ms\n"; + + // ------------------------------------------------------------------ + // Per-site loop contraction into PropagatorField tloop (type-dependent) + // ------------------------------------------------------------------ + t0 = usecond(); + PropagatorField tloop(grid); + tloop = Zero(); + { + autoView(tloopv, tloop, CpuWrite); + autoView(loopv, loop, CpuRead); + if (type == 0) { + thread_for(ss, grid->oSites(), { + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + tloopv[ss]()(s1,s2)(0,0) = loopv[ss]()(s1,s2)(0,0) + + loopv[ss]()(s1,s2)(1,1) + + loopv[ss]()(s1,s2)(2,2); + }); + } + if (type == 1) { + thread_for(ss, grid->oSites(), { + tloopv[ss] = Zero(); + for (int mu = 0; mu < (int)gamma1.size(); ++mu) + tloopv[ss] = tloopv[ss] + Gamma(gamma1[mu]) * loopv[ss] * Gamma(gamma2[mu]); + }); + } + if (type == 2) { + thread_for(ss, grid->oSites(), { + tloopv[ss] = Zero(); + for (int mu = 0; mu < (int)gamma2.size(); ++mu) { + SpinColourMatrix_v tmp = Gamma(gamma2[mu]) * loopv[ss]; + int s1 = mu / Ns; + int s2 = mu % Ns; + for (int c1 = 0; c1 < Nc; ++c1) + for (int c2 = 0; c2 < Nc; ++c2) + tloopv[ss]()(s1,s2)(c1,c2) = tmp()(0,0)(c1,c2) + tmp()(1,1)(c1,c2) + + tmp()(2,2)(c1,c2) + tmp()(3,3)(c1,c2); + } + }); + } + if (type == 3) { + thread_for(ss, grid->oSites(), { + SpinMatrix_v spinLoop = Zero(); + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + spinLoop()(s1,s2)() = loopv[ss]()(s1,s2)(0,0) + + loopv[ss]()(s1,s2)(1,1) + + loopv[ss]()(s1,s2)(2,2); + tloopv[ss] = Zero(); + for (int mu = 0; mu < (int)gamma1.size(); ++mu) { + SpinMatrix_v tmp2 = Gamma(gamma1[mu]) * spinLoop * Gamma(gamma2[mu]); + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + tloopv[ss]()(s1,s2)(0,0) = tloopv[ss]()(s1,s2)(0,0) + tmp2()(s1,s2)(); + } + }); + } + } + std::cout << GridLogMessage << tag << " tloop: " << Tms(usecond()-t0) << " ms\n"; + + // Select addLoopRight kernel for this type + std::function &, + const std::vector &)> addLoopRight; + + if (type == 0) { + addLoopRight = [](SpinColourVector_v &lR, + const SpinColourMatrix_v &loopm, + const SpinColourVector_v &rightv, + const std::vector &g1, + const std::vector &g2) { + SpinMatrix_v spinLoop = Zero(); + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + spinLoop()(s1,s2)() = loopm()(s1,s2)(0,0); + for (int mu = 0; mu < (int)g1.size(); ++mu) { + SpinMatrix_v GLoop = Gamma(g2[mu]) * spinLoop; + auto trGLoop = GLoop()(0,0)() + GLoop()(1,1)() + GLoop()(2,2)() + GLoop()(3,3)(); + SpinColourVector_v Grightv = Gamma(g1[mu]) * rightv; + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) + lR()(s)(c) += Grightv()(s)(c) * trGLoop; + } + }; + } + if (type == 1) { + addLoopRight = [](SpinColourVector_v &lR, + const SpinColourMatrix_v &loopm, + const SpinColourVector_v &rightv, + const std::vector &g1, + const std::vector &g2) { + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) { + lR()(s)(c) + += loopm()(s,0)(c,0) * rightv()(0)(0) + + loopm()(s,0)(c,1) * rightv()(0)(1) + + loopm()(s,0)(c,2) * rightv()(0)(2) + + loopm()(s,1)(c,0) * rightv()(1)(0) + + loopm()(s,1)(c,1) * rightv()(1)(1) + + loopm()(s,1)(c,2) * rightv()(1)(2) + + loopm()(s,2)(c,0) * rightv()(2)(0) + + loopm()(s,2)(c,1) * rightv()(2)(1) + + loopm()(s,2)(c,2) * rightv()(2)(2) + + loopm()(s,3)(c,0) * rightv()(3)(0) + + loopm()(s,3)(c,1) * rightv()(3)(1) + + loopm()(s,3)(c,2) * rightv()(3)(2); + } + }; + } + if (type == 2) { + addLoopRight = [](SpinColourVector_v &lR, + const SpinColourMatrix_v &loopm, + const SpinColourVector_v &rightv, + const std::vector &g1, + const std::vector &g2) { + for (int mu = 0; mu < (int)g1.size(); ++mu) { + int s1 = mu / Ns; + int s2 = mu % Ns; + SpinColourVector_v Grightv = Gamma(g1[mu]) * rightv; + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) + lR()(s)(c) += loopm()(s1,s2)(c,0) * Grightv()(s)(0) + + loopm()(s1,s2)(c,1) * Grightv()(s)(1) + + loopm()(s1,s2)(c,2) * Grightv()(s)(2); + } + }; + } + if (type == 3) { + addLoopRight = [](SpinColourVector_v &lR, + const SpinColourMatrix_v &loopm, + const SpinColourVector_v &rightv, + const std::vector &g1, + const std::vector &g2) { + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) + lR()(s)(c) += loopm()(s,0)(0,0) * rightv()(0)(c) + + loopm()(s,1)(0,0) * rightv()(1)(c) + + loopm()(s,2)(0,0) * rightv()(2)(c) + + loopm()(s,3)(0,0) * rightv()(3)(c); + }; + } + + // ------------------------------------------------------------------ + // Pack loopRight[j] = type-kernel(tloop, right[j]) per site + // ------------------------------------------------------------------ + t0 = usecond(); + std::vector loopRight(N_j, grid); + { + autoView(tlv, tloop, CpuRead); + for (int j = 0; j < N_j; j++) { + loopRight[j] = Zero(); + autoView(lRv, loopRight[j], CpuWrite); + autoView(rv, right[j], CpuRead); + thread_for(ss, grid->oSites(), { + addLoopRight(lRv[ss], tlv[ss], rv[ss], gamma1, gamma2); + }); + } + } + std::cout << GridLogMessage << tag << " pack_loopright: " << Tms(usecond()-t0) << " ms\n"; + + if (use_blas) { + // ------------------------------------------------------------------ + // BLAS path: A2ASpatialSum (Allocate + PackLeft + PackRight + Sum) + // ------------------------------------------------------------------ + A2ASpatialSum spatial_sum; + double t_blas_start = usecond(); + + t0 = usecond(); + spatial_sum.Allocate(N_i, N_j, grid); + std::cout << GridLogMessage << tag << " Allocate: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + spatial_sum.PackLeft(leftv); + std::cout << GridLogMessage << tag << " PackLeft: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + spatial_sum.PackRight(loopRight); + std::cout << GridLogMessage << tag << " PackRight: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + spatial_sum.Sum(result); + std::cout << GridLogMessage << tag << " Sum (GEMM+MPI): " << Tms(usecond()-t0) << " ms\n"; + + std::cout << GridLogMessage << tag << " A2ASpatialSum: " << Tms(usecond()-t_blas_start) << " ms [TOTAL]\n"; + + } else { + // ------------------------------------------------------------------ + // Reference path: SIMD spatial sum + scalar extraction + // ------------------------------------------------------------------ + int MFrvol = rd * N_i * N_j; + int MFlvol = ld * N_i * N_j; + + Vector lvSum(MFrvol); + thread_for(r, MFrvol, { lvSum[r] = Zero(); }); + + t0 = usecond(); + { + int e1 = grid->_slice_nblock[orthogdim]; + int e2 = grid->_slice_block [orthogdim]; + int stride = grid->_slice_stride[orthogdim]; + using LView = decltype(leftv[0].View(CpuRead)); + using RView = decltype(loopRight[0].View(CpuRead)); + std::vector lv_views; + std::vector lr_views; + lv_views.reserve(N_i); + lr_views.reserve(N_j); + for (int i = 0; i < N_i; i++) lv_views.push_back(leftv[i].View(CpuRead)); + for (int j = 0; j < N_j; j++) lr_views.push_back(loopRight[j].View(CpuRead)); + thread_for(r, rd, { + int so = r * grid->_ostride[orthogdim]; + int base = N_i * N_j * r; + for (int n = 0; n < e1; n++) + for (int b = 0; b < e2; b++) { + int ss = so + n * stride + b; + for (int ii = 0; ii < N_i; ii++) + for (int jj = 0; jj < N_j; jj++) { + int idx = jj + N_j * ii + base; + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) + lvSum[idx]()()() += lv_views[ii][ss]()(s)(c) * lr_views[jj][ss]()(s)(c); + } + } + }); + for (auto &v : lv_views) v.ViewClose(); + for (auto &v : lr_views) v.ViewClose(); + } + std::cout << GridLogMessage << tag << " spatial_sum: " << Tms(usecond()-t0) << " ms\n"; + + Vector lsSum(MFlvol); + thread_for(r, MFlvol, { lsSum[r] = scalar_type(0.0); }); + + t0 = usecond(); + thread_for(rt, rd, { + Coordinate icoor(Nd); + ExtractBuffer extracted(Nsimd); + for (int ii = 0; ii < N_i; ii++) + for (int jj = 0; jj < N_j; jj++) { + int ij_rdx = jj + N_j * (ii + N_i * rt); + extract(lvSum[ij_rdx], extracted); + for (int idx = 0; idx < Nsimd; idx++) { + grid->iCoorFromIindex(icoor, idx); + int ldx = rt + icoor[orthogdim] * rd; + int ij_ldx = jj + N_j * (ii + N_i * ldx); + lsSum[ij_ldx] = lsSum[ij_ldx] + extracted[idx]; + } + } + }); + std::cout << GridLogMessage << tag << " simd_extract: " << Tms(usecond()-t0) << " ms\n"; + + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + + t0 = usecond(); + Vector cache(nt * N_i * N_j, ComplexD(0.0)); + for (int lt = 0; lt < ld; lt++) + for (int pt = 0; pt < pd; pt++) { + int t = lt + pt * ld; + for (int ii = 0; ii < N_i; ii++) + for (int jj = 0; jj < N_j; jj++) { + if (pt == pc) { + int ij_ldx = jj + N_j * (ii + N_i * lt); + cache[jj + N_j * (ii + N_i * t)] = lsSum[ij_ldx]()()(); + } + } + } + grid->GlobalSumVector(cache.data(), nt * N_i * N_j); + std::cout << GridLogMessage << tag << " globalsum: " << Tms(usecond()-t0) << " ms\n"; + + for (int t = 0; t < nt; t++) + for (int ii = 0; ii < N_i; ii++) + for (int jj = 0; jj < N_j; jj++) + result(t, ii, jj) = cache[jj + N_j * (ii + N_i * t)]; + } + } +}; + +// ================================================================ +// Free-function GPU kernels — accelerator_for, v(ss) reads, +// coalescedWrite writes, vobj-level arithmetic throughout. +// Gamma arrays passed as Vector (UVM). +// ================================================================ + +void A2ALoopPropagator(PropagatorField &loop, + const std::vector &loop1, + const std::vector &loop2) +{ + loop = Zero(); + for (unsigned int k = 0; k < loop1.size(); ++k) + loop += outerProduct(loop1[k], loop2[k]); +} + +void A2APackLeftConjugated(FermionField &out, const FermionField &in) +{ + autoView(outv, out, AcceleratorWrite); + autoView(inv, in, AcceleratorRead); + uint64_t Osites = in.Grid()->oSites(); + int Nsimd = SpinColourVector_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + coalescedWrite(outv[ss], conjugate(inv(ss))); + }); +} + +// Type 0: colour-trace stored in (s1,s2)(0,0) +void A2ALoopLeftContractionType0(PropagatorField &tloop, const PropagatorField &loop) +{ + autoView(tloopv, tloop, AcceleratorWrite); + autoView(loopv, loop, AcceleratorRead); + uint64_t Osites = loop.Grid()->oSites(); + int Nsimd = SpinColourMatrix_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + auto l = loopv(ss); + auto tmp = l; tmp = Zero(); + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + tmp()(s1,s2)(0,0) = l()(s1,s2)(0,0) + l()(s1,s2)(1,1) + l()(s1,s2)(2,2); + coalescedWrite(tloopv[ss], tmp); + }); +} + +// Type 1: tloop = sum_mu Gamma(g1[mu]) * loop * Gamma(g2[mu]) +void A2ALoopLeftContractionType1(PropagatorField &tloop, const PropagatorField &loop, + const Vector &gamma1, + const Vector &gamma2) +{ + int ng = (int)gamma1.size(); + const Gamma::Algebra *g1 = gamma1.data(); + const Gamma::Algebra *g2 = gamma2.data(); + autoView(tloopv, tloop, AcceleratorWrite); + autoView(loopv, loop, AcceleratorRead); + uint64_t Osites = loop.Grid()->oSites(); + int Nsimd = SpinColourMatrix_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + auto l = loopv(ss); + auto tmp = l; tmp = Zero(); + for (int mu = 0; mu < ng; ++mu) + tmp = tmp + Gamma(g1[mu]) * l * Gamma(g2[mu]); + coalescedWrite(tloopv[ss], tmp); + }); +} + +// Type 2: for mu=[0..ng), s1=mu/Ns, s2=mu%Ns; +// tloop(s1,s2)(c1,c2) = Tr_spin( Gamma(g2[mu]) * loop )(c1,c2) +void A2ALoopLeftContractionType2(PropagatorField &tloop, const PropagatorField &loop, + const Vector &gamma2) +{ + int ng = (int)gamma2.size(); + const Gamma::Algebra *g2 = gamma2.data(); + autoView(tloopv, tloop, AcceleratorWrite); + autoView(loopv, loop, AcceleratorRead); + uint64_t Osites = loop.Grid()->oSites(); + int Nsimd = SpinColourMatrix_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + auto l = loopv(ss); + auto tmp = l; tmp = Zero(); + for (int mu = 0; mu < ng; ++mu) { + auto gtmp = Gamma(g2[mu]) * l; + int s1 = mu / Ns; + int s2 = mu % Ns; + for (int c1 = 0; c1 < Nc; ++c1) + for (int c2 = 0; c2 < Nc; ++c2) + tmp()(s1,s2)(c1,c2) = gtmp()(0,0)(c1,c2) + gtmp()(1,1)(c1,c2) + + gtmp()(2,2)(c1,c2) + gtmp()(3,3)(c1,c2); + } + coalescedWrite(tloopv[ss], tmp); + }); +} + +// Type 3: colour-trace → spin matrix → sum_mu G1*spinLoop*G2 stored in (s1,s2)(0,0) +void A2ALoopLeftContractionType3(PropagatorField &tloop, const PropagatorField &loop, + const Vector &gamma1, + const Vector &gamma2) +{ + int ng = (int)gamma1.size(); + const Gamma::Algebra *g1 = gamma1.data(); + const Gamma::Algebra *g2 = gamma2.data(); + autoView(tloopv, tloop, AcceleratorWrite); + autoView(loopv, loop, AcceleratorRead); + uint64_t Osites = loop.Grid()->oSites(); + int Nsimd = SpinColourMatrix_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + auto l = loopv(ss); + SpinMatrix_v spinLoop; spinLoop = Zero(); + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + spinLoop()(s1,s2)() = l()(s1,s2)(0,0) + l()(s1,s2)(1,1) + l()(s1,s2)(2,2); + auto tmp = l; tmp = Zero(); + for (int mu = 0; mu < ng; ++mu) { + SpinMatrix_v tmp2 = Gamma(g1[mu]) * spinLoop * Gamma(g2[mu]); + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + tmp()(s1,s2)(0,0) = tmp()(s1,s2)(0,0) + tmp2()(s1,s2)(); + } + coalescedWrite(tloopv[ss], tmp); + }); +} + +// Type 0: loopRight = sum_mu Tr(G2*spinLoop) * G1*right +// where spinLoop(s1,s2) = tloop(s1,s2)(0,0) +void A2ALoopRightContractionType0(FermionField &loopRight, + const PropagatorField &tloop, + const FermionField &right, + const Vector &gamma1, + const Vector &gamma2) +{ + int ng = (int)gamma1.size(); + const Gamma::Algebra *g1 = gamma1.data(); + const Gamma::Algebra *g2 = gamma2.data(); + autoView(lRv, loopRight, AcceleratorWrite); + autoView(tlv, tloop, AcceleratorRead); + autoView(rv, right, AcceleratorRead); + uint64_t Osites = right.Grid()->oSites(); + int Nsimd = SpinColourVector_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + auto loopm = tlv(ss); + auto rightv = rv(ss); + SpinMatrix_v spinLoop; spinLoop = Zero(); + for (int s1 = 0; s1 < Ns; ++s1) + for (int s2 = 0; s2 < Ns; ++s2) + spinLoop()(s1,s2)() = loopm()(s1,s2)(0,0); + SpinColourVector_v lR; lR = Zero(); + for (int mu = 0; mu < ng; ++mu) { + auto GLoop = Gamma(g2[mu]) * spinLoop; + auto trGLoop = GLoop()(0,0)() + GLoop()(1,1)() + GLoop()(2,2)() + GLoop()(3,3)(); + auto Grightv = Gamma(g1[mu]) * rightv; + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) + lR()(s)(c) = lR()(s)(c) + Grightv()(s)(c) * trGLoop; + } + coalescedWrite(lRv[ss], lR); + }); +} + +// Type 1: loopRight = tloop * right (SpinColourMatrix * SpinColourVector) +void A2ALoopRightContractionType1(FermionField &loopRight, + const PropagatorField &tloop, + const FermionField &right) +{ + autoView(lRv, loopRight, AcceleratorWrite); + autoView(tlv, tloop, AcceleratorRead); + autoView(rv, right, AcceleratorRead); + uint64_t Osites = right.Grid()->oSites(); + int Nsimd = SpinColourVector_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + coalescedWrite(lRv[ss], tlv(ss) * rv(ss)); + }); +} + +// Type 2: loopRight(s)(c) = sum_{mu,c'} tloop(s1,s2)(c,c') * (G(g1[mu])*right)(s)(c') +void A2ALoopRightContractionType2(FermionField &loopRight, + const PropagatorField &tloop, + const FermionField &right, + const Vector &gamma1) +{ + int ng = (int)gamma1.size(); + const Gamma::Algebra *g1 = gamma1.data(); + autoView(lRv, loopRight, AcceleratorWrite); + autoView(tlv, tloop, AcceleratorRead); + autoView(rv, right, AcceleratorRead); + uint64_t Osites = right.Grid()->oSites(); + int Nsimd = SpinColourVector_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + auto loopm = tlv(ss); + auto rightv = rv(ss); + SpinColourVector_v lR; lR = Zero(); + for (int mu = 0; mu < ng; ++mu) { + int s1 = mu / Ns; + int s2 = mu % Ns; + auto Grightv = Gamma(g1[mu]) * rightv; + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) + lR()(s)(c) = lR()(s)(c) + + loopm()(s1,s2)(c,0) * Grightv()(s)(0) + + loopm()(s1,s2)(c,1) * Grightv()(s)(1) + + loopm()(s1,s2)(c,2) * Grightv()(s)(2); + } + coalescedWrite(lRv[ss], lR); + }); +} + +// Type 3: loopRight(s)(c) = sum_{s'} tloop(s,s')(0,0) * right(s')(c) +void A2ALoopRightContractionType3(FermionField &loopRight, + const PropagatorField &tloop, + const FermionField &right) +{ + autoView(lRv, loopRight, AcceleratorWrite); + autoView(tlv, tloop, AcceleratorRead); + autoView(rv, right, AcceleratorRead); + uint64_t Osites = right.Grid()->oSites(); + int Nsimd = SpinColourVector_v::Nsimd(); + accelerator_for(ss, Osites, Nsimd, { + auto loopm = tlv(ss); + auto rightv = rv(ss); + SpinColourVector_v lR; lR = Zero(); + for (int s = 0; s < Ns; ++s) + for (int c = 0; c < Nc; ++c) + lR()(s)(c) = loopm()(s,0)(0,0) * rightv()(0)(c) + + loopm()(s,1)(0,0) * rightv()(1)(c) + + loopm()(s,2)(0,0) * rightv()(2)(c) + + loopm()(s,3)(0,0) * rightv()(3)(c); + coalescedWrite(lRv[ss], lR); + }); +} + +// ================================================================ +// GPU-offloaded extended meson field: accelerator_for contractions +// + A2ASpatialSum GEMM spatial reduction. +// ================================================================ +class A2AExtendedMesonFieldGPU +{ +public: + static void compute( + Eigen::Tensor &result, + const std::vector &left, + const std::vector &right, + const std::vector &loop1, + const std::vector &loop2, + const std::vector &gamma1_in, + const std::vector &gamma2_in, + int type) + { + GridBase *grid = left[0].Grid(); + int N_i = (int)left.size(); + int N_j = (int)right.size(); + + std::string tag = std::string("[gpu type=") + std::to_string(type) + "]"; + auto Tms = [](double us) { return us * 1e-3; }; + double t0; + + Vector gamma1(gamma1_in.begin(), gamma1_in.end()); + Vector gamma2(gamma2_in.begin(), gamma2_in.end()); + + t0 = usecond(); + PropagatorField loop(grid); + A2ALoopPropagator(loop, loop1, loop2); + std::cout << GridLogMessage << tag << " loop_build: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + std::vector leftv(N_i, grid); + for (int i = 0; i < N_i; i++) + A2APackLeftConjugated(leftv[i], left[i]); + std::cout << GridLogMessage << tag << " pack_left: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + PropagatorField tloop(grid); + tloop = Zero(); + switch (type) { + case 0: A2ALoopLeftContractionType0(tloop, loop); break; + case 1: A2ALoopLeftContractionType1(tloop, loop, gamma1, gamma2); break; + case 2: A2ALoopLeftContractionType2(tloop, loop, gamma2); break; + case 3: A2ALoopLeftContractionType3(tloop, loop, gamma1, gamma2); break; + } + std::cout << GridLogMessage << tag << " tloop: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + std::vector loopRight(N_j, grid); + for (int j = 0; j < N_j; j++) { + switch (type) { + case 0: A2ALoopRightContractionType0(loopRight[j], tloop, right[j], gamma1, gamma2); break; + case 1: A2ALoopRightContractionType1(loopRight[j], tloop, right[j]); break; + case 2: A2ALoopRightContractionType2(loopRight[j], tloop, right[j], gamma1); break; + case 3: A2ALoopRightContractionType3(loopRight[j], tloop, right[j]); break; + } + } + std::cout << GridLogMessage << tag << " pack_loopright: " << Tms(usecond()-t0) << " ms\n"; + + A2ASpatialSum spatial_sum; + double t_blas = usecond(); + + t0 = usecond(); + spatial_sum.Allocate(N_i, N_j, grid); + std::cout << GridLogMessage << tag << " Allocate: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + spatial_sum.PackLeft(leftv); + std::cout << GridLogMessage << tag << " PackLeft: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + spatial_sum.PackRight(loopRight); + std::cout << GridLogMessage << tag << " PackRight: " << Tms(usecond()-t0) << " ms\n"; + + t0 = usecond(); + spatial_sum.Sum(result); + std::cout << GridLogMessage << tag << " Sum (GEMM+MPI): " << Tms(usecond()-t0) << " ms\n"; + + std::cout << GridLogMessage << tag << " A2ASpatialSum: " << Tms(usecond()-t_blas) << " ms [TOTAL]\n"; + } +}; + +int main(int argc, char *argv[]) +{ + Grid_init(&argc, &argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(4, vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian grid(latt_size, simd_layout, mpi_layout); + + int Nt = latt_size[Tp]; + int N_i = 8; + int N_j = 8; + int Nloop = 4; + + GridParallelRNG pRNG(&grid); + pRNG.SeedFixedIntegers({1, 2, 3, 4}); + + std::vector left(N_i, &grid); + std::vector right(N_j, &grid); + std::vector loop1(Nloop, &grid); + std::vector loop2(Nloop, &grid); + + for (auto &f : left) random(pRNG, f); + for (auto &f : right) random(pRNG, f); + for (auto &f : loop1) random(pRNG, f); + for (auto &f : loop2) random(pRNG, f); + + std::vector GammaMU = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + Eigen::Tensor result_ref(Nt, N_i, N_j); + Eigen::Tensor result_blas(Nt, N_i, N_j); + Eigen::Tensor result_gpu(Nt, N_i, N_j); + double t_ref = 0, t_blas = 0, t_gpu = 0, start, stop; + + for (int type = 0; type < 4; type++) { + + result_ref.setZero(); + start = usecond(); + A2AExtendedMesonFieldRef::compute(result_ref, left, right, loop1, loop2, + GammaMU, GammaMU, type, false); + stop = usecond(); t_ref = stop - start; + + result_blas.setZero(); + start = usecond(); + A2AExtendedMesonFieldRef::compute(result_blas, left, right, loop1, loop2, + GammaMU, GammaMU, type, true); + stop = usecond(); t_blas = stop - start; + + result_gpu.setZero(); + start = usecond(); + A2AExtendedMesonFieldGPU::compute(result_gpu, left, right, loop1, loop2, + GammaMU, GammaMU, type); + stop = usecond(); t_gpu = stop - start; + + double norm2_ref = 0.0, norm2_blas = 0.0, norm2_gpu = 0.0; + double norm2_diff_blas = 0.0, norm2_diff_gpu = 0.0; + for (int t = 0; t < Nt; t++) + for (int ii = 0; ii < N_i; ii++) + for (int jj = 0; jj < N_j; jj++) { + norm2_ref += std::norm(result_ref(t, ii, jj)); + norm2_blas += std::norm(result_blas(t, ii, jj)); + norm2_gpu += std::norm(result_gpu(t, ii, jj)); + ComplexD diff_blas = result_ref(t, ii, jj) - result_blas(t, ii, jj); + ComplexD diff_gpu = result_ref(t, ii, jj) - result_gpu(t, ii, jj); + norm2_diff_blas += std::norm(diff_blas); + norm2_diff_gpu += std::norm(diff_gpu); + } + + double rel_blas = (norm2_ref > 0) ? std::sqrt(norm2_diff_blas / norm2_ref) : 0.0; + double rel_gpu = (norm2_ref > 0) ? std::sqrt(norm2_diff_gpu / norm2_ref) : 0.0; + + std::cout << GridLogMessage + << "type=" << type + << " norm2_ref=" << norm2_ref + << " norm2_blas=" << norm2_blas + << " norm2_gpu=" << norm2_gpu + << " rel_blas=" << rel_blas + << " rel_gpu=" << rel_gpu + << " t_ref=" << t_ref * 1e-6 << "s" + << " t_blas=" << t_blas * 1e-6 << "s" + << " t_gpu=" << t_gpu * 1e-6 << "s" + << std::endl; + + GRID_ASSERT(rel_blas < 1e-10); + GRID_ASSERT(rel_gpu < 1e-10); + } + + std::cout << GridLogMessage << "All types passed A2ASpatialSum and GPU regression." << std::endl; + + Grid_finalize(); + return EXIT_SUCCESS; +} + +#endif