1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-06-04 19:24:36 +01:00

A2ALoopPropagator: fuse outer product sum into single accelerator_for kernel

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Peter Boyle
2026-05-28 10:39:37 -04:00
parent 5b58d1da62
commit dbd3a0e612
+28 -3
View File
@@ -387,9 +387,34 @@ void A2ALoopPropagator(PropagatorField &loop,
const std::vector<FermionField> &loop1,
const std::vector<FermionField> &loop2)
{
  // Builds the loop propagator site by site:
  //   loop[ss] = sum_k outerProduct(loop1[k][ss], loop2[k][ss])
  // The sum over modes k is performed inside a single accelerator kernel, so
  // 'loop' is written exactly once per site rather than once per mode.
  //
  // loop  : output field, fully overwritten.
  // loop1 : left vectors; its size defines the number of modes Nk.
  // loop2 : right vectors, indexed with the same k as loop1.
  // NOTE(review): loop2 is assumed to hold at least loop1.size() fields on the
  // same grid as 'loop'; this is not checked here -- confirm against callers.
  const int Nk = (int)loop1.size();

  // With no modes the sum is empty. The kernel below unconditionally reads
  // mode 0, so this case must be handled before any pointer table is built.
  if (Nk == 0) {
    loop = Zero();
    return;
  }

  const uint64_t oSites = loop.Grid()->oSites();
  const int Nsimd = SpinColourVector_v::Nsimd();

  // Open one read view per input field. Views obtained by calling View()
  // directly are not released automatically (unlike autoView), so each one is
  // closed explicitly once the kernel has run.
  typedef decltype(loop1[0].View(AcceleratorRead)) View;
  std::vector<View> v1, v2;
  v1.reserve(Nk);
  v2.reserve(Nk);
  for (int k = 0; k < Nk; k++) {
    v1.push_back(loop1[k].View(AcceleratorRead));
    v2.push_back(loop2[k].View(AcceleratorRead));
  }

  // Tables of per-mode base pointers that the kernel can index by k.
  deviceVector<SpinColourVector_v *> l1p(Nk), l2p(Nk);
  for (int k = 0; k < Nk; k++) {
    acceleratorPut(l1p[k], &v1[k][0]);
    acceleratorPut(l2p[k], &v2[k][0]);
  }

  {
    // Scoped so that the write view on 'loop' is released before the input
    // views are closed below.
    autoView(loopv, loop, AcceleratorWrite);

    // Plain pointers and a plain int are what the kernel captures.
    SpinColourVector_v **l1 = &l1p[0];
    SpinColourVector_v **l2 = &l2p[0];
    int lNk = Nk;
    accelerator_for(ss, oSites, Nsimd, {
      auto result = outerProduct(coalescedRead(l1[0][ss]), coalescedRead(l2[0][ss]));
      for (int k = 1; k < lNk; k++)
        result = result + outerProduct(coalescedRead(l1[k][ss]), coalescedRead(l2[k][ss]));
      coalescedWrite(loopv[ss], result);
    });
  }

  // NOTE(review): relies on accelerator_for having completed when it returns
  // (the blocking variant, as opposed to accelerator_forNB) -- confirm.
  for (int k = 0; k < Nk; k++) {
    v1[k].ViewClose();
    v2[k].ViewClose();
  }
}
void A2APackLeftConjugated(FermionField &out, const FermionField &in)