mirror of
https://github.com/paboyle/Grid.git
synced 2026-06-06 04:04:36 +01:00
Compare commits
59 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 8540b2a85d | |||
| dbd3a0e612 | |||
| 5b58d1da62 | |||
| 377db1bc08 | |||
| 699564997e | |||
| f2750fae09 | |||
| ed12fa09c5 | |||
| b914403bbe | |||
| c1566fb9a2 | |||
| 905da6f083 | |||
| cb199c127c | |||
| 1a932ea33b | |||
| 5822a6599c | |||
| 0eeb334fe0 | |||
| d8d16407e9 | |||
| 12e3499b6d | |||
| 9576011011 | |||
| 155b34c1aa | |||
| 982ffe9ebe | |||
| 0251ecaeab | |||
| 372a27d645 | |||
| 72b4a061f3 | |||
| 29198efabe | |||
| 50aa51f93a | |||
| 79ccc81a86 | |||
| 3f0fdbb597 | |||
| ea57bd8f03 | |||
| bdba5b8403 | |||
| 58cc6ca9c0 | |||
| e5996b440d | |||
| ad9d03fd85 | |||
| 4de160ce20 | |||
| fc8c8ce6e7 | |||
| ddbb7f07c8 | |||
| 1e29c59bcc | |||
| b6abdc3845 | |||
| 2fadd8bb62 | |||
| 60df2dd5d0 | |||
| 66b529b345 | |||
| 1304172a93 | |||
| 1315d4604d | |||
| a31af31328 | |||
| 26c3c7d8f9 | |||
| 0650d7c7eb | |||
| 068f95ad2d | |||
| f4fbf7c9ca | |||
| 843d6497b2 | |||
| 747c167658 | |||
| fca2c5dba0 | |||
| e12bc7f07c | |||
| dc6ae51cab | |||
| baa70d8ec9 | |||
| c93b338bdd | |||
| c0472aa0ec | |||
| 09552cfd73 | |||
| 003fec509c | |||
| 773a82d87f | |||
| 286c29d6fb | |||
| 969b0a3922 |
@@ -30,9 +30,6 @@ Key configure options:
|
||||
| `--enable-Nc=` | `3` (default), `2`, `4`, `5` |
|
||||
| `--with-gmp=`, `--with-mpfr=`, `--with-fftw=`, `--with-lime=` | paths to libs |
|
||||
| `--enable-hdf5`, `--enable-mkl`, `--enable-lapack` | optional features |
|
||||
| `--enable-debug=yes` | adds `-g`, removes `-O3` |
|
||||
|
||||
Use `make V=1` for verbose compiler output (shows full flags; required for bug reports).
|
||||
|
||||
Platform recipes from `README.md`:
|
||||
- **KNL**: `--enable-simd=KNL --enable-comms=mpi3-auto --enable-mkl`
|
||||
@@ -99,17 +96,3 @@ GPU support is injected via macros (`accelerator_for`, `accelerator_for2dNB`). T
|
||||
- The `RealD`/`RealF`/`ComplexD`/`ComplexF` typedefs are used everywhere; avoid raw `double`/`float`.
|
||||
- Logging uses `Grid_log`, `Grid_error` macros (from `Grid/log/`); performance-critical paths use the `GRID_TRACE` / timer macros from `Grid/perfmon/`.
|
||||
- Reductions across MPI ranks go through `GridBase::GlobalSum` / `GlobalMax`; never reduce with bare MPI calls inside library code.
|
||||
|
||||
## Skills
|
||||
|
||||
`skills/` contains seven user-invocable Claude Code skills encoding deep domain knowledge for HPC work in this repo. Invoke with `/skill-name` or ask Claude to use them by name:
|
||||
|
||||
| Skill | When to use |
|
||||
|-------|-------------|
|
||||
| `gpu-memory-performance` | Bandwidth/occupancy problems — `acceleratorThreads()` pitfalls, `coalescedRead/Write`, fused vs staged HBM patterns |
|
||||
| `gpu-runtime-correctness` | Wrong answers, non-deterministic results, premature `q.wait()` returns |
|
||||
| `communication-overlap` | Designing GPU+MPI overlap pipelines; replacing broken accelerator-aware MPI paths with host-staged 7-phase pipeline |
|
||||
| `mpi-heterogeneous` | Collective hangs, buffer aliasing in `MPI_Sendrecv`, heterogeneous topology bugs |
|
||||
| `hang-diagnosis` | Distinguishing ioctl hangs, infinite poll loops, collective deadlocks, and silent wrong-answer races |
|
||||
| `correctness-verification` | Reproducibility checksums, double-wait testing, bisecting non-deterministic failures |
|
||||
| `compiler-validation` | Confirming compiler/optimisation flags are safe before production runs |
|
||||
|
||||
@@ -53,6 +53,7 @@ NAMESPACE_CHECK(approx);
|
||||
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
|
||||
// Not really deflation, but useful
|
||||
#include <Grid/algorithms/blas/MomentumProject.h>
|
||||
#include <Grid/algorithms/blas/A2ASpatialSum.h>
|
||||
NAMESPACE_CHECK(deflation);
|
||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||
NAMESPACE_CHECK(ConjGrad);
|
||||
|
||||
@@ -0,0 +1,213 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: Grid/algorithms/blas/A2ASpatialSum.h
|
||||
|
||||
Copyright (C) 2025
|
||||
|
||||
Author: Peter Boyle <pboyle@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*
|
||||
A2ASpatialSum
|
||||
|
||||
Replaces the scalar spatial accumulation loop in A2A extended meson field
|
||||
contractions with a batched GEMM over local time slices, enabling GPU offload.
|
||||
|
||||
Given:
|
||||
leftv[N_i][osite] — conjugated left SpinColourVectors (SIMD-packed)
|
||||
loopRight[N_j][osite]— type-contracted right SpinColourVectors (SIMD-packed)
|
||||
|
||||
Computes:
|
||||
EMF[i,j,t] = sum_{x,s,c} leftv[i][x,t,s,c] * loopRight[j][x,t,s,c]
|
||||
|
||||
via batched GEMM over nt local time slices, then GlobalSumVector across MPI.
|
||||
|
||||
Memory layout (all C row-major):
|
||||
W_buf [nt][N_i][nxyz*Nsc] — W[t][i][x*Nsc+sc] = leftv[i] at (x,t)
|
||||
LR_buf [nt][N_j][nxyz*Nsc] — LR[t][j][x*Nsc+sc] = loopRight[j] at (x,t)
|
||||
EMF_buf[nt][N_j][N_i] — column-major result; EMF[i,j,t] = EMF_buf[t][j][i]
|
||||
|
||||
BLAS call (column-major, OP_T on A so A is read as W[i][k]):
|
||||
C = A^T * B where A=W[N_i×K C-row], B=LR[N_j×K C-row], C=[N_j×N_i C-row]
|
||||
→ C[i,j] = sum_k W[i][k] * LR[j][k] = EMF[i,j] ✓
|
||||
*/
|
||||
template<class vobj>
|
||||
class A2ASpatialSum
|
||||
{
|
||||
public:
|
||||
typedef typename vobj::scalar_type scalar;
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
|
||||
GridBase *grid;
|
||||
int N_i, N_j;
|
||||
int nt, nxyz, Nsc;
|
||||
|
||||
deviceVector<scalar> W_buf;
|
||||
deviceVector<scalar> LR_buf;
|
||||
deviceVector<scalar> EMF_buf;
|
||||
deviceVector<scalar *> W_ptrs;
|
||||
deviceVector<scalar *> LR_ptrs;
|
||||
deviceVector<scalar *> EMF_ptrs;
|
||||
|
||||
A2ASpatialSum() : grid(nullptr), N_i(0), N_j(0), nt(0), nxyz(0), Nsc(0) {}
|
||||
|
||||
void Allocate(int _N_i, int _N_j, GridBase *_grid)
|
||||
{
|
||||
grid = _grid;
|
||||
N_i = _N_i;
|
||||
N_j = _N_j;
|
||||
Coordinate ldims = grid->LocalDimensions();
|
||||
nt = ldims[grid->Nd() - 1];
|
||||
nxyz = grid->lSites() / nt;
|
||||
Nsc = sizeof(sobj) / sizeof(scalar);
|
||||
|
||||
W_buf.resize(nt * N_i * nxyz * Nsc);
|
||||
LR_buf.resize(nt * N_j * nxyz * Nsc);
|
||||
EMF_buf.resize(nt * N_j * N_i);
|
||||
|
||||
// Build persistent batch pointer arrays
|
||||
W_ptrs.resize(nt);
|
||||
LR_ptrs.resize(nt);
|
||||
EMF_ptrs.resize(nt);
|
||||
scalar *Wh = &W_buf[0];
|
||||
scalar *LRh = &LR_buf[0];
|
||||
scalar *EMFh = &EMF_buf[0];
|
||||
int lN_i = N_i, lN_j = N_j, lnxyz = nxyz, lNsc = Nsc;
|
||||
for (int t = 0; t < nt; t++) {
|
||||
acceleratorPut(W_ptrs[t], Wh + t * lN_i * lnxyz * lNsc);
|
||||
acceleratorPut(LR_ptrs[t], LRh + t * lN_j * lnxyz * lNsc);
|
||||
acceleratorPut(EMF_ptrs[t], EMFh + t * lN_j * lN_i);
|
||||
}
|
||||
}
|
||||
|
||||
void PackLeft(const std::vector<Lattice<vobj>> &leftv)
|
||||
{
|
||||
GRID_ASSERT((int)leftv.size() == N_i);
|
||||
PackVectors(leftv, &W_buf[0], N_i);
|
||||
}
|
||||
|
||||
void PackRight(const std::vector<Lattice<vobj>> &loopRight)
|
||||
{
|
||||
GRID_ASSERT((int)loopRight.size() == N_j);
|
||||
PackVectors(loopRight, &LR_buf[0], N_j);
|
||||
}
|
||||
|
||||
private:
|
||||
// Pack vecs[N] lattice fields into buf[nt][N][nxyz*Nsc], extracting all SIMD lanes.
|
||||
void PackVectors(const std::vector<Lattice<vobj>> &vecs, scalar *buf, int N)
|
||||
{
|
||||
int nd = grid->_ndimension;
|
||||
int osites = grid->oSites();
|
||||
int Nsimd = vobj::Nsimd();
|
||||
int lN = N;
|
||||
int lNsc = Nsc;
|
||||
int lnxyz = nxyz;
|
||||
Coordinate rdimensions = grid->_rdimensions;
|
||||
Coordinate ldims = grid->LocalDimensions();
|
||||
Coordinate simd = grid->_simd_layout;
|
||||
|
||||
for (int n = 0; n < N; n++) {
|
||||
autoView(src_v, vecs[n], AcceleratorRead);
|
||||
accelerator_for(sf, osites, Nsimd, {
|
||||
#ifdef GRID_SIMT
|
||||
{
|
||||
int lane = acceleratorSIMTlane(Nsimd);
|
||||
#else
|
||||
for (int lane = 0; lane < Nsimd; lane++) {
|
||||
#endif
|
||||
Coordinate icoor(nd), ocoor(nd), lcoor(nd);
|
||||
Lexicographic::CoorFromIndex(icoor, lane, simd);
|
||||
Lexicographic::CoorFromIndex(ocoor, sf, rdimensions);
|
||||
for (int d = 0; d < nd; d++)
|
||||
lcoor[d] = rdimensions[d] * icoor[d] + ocoor[d];
|
||||
|
||||
int l_t = lcoor[nd - 1];
|
||||
Coordinate xyz_coor = lcoor;
|
||||
xyz_coor[nd - 1] = 0;
|
||||
int64_t l_xyz;
|
||||
Lexicographic::IndexFromCoor(xyz_coor, l_xyz, ldims);
|
||||
|
||||
sobj data = extractLane(lane, src_v[sf]);
|
||||
scalar *data_s = (scalar *)&data;
|
||||
|
||||
int64_t base = (int64_t)l_t * lN * lnxyz * lNsc
|
||||
+ (int64_t)n * lnxyz * lNsc
|
||||
+ l_xyz * lNsc;
|
||||
for (int sc = 0; sc < lNsc; sc++)
|
||||
buf[base + sc] = data_s[sc];
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
// Batched GEMM + MPI reduction → result[nt_global][N_i][N_j]
|
||||
//
|
||||
// BLAS (column-major, OP_T on A):
|
||||
// C[N_j×N_i] = A^T[N_i×K] * B[N_j×K] with K=nxyz*Nsc
|
||||
// reading A as C row-major [N_i][K] and B as C row-major [N_j][K]
|
||||
// → C[i,j] = sum_k W[i,k] * LR[j,k] = EMF[i,j] ✓
|
||||
void Sum(Eigen::Tensor<ComplexD, 3> &result)
|
||||
{
|
||||
GridBLAS BLAS;
|
||||
|
||||
int K = nxyz * Nsc;
|
||||
BLAS.gemmBatched(GridBLAS_OP_T, GridBLAS_OP_N,
|
||||
N_i, N_j, K,
|
||||
scalar(1.0),
|
||||
W_ptrs,
|
||||
LR_ptrs,
|
||||
scalar(0.0),
|
||||
EMF_ptrs);
|
||||
BLAS.synchronise();
|
||||
|
||||
// Copy from device and distribute into global-t layout
|
||||
int nt_global = result.dimension(0);
|
||||
int nd = grid->Nd();
|
||||
int lt_start = grid->LocalStarts()[nd - 1];
|
||||
|
||||
std::vector<scalar> host_emf(nt * N_j * N_i);
|
||||
acceleratorCopyFromDevice(&EMF_buf[0], host_emf.data(),
|
||||
nt * N_j * N_i * sizeof(scalar));
|
||||
|
||||
// EMF_buf[t][j*N_i + i] = EMF[i,j] for local t
|
||||
std::vector<scalar> global_emf(nt_global * N_i * N_j, scalar(0.0));
|
||||
for (int lt = 0; lt < nt; lt++) {
|
||||
int gt = lt + lt_start;
|
||||
for (int i = 0; i < N_i; i++)
|
||||
for (int j = 0; j < N_j; j++)
|
||||
global_emf[gt * N_i * N_j + i * N_j + j] = host_emf[lt * N_j * N_i + j * N_i + i];
|
||||
}
|
||||
grid->GlobalSumVector(global_emf.data(), nt_global * N_i * N_j);
|
||||
|
||||
for (int gt = 0; gt < nt_global; gt++)
|
||||
for (int i = 0; i < N_i; i++)
|
||||
for (int j = 0; j < N_j; j++)
|
||||
result(gt, i, j) = global_emf[gt * N_i * N_j + i * N_j + j];
|
||||
}
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
@@ -288,21 +288,17 @@ public:
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
if ( Nm>Nk ) {
|
||||
Glog <<" #Ritz values of poly(A) before shift ["<<Nm<<" values]:"<<std::endl;
|
||||
for(int i=0; i<Nm; ++i){
|
||||
std::cout.precision(8);
|
||||
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
std::cout << "Rval = "<<std::setw(16)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
}
|
||||
|
||||
// Glog <<" #Apply shifted QR transformations "<<std::endl;
|
||||
//int k2 = Nk+Nu;
|
||||
int k2 = Nk;
|
||||
|
||||
|
||||
Eigen::MatrixXcd BTDM = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||
Q = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||
|
||||
|
||||
unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
|
||||
|
||||
for(int ip=Nk; ip<Nm; ++ip){
|
||||
Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
|
||||
shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
|
||||
}
|
||||
|
||||
@@ -330,11 +326,11 @@ public:
|
||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
||||
_sort.push(eval2,Nk);
|
||||
Glog << "#Ritz values of poly(A) after shift ["<<Nk<<" values]:"<<std::endl;
|
||||
// Glog << "#Ritz value after shift: "<< std::endl;
|
||||
for(int i=0; i<Nk; ++i){
|
||||
std::cout.precision(8);
|
||||
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
std::cout << "Rval = "<<std::setw(16)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
// std::cout.precision(13);
|
||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
}
|
||||
}
|
||||
//----------------------------------------------------------------------
|
||||
@@ -714,17 +710,11 @@ private:
|
||||
//lme[0][L] = beta;
|
||||
|
||||
for (int u=0; u<Nu; ++u) {
|
||||
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
|
||||
GRID_ASSERT (!isnan(norm2(w[u])));
|
||||
}
|
||||
// Diagnostic: print alpha (lmd) and beta (lme) block diagonals for this step
|
||||
{
|
||||
Glog << " blk b="<<std::setw(3)<<b<<" alpha:";
|
||||
for (int u=0; u<Nu; ++u)
|
||||
std::cout << " " << std::setw(10) << std::setprecision(6) << real(lmd[u][L+u]);
|
||||
std::cout << " |beta|:";
|
||||
for (int u=0; u<Nu; ++u)
|
||||
std::cout << " " << std::setw(10) << std::setprecision(6) << abs(lme[u][L+u]);
|
||||
std::cout << std::endl;
|
||||
for (int k=L+u; k<R; ++k) {
|
||||
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
||||
}
|
||||
}
|
||||
// Glog << "LinAlg done "<< std::endl;
|
||||
|
||||
|
||||
+28
-23
@@ -32,6 +32,33 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
//////////////////////////////////////
|
||||
// innerProductD scalar overloads must be visible before norm2 is defined
|
||||
//////////////////////////////////////
|
||||
accelerator_inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); }
|
||||
accelerator_inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline RealD innerProductD(const RealD &l,const RealD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline RealD innerProductD(const RealF &l,const RealF &r){ return innerProduct(l,r); }
|
||||
|
||||
accelerator_inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline vRealD innerProductD(const vRealD &l,const vRealD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r)
|
||||
{
|
||||
vComplexD la,lb;
|
||||
vComplexD ra,rb;
|
||||
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
|
||||
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
|
||||
return innerProduct(la,ra) + innerProduct(lb,rb);
|
||||
}
|
||||
accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r)
|
||||
{
|
||||
vRealD la,lb;
|
||||
vRealD ra,rb;
|
||||
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
|
||||
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
|
||||
return innerProduct(la,ra) + innerProduct(lb,rb);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// innerProduct Scalar x Scalar -> Scalar
|
||||
// innerProduct Vector x Vector -> Scalar
|
||||
@@ -138,30 +165,8 @@ auto Reduce (const iScalar<l>& lhs) -> iScalar<decltype(Reduce(lhs._internal))>
|
||||
|
||||
//////////////////////////////////////
|
||||
// innerProductD : if single promote to double and evaluate with sum 2x
|
||||
// (scalar/vector overloads are declared above norm2 for ADL visibility)
|
||||
//////////////////////////////////////
|
||||
accelerator_inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); }
|
||||
accelerator_inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline RealD innerProductD(const RealD &l,const RealD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline RealD innerProductD(const RealF &l,const RealF &r){ return innerProduct(l,r); }
|
||||
|
||||
accelerator_inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline vRealD innerProductD(const vRealD &l,const vRealD &r){ return innerProduct(l,r); }
|
||||
accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r)
|
||||
{
|
||||
vComplexD la,lb;
|
||||
vComplexD ra,rb;
|
||||
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
|
||||
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
|
||||
return innerProduct(la,ra) + innerProduct(lb,rb);
|
||||
}
|
||||
accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r)
|
||||
{
|
||||
vRealD la,lb;
|
||||
vRealD ra,rb;
|
||||
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
|
||||
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
|
||||
return innerProduct(la,ra) + innerProduct(lb,rb);
|
||||
}
|
||||
|
||||
// Now do it for vector, matrix, scalar
|
||||
template<class l,class r,int N> accelerator_inline
|
||||
|
||||
@@ -33,6 +33,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
@@ -96,49 +97,19 @@ int main (int argc, char ** argv)
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
|
||||
NaiveStaggeredFermionD Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
|
||||
|
||||
FermionField src_e(&RBGrid); pickCheckerboard(Even, src_e, src);
|
||||
FermionField src_o(&RBGrid); pickCheckerboard(Odd, src_o, src);
|
||||
FermionField res_e(&RBGrid); res_e = Zero();
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
|
||||
int ncall=1000;
|
||||
|
||||
// ImprovedStaggered Dhop
|
||||
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
|
||||
for(int i=0;i<ncall;i++){
|
||||
Ds.Dhop(src,result,0);
|
||||
}
|
||||
double t1=usecond();
|
||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // 1146 flops/site
|
||||
std::cout<<GridLogMessage << ncall << " ImprovedStaggered Dhop calls in "<< (t1-t0)<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "ImprovedStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
// ImprovedStaggered DhopEO
|
||||
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
|
||||
t1=usecond();
|
||||
flops=(16*(3*(6+8+8)) + 15*3*2)*(volume/2)*ncall;
|
||||
std::cout<<GridLogMessage << ncall << " ImprovedStaggered DhopEO calls in "<< (t1-t0)<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "ImprovedStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
// NaiveStaggered Dhop
|
||||
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
|
||||
t1=usecond();
|
||||
flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
|
||||
std::cout<<GridLogMessage << ncall << " NaiveStaggered Dhop calls in "<< (t1-t0)<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "NaiveStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
// NaiveStaggered DhopEO
|
||||
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
|
||||
t1=usecond();
|
||||
flops=(8*(3*(6+8+8)) + 7*3*2)*(volume/2)*ncall;
|
||||
std::cout<<GridLogMessage << ncall << " NaiveStaggered DhopEO calls in "<< (t1-t0)<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "NaiveStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||
|
||||
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
@@ -96,45 +96,22 @@ int main (int argc, char ** argv)
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
ImprovedStaggeredFermionF Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
|
||||
NaiveStaggeredFermionF Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
|
||||
|
||||
FermionField src_e(&RBGrid); pickCheckerboard(Even, src_e, src);
|
||||
FermionField src_o(&RBGrid); pickCheckerboard(Odd, src_o, src);
|
||||
FermionField res_e(&RBGrid); res_e = Zero();
|
||||
|
||||
int ncall=10000;
|
||||
|
||||
// ImprovedStaggered Dhop
|
||||
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
|
||||
int ncall=1000;
|
||||
for(int i=0;i<ncall;i++){
|
||||
Ds.Dhop(src,result,0);
|
||||
}
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
|
||||
for(int i=0;i<ncall;i++){
|
||||
Ds.Dhop(src,result,0);
|
||||
}
|
||||
double t1=usecond();
|
||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // 1146 flops/site
|
||||
std::cout<<GridLogMessage << "ImprovedStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
// ImprovedStaggered DhopEO
|
||||
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
|
||||
t1=usecond();
|
||||
flops=(16*(3*(6+8+8)) + 15*3*2)*(volume/2)*ncall;
|
||||
std::cout<<GridLogMessage << "ImprovedStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
// NaiveStaggered Dhop
|
||||
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
|
||||
t1=usecond();
|
||||
flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
|
||||
std::cout<<GridLogMessage << "NaiveStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
// NaiveStaggered DhopEO
|
||||
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
|
||||
t1=usecond();
|
||||
flops=(8*(3*(6+8+8)) + 7*3*2)*(volume/2)*ncall;
|
||||
std::cout<<GridLogMessage << "NaiveStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||
|
||||
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
@@ -716,161 +716,6 @@ public:
|
||||
return mflops_best;
|
||||
}
|
||||
|
||||
static double NaiveStaggered(int L)
|
||||
{
|
||||
double mflops;
|
||||
double mflops_best = 0;
|
||||
double mflops_worst= 0;
|
||||
std::vector<double> mflops_all;
|
||||
|
||||
///////////////////////////////////////////////////////
|
||||
// Set/Get the layout & grid size
|
||||
///////////////////////////////////////////////////////
|
||||
int threads = GridThread::GetThreads();
|
||||
Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
|
||||
Coordinate local({L,L,L,L});
|
||||
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
|
||||
|
||||
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
GridDefaultMpi());
|
||||
uint64_t NP = TmpGrid->RankCount();
|
||||
uint64_t NN = TmpGrid->NodeCount();
|
||||
NN_global=NN;
|
||||
uint64_t SHM=NP/NN;
|
||||
|
||||
|
||||
///////// Welcome message ////////////
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "Benchmark NaiveStaggered on "<<L<<"^4 local volume "<<std::endl;
|
||||
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
|
||||
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
|
||||
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
|
||||
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
|
||||
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
|
||||
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
|
||||
///////// Lattice Init ////////////
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
|
||||
|
||||
///////// RNG Init ////////////
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD c1=9.0/8.0;
|
||||
RealD c2=-1.0/24.0;
|
||||
RealD u0=1.0;
|
||||
|
||||
typedef NaiveStaggeredFermionF Action;
|
||||
typedef typename Action::FermionField Fermion;
|
||||
typedef LatticeGaugeFieldF Gauge;
|
||||
|
||||
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
|
||||
typename Action::ImplParams params;
|
||||
Action Ds(Umu,*FGrid,*FrbGrid,mass,c1,u0,params);
|
||||
|
||||
///////// Source preparation ////////////
|
||||
Fermion src (FGrid); random(RNG4,src);
|
||||
Fermion src_e (FrbGrid);
|
||||
Fermion src_o (FrbGrid);
|
||||
Fermion r_e (FrbGrid);
|
||||
Fermion r_o (FrbGrid);
|
||||
Fermion r_eo (FGrid);
|
||||
|
||||
{
|
||||
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
pickCheckerboard(Odd,src_o,src);
|
||||
|
||||
const int num_cases = 2;
|
||||
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
|
||||
|
||||
controls Cases [] = {
|
||||
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||
{ StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
|
||||
{ StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
|
||||
};
|
||||
|
||||
for(int c=0;c<num_cases;c++) {
|
||||
|
||||
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
|
||||
StaggeredKernelsStatic::Opt = Cases[c].Opt;
|
||||
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
|
||||
int nwarm = 10;
|
||||
double t0=usecond();
|
||||
FGrid->Barrier();
|
||||
for(int i=0;i<nwarm;i++){
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
|
||||
uint64_t no = 50;
|
||||
uint64_t ni = 100;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
t0=usecond();
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=((8*(3*(6+8+8)) + 7*3*2)*1.0*volume)/2;
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
|
||||
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
|
||||
std::cout<<GridLogMessage <<fmt << std::endl;
|
||||
std::cout<<GridLogMessage ;
|
||||
|
||||
for(int i=0;i<mflops_all.size();i++){
|
||||
std::cout<<mflops_all[i]/NN<<" ; " ;
|
||||
}
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
return mflops_best;
|
||||
}
|
||||
|
||||
static double Clover(int L)
|
||||
{
|
||||
double mflops;
|
||||
@@ -1042,7 +887,6 @@ int main (int argc, char ** argv)
|
||||
std::vector<double> clover;
|
||||
std::vector<double> dwf4;
|
||||
std::vector<double> staggered;
|
||||
std::vector<double> naive_staggered;
|
||||
|
||||
int Ls=1;
|
||||
if (do_dslash){
|
||||
@@ -1070,21 +914,13 @@ int main (int argc, char ** argv)
|
||||
staggered.push_back(result);
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Naive Staggered dslash 4D vectorised" <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
for(int l=0;l<L_list.size();l++){
|
||||
double result = Benchmark::NaiveStaggered(L_list[l]) ;
|
||||
naive_staggered.push_back(result);
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered \t\t Naive Staggered" <<std::endl;
|
||||
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
|
||||
for(int l=0;l<L_list.size();l++){
|
||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<" \t\t "<<naive_staggered[l]<<std::endl;
|
||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
}
|
||||
@@ -1094,14 +930,14 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered \t\t NaiveStag \t|\t (GF/s per node)" <<std::endl;
|
||||
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
|
||||
fprintf(FP,"Per node summary table\n");
|
||||
fprintf(FP,"\n");
|
||||
fprintf(FP,"L , Wilson, DWF4, Staggered, NaiveStag\n");
|
||||
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
|
||||
fprintf(FP,"\n");
|
||||
for(int l=0;l<L_list.size();l++){
|
||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<" \t " <<naive_staggered[l]/NN<<std::endl;
|
||||
fprintf(FP,"%d , %.0f, %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.,naive_staggered[l]/NN/1000.);
|
||||
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
|
||||
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
|
||||
}
|
||||
fprintf(FP,"\n");
|
||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
---
|
||||
name: ref_a2a_emf_work
|
||||
description: "A2A Extended Meson Field GPU offload work — status, file locations, pending task"
|
||||
metadata:
|
||||
node_type: memory
|
||||
type: project
|
||||
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
|
||||
---
|
||||
|
||||
## What was built
|
||||
|
||||
`Grid/algorithms/blas/A2ASpatialSum.h` — batched GEMM spatial sum replacing scalar SIMD accumulation. Included via `Grid/algorithms/Algorithms.h`.
|
||||
|
||||
`tests/Test_extended_meson_field.cc` — test with class `A2AExtendedMesonFieldRef` containing:
|
||||
- CPU reference path (`use_blas=false`)
|
||||
- BLAS path (`use_blas=true`) using `A2ASpatialSum`
|
||||
- Per-phase timing with `[ref type=N]` / `[blas type=N]` labels
|
||||
- 4 contraction types (0-3), all verified at machine precision (~4e-16 rel_err)
|
||||
|
||||
## Pending task: GPU offload class
|
||||
|
||||
**Goal**: Write `A2AExtendedMesonFieldGPU` in the same test file, replacing all `thread_for` loops with `accelerator_for`-based free function kernels.
|
||||
|
||||
The `thread_for` blocks to replace all have the form:
|
||||
```cpp
|
||||
thread_for(r, rd, {
|
||||
int so = r * grid->_ostride[orthogdim];
|
||||
for (int n = 0; n < e1; n++)
|
||||
for (int b = 0; b < e2; b++) {
|
||||
int ss = so + n * stride + b;
|
||||
// work
|
||||
}
|
||||
});
|
||||
```
|
||||
Replace with `accelerator_for(ss, grid->oSites(), Nsimd, { ... })`.
|
||||
|
||||
**Free functions to write** (each takes `Lattice<T>` args, opens views internally):
|
||||
- `A2ALoopPropagator` — outerProduct sum (loop build)
|
||||
- `A2APackLeftConjugated` — conjugate left fermion fields into `Lattice<SpinColourVector_v>`
|
||||
- `A2ALoopLeftContractionType0/1/2/3` — per-site loop × loop propagator → `tloop`
|
||||
- `A2ALoopRightContractionType0/1/2/3` — per-site tloop × right → `loopRight[j]`
|
||||
|
||||
**Data structure changes required**:
|
||||
- `tloopv`: `std::vector<SpinColourMatrix_v>` → `Lattice<SpinColourMatrix_v>` (PropagatorField)
|
||||
- `leftv[i]`: `std::vector<SpinColourVector_v>` → `Lattice<SpinColourVector_v>`
|
||||
- `loopRight[j]`: `std::vector<SpinColourVector_v>` → `Lattice<SpinColourVector_v>`
|
||||
|
||||
**Why**: `std::vector<vobj>` is host memory, not GPU accessible. See [[ref_lattice_vs_vector]].
|
||||
|
||||
**`A2ASpatialSum` impact**: `PackLeft`/`PackRight` currently take `std::vector<std::vector<vobj>>`. Once leftv/loopRight become `std::vector<Lattice<vobj>>`, those signatures must change to match.
|
||||
|
||||
## Timing on 8.8.8.16 (N_i=N_j=8, Nloop=4, 1 MPI rank)
|
||||
|
||||
Dominant costs:
|
||||
- `loop_build`: 4-6 ms (outerProduct over 4 propagators)
|
||||
- `pack_loopright`: 0.9-2.2 ms (type-dependent)
|
||||
- `spatial_sum` (ref): ~1.5 ms
|
||||
- `A2ASpatialSum TOTAL`: 2.5-4.3 ms (PackLeft+PackRight dominate GEMM on small volume)
|
||||
|
||||
## Related
|
||||
[[ref_accelerator_for]] [[ref_coalesced_views]] [[ref_lattice_vs_vector]] [[ref_grid_simt_pattern]]
|
||||
@@ -0,0 +1,43 @@
|
||||
---
|
||||
name: ref_accelerator_for
|
||||
description: Grid accelerator_for usage — converting block-strided thread_for to GPU-portable oSites loops
|
||||
metadata:
|
||||
node_type: memory
|
||||
type: reference
|
||||
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
|
||||
---
|
||||
|
||||
## Pattern: block-strided thread_for → accelerator_for over oSites
|
||||
|
||||
Old CPU-only pattern (block-strided over orthog dimension):
|
||||
```cpp
|
||||
thread_for(r, rd, {
|
||||
int so = r * grid->_ostride[orthogdim];
|
||||
for (int n = 0; n < e1; n++)
|
||||
for (int b = 0; b < e2; b++) {
|
||||
int ss = so + n * stride + b;
|
||||
// work on site ss
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
GPU-portable replacement:
|
||||
```cpp
|
||||
accelerator_for(ss, grid->oSites(), Nsimd, {
|
||||
// work on site ss — one SIMT thread per (osite, lane) on GPU
|
||||
// one thread per osite (lane loop implicit via GRID_SIMT) on CPU
|
||||
});
|
||||
```
|
||||
|
||||
Key rules:
|
||||
- `accelerator_for(iter, count, Nsimd, body)` — Nsimd is `vobj::Nsimd()` or `grid->Nsimd()`
|
||||
- On CPU: expands to `thread_for` over count, `acceleratorSIMTlane` always returns 0 — must use `#ifdef GRID_SIMT` pattern if iterating lanes explicitly (see [[ref_grid_simt_pattern]])
|
||||
- On GPU: one SIMT thread per (iter × lane), `acceleratorSIMTlane(Nsimd)` returns actual lane
|
||||
- Loop body must capture only scalar/POD by value or via device-accessible pointers; no `std::vector` or host containers inside the body
|
||||
- `Coordinate` inside `accelerator_for` must be `AcceleratorVector<int, MaxDims>` (stack-allocated, device-safe) — Grid's `Coordinate` typedef already satisfies this
|
||||
|
||||
## Where defined
|
||||
`Grid/threads/Accelerator.h` — CPU path ~line 607; GPU paths in conditional blocks above.
|
||||
|
||||
## Model file
|
||||
`Grid/algorithms/blas/MomentumProject.h` — `ImportVector` is the canonical example of correct `accelerator_for` + SIMD lane extraction.
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
name: ref_coalesced_views
|
||||
description: Grid coalescedRead/coalescedWrite and autoView — GPU-portable field access inside accelerator_for
|
||||
metadata:
|
||||
node_type: memory
|
||||
type: reference
|
||||
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
|
||||
---
|
||||
|
||||
## View access modes
|
||||
|
||||
```cpp
|
||||
autoView(v, field, AcceleratorRead); // read-only, device-accessible
|
||||
autoView(v, field, AcceleratorWrite); // write-only, device-accessible
|
||||
autoView(v, field, AcceleratorReadWrite); // read-write, device-accessible
|
||||
autoView(v, field, CpuRead); // CPU only (avoids GPU migration)
|
||||
autoView(v, field, CpuWrite); // CPU only
|
||||
```
|
||||
|
||||
Views must be opened **before** `accelerator_for` and closed (go out of scope) **after**. Never open a view inside the accelerator_for body.
|
||||
|
||||
## coalescedRead / coalescedWrite
|
||||
|
||||
Inside `accelerator_for(ss, oSites, Nsimd, { ... })`:
|
||||
|
||||
```cpp
|
||||
auto site = coalescedRead(v[ss]); // reads SIMT lane; returns scalar_object on GPU, vobj on CPU
|
||||
coalescedWrite(v[ss], site); // writes SIMT lane
|
||||
```
|
||||
|
||||
- `coalescedRead(v[ss])` calls `v.operator()(ss)` which on GPU returns `extractLane(lane, v[ss])` — one lane per SIMT thread, contiguous across threads → coalesced
|
||||
- On CPU returns the full vobj (no lane extraction needed; handled transparently)
|
||||
- The returned type is `decltype(coalescedRead(v[ss]))` — use `auto` or match with scalar_object
|
||||
|
||||
## Typical kernel pattern
|
||||
|
||||
```cpp
|
||||
autoView(out_v, out, AcceleratorWrite);
|
||||
autoView(in_v, in, AcceleratorRead);
|
||||
accelerator_for(ss, grid->oSites(), vobj::Nsimd(), {
|
||||
auto x = coalescedRead(in_v[ss]);
|
||||
// modify x ...
|
||||
coalescedWrite(out_v[ss], x);
|
||||
});
|
||||
```
|
||||
|
||||
## Free function kernel signature
|
||||
|
||||
```cpp
|
||||
template<class vobj>
|
||||
void MyKernel(Lattice<vobj> &out, const Lattice<vobj> &in)
|
||||
{
|
||||
GridBase *grid = in.Grid();
|
||||
autoView(out_v, out, AcceleratorWrite);
|
||||
autoView(in_v, in, AcceleratorRead);
|
||||
accelerator_for(ss, grid->oSites(), vobj::Nsimd(), {
|
||||
auto x = coalescedRead(in_v[ss]);
|
||||
coalescedWrite(out_v[ss], x);
|
||||
});
|
||||
}
|
||||
```
|
||||
|
||||
## What NOT to do
|
||||
- Do not access `std::vector` elements inside `accelerator_for` — not device-accessible
|
||||
- Do not use `CpuRead`/`CpuWrite` views inside `accelerator_for` — GPU will fault
|
||||
- Do not assign to `v[ss]` directly inside `accelerator_for` — use `coalescedWrite`
|
||||
- Do not open multiple write views on the same field simultaneously
|
||||
|
||||
## Related
|
||||
[[ref_accelerator_for]] [[ref_lattice_vs_vector]]
|
||||
@@ -0,0 +1,47 @@
|
||||
---
|
||||
name: ref_grid_simt_pattern
|
||||
description: Grid GRID_SIMT
|
||||
metadata:
|
||||
node_type: memory
|
||||
type: reference
|
||||
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
|
||||
---
|
||||
|
||||
## The problem
|
||||
|
||||
On CPU, `accelerator_for(sf, oSites, Nsimd, {...})` expands to `thread_for(sf, oSites, {...})` — one thread per osite. `acceleratorSIMTlane(Nsimd)` always returns **0** on CPU. If you need to iterate all Nsimd lanes (e.g. to extract SIMD-packed data), you must loop explicitly on CPU.
|
||||
|
||||
On GPU, `accelerator_for` launches one SIMT thread per (osite × lane). `acceleratorSIMTlane(Nsimd)` returns the actual lane index [0, Nsimd).
|
||||
|
||||
## Correct pattern (from MomentumProject::ImportVector)
|
||||
|
||||
```cpp
|
||||
accelerator_for(sf, osites, Nsimd, {
|
||||
#ifdef GRID_SIMT
|
||||
{
|
||||
int lane = acceleratorSIMTlane(Nsimd);
|
||||
#else
|
||||
for (int lane = 0; lane < Nsimd; lane++) {
|
||||
#endif
|
||||
// body using lane
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
- On GPU: `GRID_SIMT` is defined → single-lane body, lane from hardware
|
||||
- On CPU: `GRID_SIMT` is not defined → explicit lane loop inside the osite thread
|
||||
|
||||
## When is this needed?
|
||||
|
||||
Only when you explicitly need the lane index, e.g.:
|
||||
- Extracting scalar data from SIMD-packed `vobj` via `extractLane(lane, src[sf])`
|
||||
- Computing full local coordinates from (osite, lane) → `Lexicographic::CoorFromIndex(icoor, lane, simd_layout)`
|
||||
|
||||
When using `coalescedRead`/`coalescedWrite`, this pattern is **not needed** — those handle lane selection transparently.
|
||||
|
||||
## Pitfall that caused a bug
|
||||
|
||||
`A2ASpatialSum::PackVectors` originally used `accelerator_for` without the `#ifdef GRID_SIMT` lane loop. On CPU, only lane=0 was extracted, giving wrong norms (~8× too small for `GEN_SIMD_WIDTH=64`, `Nsimd=4`). Fix: add the `#ifdef GRID_SIMT` pattern. See [[ref_accelerator_for]].
|
||||
|
||||
## Model file
|
||||
`Grid/algorithms/blas/MomentumProject.h`, function `ImportVector`, lines ~166-207.
|
||||
@@ -0,0 +1,48 @@
|
||||
---
|
||||
name: ref_lattice_vs_vector
|
||||
description: When to use Lattice<T> vs std::vector<T> for GPU-portable field storage in Grid
|
||||
metadata:
|
||||
node_type: memory
|
||||
type: reference
|
||||
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
|
||||
---
|
||||
|
||||
## Rule
|
||||
|
||||
Use `Lattice<vobj>` (or `std::vector<Lattice<vobj>>`) for any field that will be read or written inside `accelerator_for`. `std::vector<vobj>` is host memory and is NOT device-accessible.
|
||||
|
||||
## Before vs after GPU offload
|
||||
|
||||
```cpp
|
||||
// CPU-only (host memory, not GPU accessible)
|
||||
std::vector<SpinColourVector_v> tloopv(oSites, Zero());
|
||||
// accessed directly: tloopv[ss]
|
||||
|
||||
// GPU-portable
|
||||
Lattice<SpinColourVector_v> tloop(grid);
|
||||
// accessed via view: autoView(tloop_v, tloop, AcceleratorWrite);
|
||||
// coalescedWrite(tloop_v[ss], val);
|
||||
```
|
||||
|
||||
## Corollary: function signatures
|
||||
|
||||
CPU-only version:
|
||||
```cpp
|
||||
void PackLeft(const std::vector<std::vector<vobj>> &leftv);
|
||||
```
|
||||
|
||||
GPU-portable version:
|
||||
```cpp
|
||||
void PackLeft(const std::vector<Lattice<vobj>> &leftv);
|
||||
```
|
||||
|
||||
## deviceVector for raw device buffers
|
||||
|
||||
`deviceVector<T>` (defined in Grid) is like `std::vector<T>` but in device-accessible memory. Use for raw scalar scratch/pack buffers (e.g. GEMM input/output staging). Not for structured lattice data.
|
||||
|
||||
## Pointer arrays for batched BLAS
|
||||
|
||||
`deviceVector<scalar *>` holds batch pointer arrays. Populate with `acceleratorPut(ptrs[t], base + offset)` — sets device-side pointer from host. See `A2ASpatialSum::Allocate`.
|
||||
|
||||
## Related
|
||||
[[ref_coalesced_views]] [[ref_accelerator_for]]
|
||||
@@ -1,91 +1,76 @@
|
||||
Per node summary table
|
||||
|
||||
L , Wilson, DWF4, Staggered, NaiveStag
|
||||
|
||||
8 , 90, 933, 38, 23
|
||||
12 , 403, 1688, 178, 113
|
||||
16 , 188, 1647, 449, 295
|
||||
24 , 947, 1574, 674, 553
|
||||
32 , 931, 1371, 718, 643
|
||||
|
||||
Memory Bandwidth
|
||||
|
||||
Bytes, GB/s per node
|
||||
786432, 40.271620
|
||||
12582912, 433.611792
|
||||
63700992, 905.374321
|
||||
201326592, 1114.979152
|
||||
491520000, 1180.241898
|
||||
6291456, 379.297050
|
||||
100663296, 3754.674992
|
||||
509607936, 6521.472413
|
||||
1610612736, 8513.456479
|
||||
3932160000, 9018.901766
|
||||
|
||||
|
||||
GEMM
|
||||
|
||||
M, N, K, BATCH, GF/s per rank
|
||||
16, 8, 16, 256, 0.564958
|
||||
16, 16, 16, 256, 243.148058
|
||||
16, 32, 16, 256, 440.346877
|
||||
32, 8, 32, 256, 439.194136
|
||||
32, 16, 32, 256, 847.334141
|
||||
32, 32, 32, 256, 1430.892623
|
||||
64, 8, 64, 256, 1242.756741
|
||||
64, 16, 64, 256, 2196.689493
|
||||
64, 32, 64, 256, 3697.458072
|
||||
16, 8, 256, 256, 899.582627
|
||||
16, 16, 256, 256, 1673.537756
|
||||
16, 32, 256, 256, 2959.597089
|
||||
32, 8, 256, 256, 1558.858630
|
||||
32, 16, 256, 256, 2864.839445
|
||||
32, 32, 256, 256, 4810.671254
|
||||
64, 8, 256, 256, 2386.092942
|
||||
64, 16, 256, 256, 4451.665937
|
||||
64, 32, 256, 256, 5942.124095
|
||||
8, 256, 16, 256, 799.867271
|
||||
16, 256, 16, 256, 1584.624888
|
||||
32, 256, 16, 256, 1949.422338
|
||||
8, 256, 32, 256, 1389.417474
|
||||
16, 256, 32, 256, 2668.344493
|
||||
32, 256, 32, 256, 3234.162120
|
||||
8, 256, 64, 256, 2150.925128
|
||||
16, 256, 64, 256, 4012.488132
|
||||
32, 256, 64, 256, 5154.785521
|
||||
|
||||
|
||||
|
||||
Communications
|
||||
|
||||
Packet bytes, direction, GB/s per node
|
||||
4718592, 1, 245.026198
|
||||
4718592, 2, 251.180996
|
||||
4718592, 3, 361.110977
|
||||
4718592, 5, 247.898447
|
||||
4718592, 6, 249.867523
|
||||
4718592, 7, 359.033061
|
||||
15925248, 1, 255.030946
|
||||
15925248, 2, 264.453890
|
||||
15925248, 3, 392.949183
|
||||
15925248, 5, 256.040644
|
||||
15925248, 6, 264.681896
|
||||
15925248, 7, 392.102622
|
||||
37748736, 1, 258.823333
|
||||
37748736, 2, 268.181577
|
||||
37748736, 3, 401.478191
|
||||
37748736, 5, 258.995363
|
||||
37748736, 6, 268.206586
|
||||
37748736, 7, 400.397611
|
||||
|
||||
|
||||
GEMM
|
||||
|
||||
M, N, K, BATCH, GF/s per rank fp64
|
||||
16, 8, 16, 4096, 693.316363
|
||||
16, 12, 16, 4096, 657.277058
|
||||
16, 16, 16, 4096, 711.992616
|
||||
32, 8, 32, 4096, 821.084324
|
||||
32, 12, 32, 4096, 1279.852719
|
||||
32, 16, 32, 4096, 2647.096674
|
||||
64, 8, 64, 4096, 2630.192325
|
||||
64, 12, 64, 4096, 3338.071321
|
||||
64, 16, 64, 4096, 3950.899281
|
||||
16, 8, 256, 4096, 1638.362501
|
||||
16, 12, 256, 4096, 2377.502234
|
||||
16, 16, 256, 4096, 3048.328833
|
||||
32, 8, 256, 4096, 2917.384276
|
||||
32, 12, 256, 4096, 4103.085151
|
||||
32, 16, 256, 4096, 5102.971860
|
||||
64, 8, 256, 4096, 3222.258206
|
||||
64, 12, 256, 4096, 4619.456391
|
||||
64, 16, 256, 4096, 5847.916650
|
||||
8, 256, 16, 4096, 1728.073337
|
||||
12, 256, 16, 4096, 2356.653970
|
||||
16, 256, 16, 4096, 2676.876038
|
||||
8, 256, 32, 4096, 2611.531990
|
||||
12, 256, 32, 4096, 3451.573106
|
||||
16, 256, 32, 4096, 3966.915301
|
||||
8, 256, 64, 4096, 3436.248737
|
||||
12, 256, 64, 4096, 4539.497945
|
||||
16, 256, 64, 4096, 5307.992323
|
||||
|
||||
|
||||
|
||||
GEMM
|
||||
|
||||
M, N, K, BATCH, GF/s per rank fp32
|
||||
16, 8, 16, 4096, 499.017445
|
||||
16, 12, 16, 4096, 731.543385
|
||||
16, 16, 16, 4096, 958.800786
|
||||
32, 8, 32, 4096, 1549.813550
|
||||
32, 12, 32, 4096, 2147.907502
|
||||
32, 16, 32, 4096, 2601.698596
|
||||
64, 8, 64, 4096, 3785.446233
|
||||
64, 12, 64, 4096, 5116.694843
|
||||
64, 16, 64, 4096, 6109.345016
|
||||
16, 8, 256, 4096, 1206.627737
|
||||
16, 12, 256, 4096, 1809.699599
|
||||
16, 16, 256, 4096, 2412.014053
|
||||
32, 8, 256, 4096, 2406.114488
|
||||
32, 12, 256, 4096, 3605.531907
|
||||
32, 16, 256, 4096, 4798.444037
|
||||
64, 8, 256, 4096, 4688.711196
|
||||
64, 12, 256, 4096, 6990.696301
|
||||
64, 16, 256, 4096, 9214.749925
|
||||
8, 256, 16, 4096, 2596.307289
|
||||
12, 256, 16, 4096, 3439.892562
|
||||
16, 256, 16, 4096, 3907.201036
|
||||
8, 256, 32, 4096, 3012.752067
|
||||
12, 256, 32, 4096, 3904.217583
|
||||
16, 256, 32, 4096, 4599.047092
|
||||
8, 256, 64, 4096, 3721.999042
|
||||
12, 256, 64, 4096, 5098.573927
|
||||
16, 256, 64, 4096, 6159.080872
|
||||
Per node summary table
|
||||
|
||||
L , Wilson, DWF4, Staggered, GF/s per node
|
||||
|
||||
8 , 155, 1386, 50
|
||||
12 , 694, 4208, 230
|
||||
16 , 1841, 6675, 609
|
||||
24 , 3934, 8573, 1641
|
||||
32 , 5083, 9771, 3086
|
||||
|
||||
|
||||
|
@@ -1,20 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Configure Grid from a build directory two levels below the source root,
|
||||
# e.g.: mkdir -p systems/mac-arm/build && cd systems/mac-arm/build && bash ../config-command-homebrew
|
||||
#
|
||||
# Prerequisites: source sourceme-homebrew.sh first.
|
||||
|
||||
CXX=mpicxx ../../configure \
|
||||
--enable-simd=GEN \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-unified=yes \
|
||||
--prefix="${HOME}/Grid-install" \
|
||||
--with-lime="${CLIME}" \
|
||||
--with-openssl="${OPENSSL}" \
|
||||
--with-gmp="${GMP}" \
|
||||
--with-mpfr="${MPFR}" \
|
||||
--with-fftw="${FFTW}" \
|
||||
--enable-Sp=no \
|
||||
--disable-fermion-reps \
|
||||
--disable-gparity \
|
||||
--disable-debug
|
||||
@@ -3,15 +3,14 @@
|
||||
CXX=mpicxx ../../configure \
|
||||
--enable-simd=GEN \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-Sp=no \
|
||||
--enable-unified=yes \
|
||||
--disable-fermion-reps \
|
||||
--disable-gparity \
|
||||
--with-fftw=$FFTW \
|
||||
--enable-unified=yes \
|
||||
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
|
||||
--prefix /Users/peterboyle/QCD/Grid-install \
|
||||
--with-lime=$CLIME \
|
||||
--with-openssl=$OPENSSL \
|
||||
--with-gmp=$GMP \
|
||||
--with-mpfr=$MPFR \
|
||||
--disable-debug
|
||||
--with-fftw=$FFTW \
|
||||
--disable-debug
|
||||
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Environment for building Grid on Apple Silicon Mac with Homebrew dependencies.
|
||||
# Usage: source systems/mac-arm/sourceme-homebrew.sh
|
||||
|
||||
HOMEBREW=/opt/homebrew
|
||||
|
||||
export GMP=${HOMEBREW}/opt/gmp
|
||||
export MPFR=${HOMEBREW}/opt/mpfr
|
||||
export FFTW=${HOMEBREW}/opt/fftw
|
||||
export OPENSSL=${HOMEBREW}/opt/openssl@3
|
||||
export CLIME=/usr/local
|
||||
|
||||
export PATH="${HOMEBREW}/opt/open-mpi/bin:${HOMEBREW}/bin:${PATH}"
|
||||
export LDFLAGS="-L${OPENSSL}/lib"
|
||||
export CPPFLAGS="-I${OPENSSL}/include"
|
||||
@@ -1,11 +1,7 @@
|
||||
source /Users/peterboyle/QCD//Spack/spack//share/spack/setup-env.sh
|
||||
|
||||
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
|
||||
#export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
|
||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' `
|
||||
export OPENSSL=`spack find --paths openssl | grep openssl | awk '{print $2}' `
|
||||
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
|
||||
|
||||
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
|
||||
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
|
||||
|
||||
@@ -0,0 +1,841 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: tests/Test_extended_meson_field.cc
|
||||
|
||||
Copyright (C) 2015-2025
|
||||
|
||||
Author: Peter Boyle <pboyle@bnl.gov>
|
||||
Author: Masaaki Tomii <masaaki.tomii@uconn.edu> (original Hadrons kernels)
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
#include "disable_tests_without_instantiations.h"
|
||||
#ifdef ENABLE_FERMION_INSTANTIATIONS
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/utils/A2Autils.h>
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
typedef WilsonImplD FImpl;
|
||||
typedef typename FImpl::FermionField FermionField;
|
||||
typedef typename FImpl::SiteSpinor vobj;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;
|
||||
typedef iSpinColourVector<vector_type> SpinColourVector_v;
|
||||
typedef iSpinMatrix<vector_type> SpinMatrix_v;
|
||||
typedef iSinglet<vector_type> Scalar_v;
|
||||
typedef iSinglet<scalar_type> Scalar_s;
|
||||
typedef Lattice<SpinColourMatrix_v> PropagatorField;
|
||||
|
||||
// CPU reference + optionally batched GEMM spatial sum, ported from
|
||||
// Hadrons/Modules/MContraction/A2AExtendedMesonField.hpp
|
||||
// (M. Tomii, mtomii/Hadrons:local-2025-edits). Hadrons infrastructure removed.
|
||||
// thread_for / CpuRead / orthogdim=3 preserved.
|
||||
class A2AExtendedMesonFieldRef
|
||||
{
|
||||
public:
|
||||
// result is indexed [nt][N_i][N_j].
|
||||
// use_blas=true replaces the scalar spatial accumulation with A2ASpatialSum.
|
||||
static void compute(
|
||||
Eigen::Tensor<ComplexD, 3> &result,
|
||||
const std::vector<FermionField> &left,
|
||||
const std::vector<FermionField> &right,
|
||||
const std::vector<FermionField> &loop1,
|
||||
const std::vector<FermionField> &loop2,
|
||||
const std::vector<Gamma::Algebra> &gamma1,
|
||||
const std::vector<Gamma::Algebra> &gamma2,
|
||||
int type,
|
||||
bool use_blas = false)
|
||||
{
|
||||
GridBase *grid = left[0].Grid();
|
||||
|
||||
const int orthogdim = 3;
|
||||
int rd = grid->_rdimensions[orthogdim];
|
||||
int ld = grid->_ldimensions[orthogdim];
|
||||
int Nd = grid->_ndimension;
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
int nt = result.dimension(0);
|
||||
int N_i = (int)left.size();
|
||||
int N_j = (int)right.size();
|
||||
|
||||
std::string tag = std::string(use_blas ? "[blas" : "[ref ") + " type=" + std::to_string(type) + "]";
|
||||
auto Tms = [](double us) { return us * 1e-3; };
|
||||
double t0;
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Loop propagator: sum_k outerProduct(loop1[k], loop2[k])
|
||||
// ------------------------------------------------------------------
|
||||
t0 = usecond();
|
||||
PropagatorField loop(grid);
|
||||
loop = Zero();
|
||||
for (unsigned int k = 0; k < loop1.size(); ++k)
|
||||
loop += outerProduct(loop1[k], loop2[k]);
|
||||
std::cout << GridLogMessage << tag << " loop_build: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Pack conjugated left vectors
|
||||
// ------------------------------------------------------------------
|
||||
t0 = usecond();
|
||||
std::vector<FermionField> leftv(N_i, grid);
|
||||
for (int i = 0; i < N_i; i++)
|
||||
leftv[i] = conjugate(left[i]);
|
||||
std::cout << GridLogMessage << tag << " pack_left: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Per-site loop contraction into PropagatorField tloop (type-dependent)
|
||||
// ------------------------------------------------------------------
|
||||
t0 = usecond();
|
||||
PropagatorField tloop(grid);
|
||||
tloop = Zero();
|
||||
{
|
||||
autoView(tloopv, tloop, CpuWrite);
|
||||
autoView(loopv, loop, CpuRead);
|
||||
if (type == 0) {
|
||||
thread_for(ss, grid->oSites(), {
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
tloopv[ss]()(s1,s2)(0,0) = loopv[ss]()(s1,s2)(0,0)
|
||||
+ loopv[ss]()(s1,s2)(1,1)
|
||||
+ loopv[ss]()(s1,s2)(2,2);
|
||||
});
|
||||
}
|
||||
if (type == 1) {
|
||||
thread_for(ss, grid->oSites(), {
|
||||
tloopv[ss] = Zero();
|
||||
for (int mu = 0; mu < (int)gamma1.size(); ++mu)
|
||||
tloopv[ss] = tloopv[ss] + Gamma(gamma1[mu]) * loopv[ss] * Gamma(gamma2[mu]);
|
||||
});
|
||||
}
|
||||
if (type == 2) {
|
||||
thread_for(ss, grid->oSites(), {
|
||||
tloopv[ss] = Zero();
|
||||
for (int mu = 0; mu < (int)gamma2.size(); ++mu) {
|
||||
SpinColourMatrix_v tmp = Gamma(gamma2[mu]) * loopv[ss];
|
||||
int s1 = mu / Ns;
|
||||
int s2 = mu % Ns;
|
||||
for (int c1 = 0; c1 < Nc; ++c1)
|
||||
for (int c2 = 0; c2 < Nc; ++c2)
|
||||
tloopv[ss]()(s1,s2)(c1,c2) = tmp()(0,0)(c1,c2) + tmp()(1,1)(c1,c2)
|
||||
+ tmp()(2,2)(c1,c2) + tmp()(3,3)(c1,c2);
|
||||
}
|
||||
});
|
||||
}
|
||||
if (type == 3) {
|
||||
thread_for(ss, grid->oSites(), {
|
||||
SpinMatrix_v spinLoop = Zero();
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
spinLoop()(s1,s2)() = loopv[ss]()(s1,s2)(0,0)
|
||||
+ loopv[ss]()(s1,s2)(1,1)
|
||||
+ loopv[ss]()(s1,s2)(2,2);
|
||||
tloopv[ss] = Zero();
|
||||
for (int mu = 0; mu < (int)gamma1.size(); ++mu) {
|
||||
SpinMatrix_v tmp2 = Gamma(gamma1[mu]) * spinLoop * Gamma(gamma2[mu]);
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
tloopv[ss]()(s1,s2)(0,0) = tloopv[ss]()(s1,s2)(0,0) + tmp2()(s1,s2)();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
std::cout << GridLogMessage << tag << " tloop: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
// Select addLoopRight kernel for this type
|
||||
std::function<void(SpinColourVector_v &,
|
||||
const SpinColourMatrix_v &,
|
||||
const SpinColourVector_v &,
|
||||
const std::vector<Gamma::Algebra> &,
|
||||
const std::vector<Gamma::Algebra> &)> addLoopRight;
|
||||
|
||||
if (type == 0) {
|
||||
addLoopRight = [](SpinColourVector_v &lR,
|
||||
const SpinColourMatrix_v &loopm,
|
||||
const SpinColourVector_v &rightv,
|
||||
const std::vector<Gamma::Algebra> &g1,
|
||||
const std::vector<Gamma::Algebra> &g2) {
|
||||
SpinMatrix_v spinLoop = Zero();
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
spinLoop()(s1,s2)() = loopm()(s1,s2)(0,0);
|
||||
for (int mu = 0; mu < (int)g1.size(); ++mu) {
|
||||
SpinMatrix_v GLoop = Gamma(g2[mu]) * spinLoop;
|
||||
auto trGLoop = GLoop()(0,0)() + GLoop()(1,1)() + GLoop()(2,2)() + GLoop()(3,3)();
|
||||
SpinColourVector_v Grightv = Gamma(g1[mu]) * rightv;
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c)
|
||||
lR()(s)(c) += Grightv()(s)(c) * trGLoop;
|
||||
}
|
||||
};
|
||||
}
|
||||
if (type == 1) {
|
||||
addLoopRight = [](SpinColourVector_v &lR,
|
||||
const SpinColourMatrix_v &loopm,
|
||||
const SpinColourVector_v &rightv,
|
||||
const std::vector<Gamma::Algebra> &g1,
|
||||
const std::vector<Gamma::Algebra> &g2) {
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c) {
|
||||
lR()(s)(c)
|
||||
+= loopm()(s,0)(c,0) * rightv()(0)(0)
|
||||
+ loopm()(s,0)(c,1) * rightv()(0)(1)
|
||||
+ loopm()(s,0)(c,2) * rightv()(0)(2)
|
||||
+ loopm()(s,1)(c,0) * rightv()(1)(0)
|
||||
+ loopm()(s,1)(c,1) * rightv()(1)(1)
|
||||
+ loopm()(s,1)(c,2) * rightv()(1)(2)
|
||||
+ loopm()(s,2)(c,0) * rightv()(2)(0)
|
||||
+ loopm()(s,2)(c,1) * rightv()(2)(1)
|
||||
+ loopm()(s,2)(c,2) * rightv()(2)(2)
|
||||
+ loopm()(s,3)(c,0) * rightv()(3)(0)
|
||||
+ loopm()(s,3)(c,1) * rightv()(3)(1)
|
||||
+ loopm()(s,3)(c,2) * rightv()(3)(2);
|
||||
}
|
||||
};
|
||||
}
|
||||
if (type == 2) {
|
||||
addLoopRight = [](SpinColourVector_v &lR,
|
||||
const SpinColourMatrix_v &loopm,
|
||||
const SpinColourVector_v &rightv,
|
||||
const std::vector<Gamma::Algebra> &g1,
|
||||
const std::vector<Gamma::Algebra> &g2) {
|
||||
for (int mu = 0; mu < (int)g1.size(); ++mu) {
|
||||
int s1 = mu / Ns;
|
||||
int s2 = mu % Ns;
|
||||
SpinColourVector_v Grightv = Gamma(g1[mu]) * rightv;
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c)
|
||||
lR()(s)(c) += loopm()(s1,s2)(c,0) * Grightv()(s)(0)
|
||||
+ loopm()(s1,s2)(c,1) * Grightv()(s)(1)
|
||||
+ loopm()(s1,s2)(c,2) * Grightv()(s)(2);
|
||||
}
|
||||
};
|
||||
}
|
||||
if (type == 3) {
|
||||
addLoopRight = [](SpinColourVector_v &lR,
|
||||
const SpinColourMatrix_v &loopm,
|
||||
const SpinColourVector_v &rightv,
|
||||
const std::vector<Gamma::Algebra> &g1,
|
||||
const std::vector<Gamma::Algebra> &g2) {
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c)
|
||||
lR()(s)(c) += loopm()(s,0)(0,0) * rightv()(0)(c)
|
||||
+ loopm()(s,1)(0,0) * rightv()(1)(c)
|
||||
+ loopm()(s,2)(0,0) * rightv()(2)(c)
|
||||
+ loopm()(s,3)(0,0) * rightv()(3)(c);
|
||||
};
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Pack loopRight[j] = type-kernel(tloop, right[j]) per site
|
||||
// ------------------------------------------------------------------
|
||||
t0 = usecond();
|
||||
std::vector<FermionField> loopRight(N_j, grid);
|
||||
{
|
||||
autoView(tlv, tloop, CpuRead);
|
||||
for (int j = 0; j < N_j; j++) {
|
||||
loopRight[j] = Zero();
|
||||
autoView(lRv, loopRight[j], CpuWrite);
|
||||
autoView(rv, right[j], CpuRead);
|
||||
thread_for(ss, grid->oSites(), {
|
||||
addLoopRight(lRv[ss], tlv[ss], rv[ss], gamma1, gamma2);
|
||||
});
|
||||
}
|
||||
}
|
||||
std::cout << GridLogMessage << tag << " pack_loopright: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
if (use_blas) {
|
||||
// ------------------------------------------------------------------
|
||||
// BLAS path: A2ASpatialSum (Allocate + PackLeft + PackRight + Sum)
|
||||
// ------------------------------------------------------------------
|
||||
A2ASpatialSum<SpinColourVector_v> spatial_sum;
|
||||
double t_blas_start = usecond();
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.Allocate(N_i, N_j, grid);
|
||||
std::cout << GridLogMessage << tag << " Allocate: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.PackLeft(leftv);
|
||||
std::cout << GridLogMessage << tag << " PackLeft: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.PackRight(loopRight);
|
||||
std::cout << GridLogMessage << tag << " PackRight: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.Sum(result);
|
||||
std::cout << GridLogMessage << tag << " Sum (GEMM+MPI): " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
std::cout << GridLogMessage << tag << " A2ASpatialSum: " << Tms(usecond()-t_blas_start) << " ms [TOTAL]\n";
|
||||
|
||||
} else {
|
||||
// ------------------------------------------------------------------
|
||||
// Reference path: SIMD spatial sum + scalar extraction
|
||||
// ------------------------------------------------------------------
|
||||
int MFrvol = rd * N_i * N_j;
|
||||
int MFlvol = ld * N_i * N_j;
|
||||
|
||||
Vector<Scalar_v> lvSum(MFrvol);
|
||||
thread_for(r, MFrvol, { lvSum[r] = Zero(); });
|
||||
|
||||
t0 = usecond();
|
||||
{
|
||||
int e1 = grid->_slice_nblock[orthogdim];
|
||||
int e2 = grid->_slice_block [orthogdim];
|
||||
int stride = grid->_slice_stride[orthogdim];
|
||||
using LView = decltype(leftv[0].View(CpuRead));
|
||||
using RView = decltype(loopRight[0].View(CpuRead));
|
||||
std::vector<LView> lv_views;
|
||||
std::vector<RView> lr_views;
|
||||
lv_views.reserve(N_i);
|
||||
lr_views.reserve(N_j);
|
||||
for (int i = 0; i < N_i; i++) lv_views.push_back(leftv[i].View(CpuRead));
|
||||
for (int j = 0; j < N_j; j++) lr_views.push_back(loopRight[j].View(CpuRead));
|
||||
thread_for(r, rd, {
|
||||
int so = r * grid->_ostride[orthogdim];
|
||||
int base = N_i * N_j * r;
|
||||
for (int n = 0; n < e1; n++)
|
||||
for (int b = 0; b < e2; b++) {
|
||||
int ss = so + n * stride + b;
|
||||
for (int ii = 0; ii < N_i; ii++)
|
||||
for (int jj = 0; jj < N_j; jj++) {
|
||||
int idx = jj + N_j * ii + base;
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c)
|
||||
lvSum[idx]()()() += lv_views[ii][ss]()(s)(c) * lr_views[jj][ss]()(s)(c);
|
||||
}
|
||||
}
|
||||
});
|
||||
for (auto &v : lv_views) v.ViewClose();
|
||||
for (auto &v : lr_views) v.ViewClose();
|
||||
}
|
||||
std::cout << GridLogMessage << tag << " spatial_sum: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
Vector<Scalar_s> lsSum(MFlvol);
|
||||
thread_for(r, MFlvol, { lsSum[r] = scalar_type(0.0); });
|
||||
|
||||
t0 = usecond();
|
||||
thread_for(rt, rd, {
|
||||
Coordinate icoor(Nd);
|
||||
ExtractBuffer<Scalar_s> extracted(Nsimd);
|
||||
for (int ii = 0; ii < N_i; ii++)
|
||||
for (int jj = 0; jj < N_j; jj++) {
|
||||
int ij_rdx = jj + N_j * (ii + N_i * rt);
|
||||
extract(lvSum[ij_rdx], extracted);
|
||||
for (int idx = 0; idx < Nsimd; idx++) {
|
||||
grid->iCoorFromIindex(icoor, idx);
|
||||
int ldx = rt + icoor[orthogdim] * rd;
|
||||
int ij_ldx = jj + N_j * (ii + N_i * ldx);
|
||||
lsSum[ij_ldx] = lsSum[ij_ldx] + extracted[idx];
|
||||
}
|
||||
}
|
||||
});
|
||||
std::cout << GridLogMessage << tag << " simd_extract: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
int pd = grid->_processors[orthogdim];
|
||||
int pc = grid->_processor_coor[orthogdim];
|
||||
|
||||
t0 = usecond();
|
||||
Vector<ComplexD> cache(nt * N_i * N_j, ComplexD(0.0));
|
||||
for (int lt = 0; lt < ld; lt++)
|
||||
for (int pt = 0; pt < pd; pt++) {
|
||||
int t = lt + pt * ld;
|
||||
for (int ii = 0; ii < N_i; ii++)
|
||||
for (int jj = 0; jj < N_j; jj++) {
|
||||
if (pt == pc) {
|
||||
int ij_ldx = jj + N_j * (ii + N_i * lt);
|
||||
cache[jj + N_j * (ii + N_i * t)] = lsSum[ij_ldx]()()();
|
||||
}
|
||||
}
|
||||
}
|
||||
grid->GlobalSumVector(cache.data(), nt * N_i * N_j);
|
||||
std::cout << GridLogMessage << tag << " globalsum: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
for (int t = 0; t < nt; t++)
|
||||
for (int ii = 0; ii < N_i; ii++)
|
||||
for (int jj = 0; jj < N_j; jj++)
|
||||
result(t, ii, jj) = cache[jj + N_j * (ii + N_i * t)];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// ================================================================
|
||||
// Free-function GPU kernels — accelerator_for, v(ss) reads,
|
||||
// coalescedWrite writes, vobj-level arithmetic throughout.
|
||||
// Gamma arrays passed as Vector<Gamma::Algebra> (UVM).
|
||||
// ================================================================
|
||||
|
||||
void A2ALoopPropagator(PropagatorField &loop,
|
||||
const std::vector<FermionField> &loop1,
|
||||
const std::vector<FermionField> &loop2)
|
||||
{
|
||||
int Nk = (int)loop1.size();
|
||||
uint64_t oSites = loop.Grid()->oSites();
|
||||
int Nsimd = SpinColourVector_v::Nsimd();
|
||||
|
||||
typedef decltype(loop1[0].View(AcceleratorRead)) View;
|
||||
std::vector<View> v1, v2;
|
||||
v1.reserve(Nk); v2.reserve(Nk);
|
||||
for (int k = 0; k < Nk; k++) {
|
||||
v1.push_back(loop1[k].View(AcceleratorRead));
|
||||
v2.push_back(loop2[k].View(AcceleratorRead));
|
||||
}
|
||||
|
||||
deviceVector<SpinColourVector_v *> l1p(Nk), l2p(Nk);
|
||||
for (int k = 0; k < Nk; k++) {
|
||||
acceleratorPut(l1p[k], &v1[k][0]);
|
||||
acceleratorPut(l2p[k], &v2[k][0]);
|
||||
}
|
||||
|
||||
autoView(loopv, loop, AcceleratorWrite);
|
||||
SpinColourVector_v **l1 = &l1p[0];
|
||||
SpinColourVector_v **l2 = &l2p[0];
|
||||
int lNk = Nk;
|
||||
accelerator_for(ss, oSites, Nsimd, {
|
||||
auto result = outerProduct(coalescedRead(l1[0][ss]), coalescedRead(l2[0][ss]));
|
||||
for (int k = 1; k < lNk; k++)
|
||||
result = result + outerProduct(coalescedRead(l1[k][ss]), coalescedRead(l2[k][ss]));
|
||||
coalescedWrite(loopv[ss], result);
|
||||
});
|
||||
}
|
||||
|
||||
void A2APackLeftConjugated(FermionField &out, const FermionField &in)
|
||||
{
|
||||
autoView(outv, out, AcceleratorWrite);
|
||||
autoView(inv, in, AcceleratorRead);
|
||||
uint64_t Osites = in.Grid()->oSites();
|
||||
int Nsimd = SpinColourVector_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
coalescedWrite(outv[ss], conjugate(inv(ss)));
|
||||
});
|
||||
}
|
||||
|
||||
// Type 0: colour-trace stored in (s1,s2)(0,0)
|
||||
void A2ALoopLeftContractionType0(PropagatorField &tloop, const PropagatorField &loop)
|
||||
{
|
||||
autoView(tloopv, tloop, AcceleratorWrite);
|
||||
autoView(loopv, loop, AcceleratorRead);
|
||||
uint64_t Osites = loop.Grid()->oSites();
|
||||
int Nsimd = SpinColourMatrix_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
auto l = loopv(ss);
|
||||
auto tmp = l; tmp = Zero();
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
tmp()(s1,s2)(0,0) = l()(s1,s2)(0,0) + l()(s1,s2)(1,1) + l()(s1,s2)(2,2);
|
||||
coalescedWrite(tloopv[ss], tmp);
|
||||
});
|
||||
}
|
||||
|
||||
// Type 1: tloop = sum_mu Gamma(g1[mu]) * loop * Gamma(g2[mu])
|
||||
void A2ALoopLeftContractionType1(PropagatorField &tloop, const PropagatorField &loop,
|
||||
const Vector<Gamma::Algebra> &gamma1,
|
||||
const Vector<Gamma::Algebra> &gamma2)
|
||||
{
|
||||
int ng = (int)gamma1.size();
|
||||
const Gamma::Algebra *g1 = gamma1.data();
|
||||
const Gamma::Algebra *g2 = gamma2.data();
|
||||
autoView(tloopv, tloop, AcceleratorWrite);
|
||||
autoView(loopv, loop, AcceleratorRead);
|
||||
uint64_t Osites = loop.Grid()->oSites();
|
||||
int Nsimd = SpinColourMatrix_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
auto l = loopv(ss);
|
||||
auto tmp = l; tmp = Zero();
|
||||
for (int mu = 0; mu < ng; ++mu)
|
||||
tmp = tmp + Gamma(g1[mu]) * l * Gamma(g2[mu]);
|
||||
coalescedWrite(tloopv[ss], tmp);
|
||||
});
|
||||
}
|
||||
|
||||
// Type 2: for mu=[0..ng), s1=mu/Ns, s2=mu%Ns;
|
||||
// tloop(s1,s2)(c1,c2) = Tr_spin( Gamma(g2[mu]) * loop )(c1,c2)
|
||||
void A2ALoopLeftContractionType2(PropagatorField &tloop, const PropagatorField &loop,
|
||||
const Vector<Gamma::Algebra> &gamma2)
|
||||
{
|
||||
int ng = (int)gamma2.size();
|
||||
const Gamma::Algebra *g2 = gamma2.data();
|
||||
autoView(tloopv, tloop, AcceleratorWrite);
|
||||
autoView(loopv, loop, AcceleratorRead);
|
||||
uint64_t Osites = loop.Grid()->oSites();
|
||||
int Nsimd = SpinColourMatrix_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
auto l = loopv(ss);
|
||||
auto tmp = l; tmp = Zero();
|
||||
for (int mu = 0; mu < ng; ++mu) {
|
||||
auto gtmp = Gamma(g2[mu]) * l;
|
||||
int s1 = mu / Ns;
|
||||
int s2 = mu % Ns;
|
||||
for (int c1 = 0; c1 < Nc; ++c1)
|
||||
for (int c2 = 0; c2 < Nc; ++c2)
|
||||
tmp()(s1,s2)(c1,c2) = gtmp()(0,0)(c1,c2) + gtmp()(1,1)(c1,c2)
|
||||
+ gtmp()(2,2)(c1,c2) + gtmp()(3,3)(c1,c2);
|
||||
}
|
||||
coalescedWrite(tloopv[ss], tmp);
|
||||
});
|
||||
}
|
||||
|
||||
// Type 3: colour-trace → spin matrix → sum_mu G1*spinLoop*G2 stored in (s1,s2)(0,0)
|
||||
void A2ALoopLeftContractionType3(PropagatorField &tloop, const PropagatorField &loop,
|
||||
const Vector<Gamma::Algebra> &gamma1,
|
||||
const Vector<Gamma::Algebra> &gamma2)
|
||||
{
|
||||
int ng = (int)gamma1.size();
|
||||
const Gamma::Algebra *g1 = gamma1.data();
|
||||
const Gamma::Algebra *g2 = gamma2.data();
|
||||
autoView(tloopv, tloop, AcceleratorWrite);
|
||||
autoView(loopv, loop, AcceleratorRead);
|
||||
uint64_t Osites = loop.Grid()->oSites();
|
||||
int Nsimd = SpinColourMatrix_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
typedef decltype(coalescedRead(loopv[0])) calcSCMatrix;
|
||||
typedef iSpinMatrix<typename calcSCMatrix::vector_type> calcSpinMatrix;
|
||||
auto l = loopv(ss);
|
||||
calcSpinMatrix spinLoop; spinLoop = Zero();
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
spinLoop()(s1,s2)() = l()(s1,s2)(0,0) + l()(s1,s2)(1,1) + l()(s1,s2)(2,2);
|
||||
auto tmp = l; tmp = Zero();
|
||||
for (int mu = 0; mu < ng; ++mu) {
|
||||
calcSpinMatrix tmp2 = Gamma(g1[mu]) * spinLoop * Gamma(g2[mu]);
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
tmp()(s1,s2)(0,0) = tmp()(s1,s2)(0,0) + tmp2()(s1,s2)();
|
||||
}
|
||||
coalescedWrite(tloopv[ss], tmp);
|
||||
});
|
||||
}
|
||||
|
||||
// Type 0: loopRight = sum_mu Tr(G2*spinLoop) * G1*right
|
||||
// where spinLoop(s1,s2) = tloop(s1,s2)(0,0)
|
||||
void A2ALoopRightContractionType0(FermionField &loopRight,
|
||||
const PropagatorField &tloop,
|
||||
const FermionField &right,
|
||||
const Vector<Gamma::Algebra> &gamma1,
|
||||
const Vector<Gamma::Algebra> &gamma2)
|
||||
{
|
||||
int ng = (int)gamma1.size();
|
||||
const Gamma::Algebra *g1 = gamma1.data();
|
||||
const Gamma::Algebra *g2 = gamma2.data();
|
||||
autoView(lRv, loopRight, AcceleratorWrite);
|
||||
autoView(tlv, tloop, AcceleratorRead);
|
||||
autoView(rv, right, AcceleratorRead);
|
||||
uint64_t Osites = right.Grid()->oSites();
|
||||
int Nsimd = SpinColourVector_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
typedef decltype(coalescedRead(rv[0])) calcSCVector;
|
||||
typedef decltype(coalescedRead(tlv[0])) calcSCMatrix;
|
||||
typedef iSpinMatrix<typename calcSCMatrix::vector_type> calcSpinMatrix;
|
||||
auto loopm = tlv(ss);
|
||||
auto rightv = rv(ss);
|
||||
calcSpinMatrix spinLoop; spinLoop = Zero();
|
||||
for (int s1 = 0; s1 < Ns; ++s1)
|
||||
for (int s2 = 0; s2 < Ns; ++s2)
|
||||
spinLoop()(s1,s2)() = loopm()(s1,s2)(0,0);
|
||||
calcSCVector lR; lR = Zero();
|
||||
for (int mu = 0; mu < ng; ++mu) {
|
||||
auto GLoop = Gamma(g2[mu]) * spinLoop;
|
||||
auto trGLoop = GLoop()(0,0)() + GLoop()(1,1)() + GLoop()(2,2)() + GLoop()(3,3)();
|
||||
auto Grightv = Gamma(g1[mu]) * rightv;
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c)
|
||||
lR()(s)(c) = lR()(s)(c) + Grightv()(s)(c) * trGLoop;
|
||||
}
|
||||
coalescedWrite(lRv[ss], lR);
|
||||
});
|
||||
}
|
||||
|
||||
// Type 1: loopRight = tloop * right (SpinColourMatrix * SpinColourVector)
|
||||
void A2ALoopRightContractionType1(FermionField &loopRight,
|
||||
const PropagatorField &tloop,
|
||||
const FermionField &right)
|
||||
{
|
||||
autoView(lRv, loopRight, AcceleratorWrite);
|
||||
autoView(tlv, tloop, AcceleratorRead);
|
||||
autoView(rv, right, AcceleratorRead);
|
||||
uint64_t Osites = right.Grid()->oSites();
|
||||
int Nsimd = SpinColourVector_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
coalescedWrite(lRv[ss], tlv(ss) * rv(ss));
|
||||
});
|
||||
}
|
||||
|
||||
// Type 2: loopRight(s)(c) = sum_{mu,c'} tloop(s1,s2)(c,c') * (G(g1[mu])*right)(s)(c')
|
||||
void A2ALoopRightContractionType2(FermionField &loopRight,
|
||||
const PropagatorField &tloop,
|
||||
const FermionField &right,
|
||||
const Vector<Gamma::Algebra> &gamma1)
|
||||
{
|
||||
int ng = (int)gamma1.size();
|
||||
const Gamma::Algebra *g1 = gamma1.data();
|
||||
autoView(lRv, loopRight, AcceleratorWrite);
|
||||
autoView(tlv, tloop, AcceleratorRead);
|
||||
autoView(rv, right, AcceleratorRead);
|
||||
uint64_t Osites = right.Grid()->oSites();
|
||||
int Nsimd = SpinColourVector_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
typedef decltype(coalescedRead(rv[0])) calcSCVector;
|
||||
auto loopm = tlv(ss);
|
||||
auto rightv = rv(ss);
|
||||
calcSCVector lR; lR = Zero();
|
||||
for (int mu = 0; mu < ng; ++mu) {
|
||||
int s1 = mu / Ns;
|
||||
int s2 = mu % Ns;
|
||||
auto Grightv = Gamma(g1[mu]) * rightv;
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c)
|
||||
lR()(s)(c) = lR()(s)(c)
|
||||
+ loopm()(s1,s2)(c,0) * Grightv()(s)(0)
|
||||
+ loopm()(s1,s2)(c,1) * Grightv()(s)(1)
|
||||
+ loopm()(s1,s2)(c,2) * Grightv()(s)(2);
|
||||
}
|
||||
coalescedWrite(lRv[ss], lR);
|
||||
});
|
||||
}
|
||||
|
||||
// Type 3: loopRight(s)(c) = sum_{s'} tloop(s,s')(0,0) * right(s')(c)
|
||||
void A2ALoopRightContractionType3(FermionField &loopRight,
|
||||
const PropagatorField &tloop,
|
||||
const FermionField &right)
|
||||
{
|
||||
autoView(lRv, loopRight, AcceleratorWrite);
|
||||
autoView(tlv, tloop, AcceleratorRead);
|
||||
autoView(rv, right, AcceleratorRead);
|
||||
uint64_t Osites = right.Grid()->oSites();
|
||||
int Nsimd = SpinColourVector_v::Nsimd();
|
||||
accelerator_for(ss, Osites, Nsimd, {
|
||||
typedef decltype(coalescedRead(rv[0])) calcSCVector;
|
||||
auto loopm = tlv(ss);
|
||||
auto rightv = rv(ss);
|
||||
calcSCVector lR; lR = Zero();
|
||||
for (int s = 0; s < Ns; ++s)
|
||||
for (int c = 0; c < Nc; ++c)
|
||||
lR()(s)(c) = loopm()(s,0)(0,0) * rightv()(0)(c)
|
||||
+ loopm()(s,1)(0,0) * rightv()(1)(c)
|
||||
+ loopm()(s,2)(0,0) * rightv()(2)(c)
|
||||
+ loopm()(s,3)(0,0) * rightv()(3)(c);
|
||||
coalescedWrite(lRv[ss], lR);
|
||||
});
|
||||
}
|
||||
|
||||
// ================================================================
|
||||
// GPU-offloaded extended meson field: accelerator_for contractions
|
||||
// + A2ASpatialSum GEMM spatial reduction.
|
||||
// ================================================================
|
||||
class A2AExtendedMesonFieldGPU
|
||||
{
|
||||
public:
|
||||
static void compute(
|
||||
Eigen::Tensor<ComplexD, 3> &result,
|
||||
const std::vector<FermionField> &left,
|
||||
const std::vector<FermionField> &right,
|
||||
const std::vector<FermionField> &loop1,
|
||||
const std::vector<FermionField> &loop2,
|
||||
const std::vector<Gamma::Algebra> &gamma1_in,
|
||||
const std::vector<Gamma::Algebra> &gamma2_in,
|
||||
int type)
|
||||
{
|
||||
GridBase *grid = left[0].Grid();
|
||||
int N_i = (int)left.size();
|
||||
int N_j = (int)right.size();
|
||||
|
||||
std::string tag = std::string("[gpu type=") + std::to_string(type) + "]";
|
||||
auto Tms = [](double us) { return us * 1e-3; };
|
||||
double t0;
|
||||
|
||||
Vector<Gamma::Algebra> gamma1(gamma1_in.begin(), gamma1_in.end());
|
||||
Vector<Gamma::Algebra> gamma2(gamma2_in.begin(), gamma2_in.end());
|
||||
|
||||
t0 = usecond();
|
||||
for (auto &f : loop1) { autoView(v, f, AcceleratorRead); }
|
||||
for (auto &f : loop2) { autoView(v, f, AcceleratorRead); }
|
||||
std::cout << GridLogMessage << tag << " view_open_loop: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
PropagatorField loop(grid);
|
||||
A2ALoopPropagator(loop, loop1, loop2);
|
||||
std::cout << GridLogMessage << tag << " loop_build: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
for (int i = 0; i < N_i; i++) { autoView(v, left[i], AcceleratorRead); }
|
||||
std::cout << GridLogMessage << tag << " view_open_left: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
std::vector<FermionField> leftv(N_i, grid);
|
||||
for (int i = 0; i < N_i; i++)
|
||||
A2APackLeftConjugated(leftv[i], left[i]);
|
||||
std::cout << GridLogMessage << tag << " pack_left: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
PropagatorField tloop(grid);
|
||||
tloop = Zero();
|
||||
switch (type) {
|
||||
case 0: A2ALoopLeftContractionType0(tloop, loop); break;
|
||||
case 1: A2ALoopLeftContractionType1(tloop, loop, gamma1, gamma2); break;
|
||||
case 2: A2ALoopLeftContractionType2(tloop, loop, gamma2); break;
|
||||
case 3: A2ALoopLeftContractionType3(tloop, loop, gamma1, gamma2); break;
|
||||
}
|
||||
std::cout << GridLogMessage << tag << " tloop: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
{ autoView(tlv, tloop, AcceleratorRead); }
|
||||
for (int j = 0; j < N_j; j++) { autoView(rv, right[j], AcceleratorRead); }
|
||||
std::cout << GridLogMessage << tag << " view_open_right: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
std::vector<FermionField> loopRight(N_j, grid);
|
||||
for (int j = 0; j < N_j; j++) {
|
||||
switch (type) {
|
||||
case 0: A2ALoopRightContractionType0(loopRight[j], tloop, right[j], gamma1, gamma2); break;
|
||||
case 1: A2ALoopRightContractionType1(loopRight[j], tloop, right[j]); break;
|
||||
case 2: A2ALoopRightContractionType2(loopRight[j], tloop, right[j], gamma1); break;
|
||||
case 3: A2ALoopRightContractionType3(loopRight[j], tloop, right[j]); break;
|
||||
}
|
||||
}
|
||||
std::cout << GridLogMessage << tag << " pack_loopright: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
A2ASpatialSum<SpinColourVector_v> spatial_sum;
|
||||
double t_blas = usecond();
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.Allocate(N_i, N_j, grid);
|
||||
std::cout << GridLogMessage << tag << " Allocate: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.PackLeft(leftv);
|
||||
std::cout << GridLogMessage << tag << " PackLeft: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.PackRight(loopRight);
|
||||
std::cout << GridLogMessage << tag << " PackRight: " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
t0 = usecond();
|
||||
spatial_sum.Sum(result);
|
||||
std::cout << GridLogMessage << tag << " Sum (GEMM+MPI): " << Tms(usecond()-t0) << " ms\n";
|
||||
|
||||
std::cout << GridLogMessage << tag << " A2ASpatialSum: " << Tms(usecond()-t_blas) << " ms [TOTAL]\n";
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
Grid_init(&argc, &argv);
|
||||
|
||||
Coordinate latt_size = GridDefaultLatt();
|
||||
Coordinate simd_layout = GridDefaultSimd(4, vComplexD::Nsimd());
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
GridCartesian grid(latt_size, simd_layout, mpi_layout);
|
||||
|
||||
int Nt = latt_size[Tp];
|
||||
int N_i = 8;
|
||||
int N_j = 8;
|
||||
int Nloop = 4;
|
||||
|
||||
if (GridCmdOptionExists(argv, argv+argc, "--Ni"))
|
||||
N_i = std::stoi(GridCmdOptionPayload(argv, argv+argc, "--Ni"));
|
||||
if (GridCmdOptionExists(argv, argv+argc, "--Nj"))
|
||||
N_j = std::stoi(GridCmdOptionPayload(argv, argv+argc, "--Nj"));
|
||||
|
||||
GridParallelRNG pRNG(&grid);
|
||||
pRNG.SeedFixedIntegers({1, 2, 3, 4});
|
||||
|
||||
std::vector<FermionField> left(N_i, &grid);
|
||||
std::vector<FermionField> right(N_j, &grid);
|
||||
std::vector<FermionField> loop1(Nloop, &grid);
|
||||
std::vector<FermionField> loop2(Nloop, &grid);
|
||||
|
||||
for (auto &f : left) random(pRNG, f);
|
||||
for (auto &f : right) random(pRNG, f);
|
||||
for (auto &f : loop1) random(pRNG, f);
|
||||
for (auto &f : loop2) random(pRNG, f);
|
||||
|
||||
std::vector<Gamma::Algebra> GammaMU = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
Eigen::Tensor<ComplexD, 3> result_ref(Nt, N_i, N_j);
|
||||
Eigen::Tensor<ComplexD, 3> result_blas(Nt, N_i, N_j);
|
||||
Eigen::Tensor<ComplexD, 3> result_gpu(Nt, N_i, N_j);
|
||||
double t_ref = 0, t_blas = 0, t_gpu = 0, start, stop;
|
||||
|
||||
for (int type = 0; type < 4; type++) {
|
||||
|
||||
result_ref.setZero();
|
||||
start = usecond();
|
||||
A2AExtendedMesonFieldRef::compute(result_ref, left, right, loop1, loop2,
|
||||
GammaMU, GammaMU, type, false);
|
||||
stop = usecond(); t_ref = stop - start;
|
||||
|
||||
result_blas.setZero();
|
||||
start = usecond();
|
||||
A2AExtendedMesonFieldRef::compute(result_blas, left, right, loop1, loop2,
|
||||
GammaMU, GammaMU, type, true);
|
||||
stop = usecond(); t_blas = stop - start;
|
||||
|
||||
result_gpu.setZero();
|
||||
start = usecond();
|
||||
A2AExtendedMesonFieldGPU::compute(result_gpu, left, right, loop1, loop2,
|
||||
GammaMU, GammaMU, type);
|
||||
stop = usecond(); t_gpu = stop - start;
|
||||
|
||||
double norm2_ref = 0.0, norm2_blas = 0.0, norm2_gpu = 0.0;
|
||||
double norm2_diff_blas = 0.0, norm2_diff_gpu = 0.0;
|
||||
for (int t = 0; t < Nt; t++)
|
||||
for (int ii = 0; ii < N_i; ii++)
|
||||
for (int jj = 0; jj < N_j; jj++) {
|
||||
norm2_ref += norm2(result_ref(t, ii, jj));
|
||||
norm2_blas += norm2(result_blas(t, ii, jj));
|
||||
norm2_gpu += norm2(result_gpu(t, ii, jj));
|
||||
ComplexD diff_blas = result_ref(t, ii, jj) - result_blas(t, ii, jj);
|
||||
ComplexD diff_gpu = result_ref(t, ii, jj) - result_gpu(t, ii, jj);
|
||||
norm2_diff_blas += norm2(diff_blas);
|
||||
norm2_diff_gpu += norm2(diff_gpu);
|
||||
}
|
||||
|
||||
double rel_blas = (norm2_ref > 0) ? std::sqrt(norm2_diff_blas / norm2_ref) : 0.0;
|
||||
double rel_gpu = (norm2_ref > 0) ? std::sqrt(norm2_diff_gpu / norm2_ref) : 0.0;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "type=" << type
|
||||
<< " norm2_ref=" << norm2_ref
|
||||
<< " norm2_blas=" << norm2_blas
|
||||
<< " norm2_gpu=" << norm2_gpu
|
||||
<< " rel_blas=" << rel_blas
|
||||
<< " rel_gpu=" << rel_gpu
|
||||
<< " t_ref=" << t_ref * 1e-6 << "s"
|
||||
<< " t_blas=" << t_blas * 1e-6 << "s"
|
||||
<< " t_gpu=" << t_gpu * 1e-6 << "s"
|
||||
<< std::endl;
|
||||
|
||||
GRID_ASSERT(rel_blas < 1e-10);
|
||||
GRID_ASSERT(rel_gpu < 1e-10);
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "All types passed A2ASpatialSum and GPU regression." << std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -36,8 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
//gridblasHandle_t GridBLAS::gridblasHandle;
|
||||
//int GridBLAS::gridblasInit;
|
||||
gridblasHandle_t GridBLAS::gridblasHandle;
|
||||
int GridBLAS::gridblasInit;
|
||||
|
||||
///////////////////////
|
||||
// Tells little dirac op to use MdagM as the .Op()
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Isolating the hipfft HIPFFT_PARSE_ERROR on ROCm 7 / hipFFT 1.0.20.
|
||||
*
|
||||
* Tests three orderings with an empty rocFFT cache to find which GPU
|
||||
* operation before plan creation triggers the failure:
|
||||
* A) hipMalloc only — hypothesis: passes (no async GPU work)
|
||||
* B) hipMalloc + hipMemset — hypothesis: fails (async GPU work in flight)
|
||||
* C) hipMalloc + hipMemset — hypothesis: passes (work completed before plan)
|
||||
* + hipDeviceSynchronize
|
||||
*
|
||||
* Compile:
|
||||
* hipcc -o Test_hipfft_bug_fail Test_hipfft_bug_fail.cc -lhipfft
|
||||
*
|
||||
* Run with empty cache:
|
||||
* rm -rf ~/.cache/
|
||||
* ./Test_hipfft_bug_fail
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <hipfft/hipfft.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
static const char *res(hipfftResult rv) {
|
||||
return rv == HIPFFT_SUCCESS ? "SUCCESS" : "PARSE_ERROR";
|
||||
}
|
||||
|
||||
static hipfftResult makePlan(int G, int howmany) {
|
||||
int n[] = {G};
|
||||
hipfftHandle p;
|
||||
size_t workSize = 0;
|
||||
hipfftCreate(&p);
|
||||
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
|
||||
nullptr, 1, G, nullptr, 1, G,
|
||||
HIPFFT_Z2Z, howmany, &workSize);
|
||||
hipfftDestroy(p);
|
||||
return rv;
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
hipDeviceProp_t prop;
|
||||
hipGetDeviceProperties(&prop, 0);
|
||||
printf("Device: %s\n", prop.name);
|
||||
#ifdef hipfftVersionMinor
|
||||
printf("hipFFT version: %d.%d.%d\n\n",
|
||||
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
|
||||
#endif
|
||||
|
||||
for (int G : {4, 8, 16, 32}) {
|
||||
int howmany = 512;
|
||||
long nelems = (long)G * howmany;
|
||||
hipfftDoubleComplex *buf = nullptr;
|
||||
hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
|
||||
|
||||
// Tests ordered so each runs before a prior success can populate the cache.
|
||||
|
||||
// B first: hipMalloc + hipMemset (async GPU work in flight)
|
||||
// If this fails, A (no hipMemset) will pass, confirming hipMemset is the trigger.
|
||||
hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
|
||||
hipfftResult rvB = makePlan(G, howmany);
|
||||
printf("G=%-4d B) hipMalloc + hipMemset : %s\n", G, res(rvB));
|
||||
|
||||
// C: hipMalloc + hipMemset + sync — does syncing before plan creation fix it?
|
||||
hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
|
||||
hipDeviceSynchronize();
|
||||
hipfftResult rvC = makePlan(G, howmany);
|
||||
printf("G=%-4d C) hipMalloc + hipMemset + sync: %s\n", G, res(rvC));
|
||||
|
||||
// A last: hipMalloc only, no async GPU work — should always pass
|
||||
hipfftResult rvA = makePlan(G, howmany);
|
||||
printf("G=%-4d A) hipMalloc only : %s\n\n", G, res(rvA));
|
||||
|
||||
hipFree(buf);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Minimal program demonstrating the workaround for the hipfft ROCm 7 bug.
|
||||
*
|
||||
* Workaround: create the hipfft plan BEFORE any hipMalloc. Plan creation
|
||||
* for G < 32 then succeeds even with an empty rocFFT cache.
|
||||
*
|
||||
* Compile:
|
||||
* hipcc -o Test_hipfft_bug_pass Test_hipfft_bug_pass.cc -lhipfft
|
||||
*
|
||||
* Run:
|
||||
* rm -rf ~/.cache/rocfft
|
||||
* ./Test_hipfft_bug_pass
|
||||
*
|
||||
* Expected: all G values succeed.
|
||||
* Compare with Test_hipfft_bug_fail.cc which uses the opposite ordering.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <hipfft/hipfft.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
int main(void) {
|
||||
hipDeviceProp_t prop;
|
||||
hipGetDeviceProperties(&prop, 0);
|
||||
printf("Device: %s\n", prop.name);
|
||||
#ifdef hipfftVersionMinor
|
||||
printf("hipFFT version: %d.%d.%d\n\n",
|
||||
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
|
||||
#endif
|
||||
|
||||
for (int G : {8, 16, 32}) {
|
||||
int howmany = 512;
|
||||
int n[] = {G};
|
||||
long nelems = (long)G * howmany;
|
||||
|
||||
// Plan created BEFORE hipMalloc — succeeds for all G
|
||||
hipfftHandle p;
|
||||
size_t workSize = 0;
|
||||
hipfftCreate(&p);
|
||||
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
|
||||
nullptr, 1, G, nullptr, 1, G,
|
||||
HIPFFT_Z2Z, howmany, &workSize);
|
||||
printf("G=%-4d plan-then-hipMalloc: %d (%s)\n",
|
||||
G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "HIPFFT_PARSE_ERROR");
|
||||
|
||||
if (rv == HIPFFT_SUCCESS) {
|
||||
hipfftDoubleComplex *buf = nullptr;
|
||||
hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
|
||||
hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
|
||||
rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD);
|
||||
hipDeviceSynchronize();
|
||||
printf("G=%-4d execFwd: %d (%s)\n",
|
||||
G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "FAILED");
|
||||
hipFree(buf);
|
||||
}
|
||||
|
||||
hipfftDestroy(p);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
* Minimal reproducer for hipfftMakePlanMany / hipfftPlanMany failures.
|
||||
*
|
||||
* Compile on Frontier (no Grid headers needed):
|
||||
* hipcc -o Test_hipfft_minimal Test_hipfft_minimal.cc -lhipfft
|
||||
*
|
||||
* Run:
|
||||
* ./Test_hipfft_minimal
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <hipfft/hipfft.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
static const char *hipfftResultString(hipfftResult r) {
|
||||
switch (r) {
|
||||
case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS";
|
||||
case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN";
|
||||
case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED";
|
||||
case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE";
|
||||
case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE";
|
||||
case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR";
|
||||
case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED";
|
||||
case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED";
|
||||
case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE";
|
||||
case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA";
|
||||
case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST";
|
||||
case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE";
|
||||
case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR";
|
||||
case HIPFFT_NO_WORKSPACE: return "HIPFFT_NO_WORKSPACE";
|
||||
case HIPFFT_NOT_IMPLEMENTED: return "HIPFFT_NOT_IMPLEMENTED";
|
||||
case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED";
|
||||
default: return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
// Plan creation + execution for (G, howmany).
|
||||
// Tests two orderings to isolate whether a prior hipMalloc poisons hipfft
|
||||
// plan creation for small G on ROCm 7:
|
||||
// A) plan BEFORE hipMalloc — hypothesis: succeeds
|
||||
// B) hipMalloc BEFORE plan — hypothesis: fails for G < 32
|
||||
static void tryPlanAndExec(int G, long howmany) {
|
||||
int n[] = {G};
|
||||
long nelems = (long)G * howmany;
|
||||
|
||||
printf("--- G=%-4d howmany=%-10ld total_elems=%-12ld ---\n",
|
||||
G, howmany, nelems);
|
||||
|
||||
// Allocate device buffer (hipfftDoubleComplex = 16 bytes each)
|
||||
hipfftDoubleComplex *dbuf = nullptr;
|
||||
hipError_t herr = hipMalloc(&dbuf, nelems * sizeof(hipfftDoubleComplex));
|
||||
if (herr != hipSuccess) {
|
||||
printf(" hipMalloc failed (%d) for %ld elems — skipping\n\n", (int)herr, nelems);
|
||||
return;
|
||||
}
|
||||
hipMemset(dbuf, 0, nelems * sizeof(hipfftDoubleComplex));
|
||||
|
||||
// 1. hipfftPlanMany (one-step, nullptr embed) — current Grid path
|
||||
{
|
||||
hipfftHandle p;
|
||||
hipfftResult rv = hipfftPlanMany(&p, 1, n,
|
||||
nullptr, 1, G,
|
||||
nullptr, 1, G,
|
||||
HIPFFT_Z2Z, (int)howmany);
|
||||
printf(" hipfftPlanMany create : %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
if (rv == HIPFFT_SUCCESS) {
|
||||
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
|
||||
hipDeviceSynchronize();
|
||||
printf(" hipfftPlanMany execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
hipfftDestroy(p);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. hipfftCreate + hipfftMakePlanMany (two-step) — also current Grid path
|
||||
{
|
||||
hipfftHandle p;
|
||||
size_t workSize = 0;
|
||||
hipfftResult rc = hipfftCreate(&p);
|
||||
if (rc == HIPFFT_SUCCESS) {
|
||||
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
|
||||
nullptr, 1, G,
|
||||
nullptr, 1, G,
|
||||
HIPFFT_Z2Z, (int)howmany, &workSize);
|
||||
printf(" hipfftMakePlanMany : %d (%s) workSize=%zu\n",
|
||||
(int)rv, hipfftResultString(rv), workSize);
|
||||
if (rv == HIPFFT_SUCCESS) {
|
||||
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
|
||||
hipDeviceSynchronize();
|
||||
printf(" hipfftMakePlanMany exec : %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
}
|
||||
hipfftDestroy(p);
|
||||
} else {
|
||||
printf(" hipfftCreate : %d (%s)\n", (int)rc, hipfftResultString(rc));
|
||||
}
|
||||
}
|
||||
|
||||
// 3. hipfftPlan1d (simplest API, batch = howmany)
|
||||
{
|
||||
hipfftHandle p;
|
||||
hipfftResult rv = hipfftPlan1d(&p, G, HIPFFT_Z2Z, (int)howmany);
|
||||
printf(" hipfftPlan1d create : %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
if (rv == HIPFFT_SUCCESS) {
|
||||
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
|
||||
hipDeviceSynchronize();
|
||||
printf(" hipfftPlan1d execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
hipfftDestroy(p);
|
||||
}
|
||||
}
|
||||
|
||||
hipFree(dbuf);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
// Print HIP device info
|
||||
int device = 0;
|
||||
hipGetDevice(&device);
|
||||
hipDeviceProp_t prop;
|
||||
hipGetDeviceProperties(&prop, device);
|
||||
printf("Device %d: %s warpSize=%d\n\n", device, prop.name, prop.warpSize);
|
||||
|
||||
#ifdef hipfftVersionMinor
|
||||
printf("hipFFT version: %d.%d.%d\n\n",
|
||||
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
|
||||
#endif
|
||||
|
||||
// Original sweep with small howmany (these passed first time)
|
||||
printf("=== Small howmany (original sweep) ===\n\n");
|
||||
for (int G : {4, 8, 12, 16, 24, 32, 48, 64})
|
||||
tryPlanAndExec(G, 512);
|
||||
|
||||
// Grid-realistic howmany values derived from actual lattice geometries.
|
||||
// howmany = Ncomp * product(ldimensions[d] for d != dim)
|
||||
// For LatticeComplexD: Ncomp=1.
|
||||
printf("=== Grid-realistic parameters ===\n\n");
|
||||
|
||||
// --grid 16.16.16.16 4D FFT (KNOWN TO FAIL in Grid)
|
||||
// Each dim: G=16, Nperp=16^3=4096
|
||||
tryPlanAndExec(16, 4096);
|
||||
|
||||
// --grid 32.32.32.32 4D FFT (KNOWN TO SUCCEED in Grid)
|
||||
// Each dim: G=32, Nperp=32^3=32768
|
||||
tryPlanAndExec(32, 32768);
|
||||
|
||||
// --grid 32.32.32.32 Ls=8 5D DWF FFT (KNOWN TO FAIL on dim 0 in Grid)
|
||||
// dim 0: G=8, Nperp=32^4=1048576
|
||||
tryPlanAndExec(8, 1048576);
|
||||
// dim 1-4: G=32, Nperp=8*32^3=262144
|
||||
tryPlanAndExec(32, 262144);
|
||||
|
||||
// Extra intermediate cases to bracket the failure
|
||||
tryPlanAndExec(16, 1024);
|
||||
tryPlanAndExec(16, 2048);
|
||||
tryPlanAndExec(16, 8192);
|
||||
tryPlanAndExec(8, 4096);
|
||||
tryPlanAndExec(8, 65536);
|
||||
tryPlanAndExec(8, 262144);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,168 @@
|
||||
/*
|
||||
* Reproducer for HIPFFT_PARSE_ERROR (error 12) from hipfftMakePlanMany on
|
||||
* ROCm 7 / hipFFT 1.0.20 (Frontier, MI210 login and MI250X compute nodes).
|
||||
*
|
||||
* Observed failure: G < 32 returns HIPFFT_PARSE_ERROR from all three plan
|
||||
* creation APIs (hipfftPlanMany, hipfftMakePlanMany, hipfftPlan1d) when a
|
||||
* device buffer is allocated and zeroed with hipMalloc+hipMemset before the
|
||||
* plan creation call. G >= 32 succeeds.
|
||||
*
|
||||
* Contrast with Test_hipfft_minimal.cc (plan-first ordering) which passes
|
||||
* for all G even with an empty rocFFT cache.
|
||||
*
|
||||
* Compile on Frontier (no Grid headers needed):
|
||||
* hipcc -o Test_hipfft_repro Test_hipfft_repro.cc -lhipfft
|
||||
*
|
||||
* Run with empty cache to reproduce the failure:
|
||||
* rm -rf ~/.cache/rocfft
|
||||
* ./Test_hipfft_repro
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <hipfft/hipfft.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
static const char *hipfftResultString(hipfftResult r) {
|
||||
switch (r) {
|
||||
case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS";
|
||||
case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN";
|
||||
case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED";
|
||||
case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE";
|
||||
case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE";
|
||||
case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR";
|
||||
case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED";
|
||||
case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED";
|
||||
case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE";
|
||||
case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA";
|
||||
case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST";
|
||||
case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE";
|
||||
case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR";
|
||||
case HIPFFT_NO_WORKSPACE: return "HIPFFT_NO_WORKSPACE";
|
||||
case HIPFFT_NOT_IMPLEMENTED: return "HIPFFT_NOT_IMPLEMENTED";
|
||||
case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED";
|
||||
default: return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
// Plan creation + execution for (G, howmany) using hipfftCreate+hipfftMakePlanMany.
|
||||
// This is the path Grid's FFT.h now uses.
|
||||
static void tryPlanAndExec(int G, long howmany) {
|
||||
int n[] = {G};
|
||||
long nelems = (long)G * howmany;
|
||||
|
||||
printf("--- G=%-4d howmany=%-10ld total_elems=%-12ld ---\n",
|
||||
G, howmany, nelems);
|
||||
|
||||
// Allocate device buffer (hipfftDoubleComplex = 16 bytes each)
|
||||
hipfftDoubleComplex *dbuf = nullptr;
|
||||
hipError_t herr = hipMalloc(&dbuf, nelems * sizeof(hipfftDoubleComplex));
|
||||
if (herr != hipSuccess) {
|
||||
printf(" hipMalloc failed (%d) for %ld elems — skipping\n\n", (int)herr, nelems);
|
||||
return;
|
||||
}
|
||||
hipMemset(dbuf, 0, nelems * sizeof(hipfftDoubleComplex));
|
||||
|
||||
// 1. hipfftPlanMany (one-step, nullptr embed) — current Grid path
|
||||
{
|
||||
hipfftHandle p;
|
||||
hipfftResult rv = hipfftPlanMany(&p, 1, n,
|
||||
nullptr, 1, G,
|
||||
nullptr, 1, G,
|
||||
HIPFFT_Z2Z, (int)howmany);
|
||||
printf(" hipfftPlanMany create : %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
if (rv == HIPFFT_SUCCESS) {
|
||||
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
|
||||
hipDeviceSynchronize();
|
||||
printf(" hipfftPlanMany execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
hipfftDestroy(p);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. hipfftCreate + hipfftMakePlanMany (two-step) — also current Grid path
|
||||
{
|
||||
hipfftHandle p;
|
||||
size_t workSize = 0;
|
||||
hipfftResult rc = hipfftCreate(&p);
|
||||
if (rc == HIPFFT_SUCCESS) {
|
||||
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
|
||||
nullptr, 1, G,
|
||||
nullptr, 1, G,
|
||||
HIPFFT_Z2Z, (int)howmany, &workSize);
|
||||
printf(" hipfftMakePlanMany : %d (%s) workSize=%zu\n",
|
||||
(int)rv, hipfftResultString(rv), workSize);
|
||||
if (rv == HIPFFT_SUCCESS) {
|
||||
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
|
||||
hipDeviceSynchronize();
|
||||
printf(" hipfftMakePlanMany exec : %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
}
|
||||
hipfftDestroy(p);
|
||||
} else {
|
||||
printf(" hipfftCreate : %d (%s)\n", (int)rc, hipfftResultString(rc));
|
||||
}
|
||||
}
|
||||
|
||||
// 3. hipfftPlan1d (simplest API, batch = howmany)
|
||||
{
|
||||
hipfftHandle p;
|
||||
hipfftResult rv = hipfftPlan1d(&p, G, HIPFFT_Z2Z, (int)howmany);
|
||||
printf(" hipfftPlan1d create : %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
if (rv == HIPFFT_SUCCESS) {
|
||||
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
|
||||
hipDeviceSynchronize();
|
||||
printf(" hipfftPlan1d execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
|
||||
hipfftDestroy(p);
|
||||
}
|
||||
}
|
||||
|
||||
hipFree(dbuf);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
// Print HIP device info
|
||||
int device = 0;
|
||||
hipGetDevice(&device);
|
||||
hipDeviceProp_t prop;
|
||||
hipGetDeviceProperties(&prop, device);
|
||||
printf("Device %d: %s warpSize=%d\n\n", device, prop.name, prop.warpSize);
|
||||
|
||||
#ifdef hipfftVersionMinor
|
||||
printf("hipFFT version: %d.%d.%d\n\n",
|
||||
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
|
||||
#endif
|
||||
|
||||
// Original sweep with small howmany (these passed first time)
|
||||
printf("=== Small howmany (original sweep) ===\n\n");
|
||||
for (int G : {4, 8, 12, 16, 24, 32, 48, 64})
|
||||
tryPlanAndExec(G, 512);
|
||||
|
||||
// Grid-realistic howmany values derived from actual lattice geometries.
|
||||
// howmany = Ncomp * product(ldimensions[d] for d != dim)
|
||||
// For LatticeComplexD: Ncomp=1.
|
||||
printf("=== Grid-realistic parameters ===\n\n");
|
||||
|
||||
// --grid 16.16.16.16 4D FFT (KNOWN TO FAIL in Grid)
|
||||
// Each dim: G=16, Nperp=16^3=4096
|
||||
tryPlanAndExec(16, 4096);
|
||||
|
||||
// --grid 32.32.32.32 4D FFT (KNOWN TO SUCCEED in Grid)
|
||||
// Each dim: G=32, Nperp=32^3=32768
|
||||
tryPlanAndExec(32, 32768);
|
||||
|
||||
// --grid 32.32.32.32 Ls=8 5D DWF FFT (KNOWN TO FAIL on dim 0 in Grid)
|
||||
// dim 0: G=8, Nperp=32^4=1048576
|
||||
tryPlanAndExec(8, 1048576);
|
||||
// dim 1-4: G=32, Nperp=8*32^3=262144
|
||||
tryPlanAndExec(32, 262144);
|
||||
|
||||
// Extra intermediate cases to bracket the failure
|
||||
tryPlanAndExec(16, 1024);
|
||||
tryPlanAndExec(16, 2048);
|
||||
tryPlanAndExec(16, 8192);
|
||||
tryPlanAndExec(8, 4096);
|
||||
tryPlanAndExec(8, 65536);
|
||||
tryPlanAndExec(8, 262144);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,362 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: tests/debug/Test_staggered_hdcg.cc
|
||||
|
||||
Authors: Thomas Blum, Peter Boyle
|
||||
|
||||
HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
|
||||
for naive staggered fermions, based on arXiv:2409.03904.
|
||||
|
||||
Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
|
||||
- NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
|
||||
- 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite)
|
||||
- vColourVector fine field type (staggered has colour but no spin)
|
||||
- NextToNearestStencilGeometry4D: 33-point coarse stencil
|
||||
|
||||
Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse
|
||||
shifts have L1-distance <= 2, giving 33 stencil points in 4D:
|
||||
1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
|
||||
NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
|
||||
To extend to ImprovedStaggeredFermion later, use B >= 6.
|
||||
|
||||
Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
|
||||
|
||||
Usage (after build):
|
||||
./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
|
||||
|
||||
*************************************************************************************/
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
// Non-converging CG used as a smoother (fixed number of iterations)
|
||||
template<class Field>
|
||||
class CGSmoother : public LinearFunction<Field>
|
||||
{
|
||||
public:
|
||||
typedef LinearOperatorBase<Field> FineOperator;
|
||||
FineOperator &_Op;
|
||||
int iters;
|
||||
CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
|
||||
void operator()(const Field &in, Field &out)
|
||||
{
|
||||
ConjugateGradient<Field> CG(0.0, iters, false);
|
||||
out = Zero();
|
||||
CG(_Op, in, out);
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
|
||||
Grid_init(&argc, &argv);
|
||||
fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Parameters — tune for production
|
||||
//--------------------------------------------------------------------
|
||||
const int nbasis = 24; // near-null space dimension
|
||||
const int cb = 0; // even checkerboard
|
||||
|
||||
RealD mass = 0.05;
|
||||
|
||||
// NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
|
||||
// c1 = coefficient of the hopping term (1.0 = standard normalisation).
|
||||
// u0 = tadpole factor (1.0 = no tadpole improvement).
|
||||
RealD c1 = 1.0;
|
||||
RealD u0 = 1.0;
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Grids
|
||||
// Fine: UGrid (4D full), UrbGrid (4D red-black)
|
||||
// Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block
|
||||
//
|
||||
// Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
|
||||
// NaiveStaggeredFermion works with any Block >= {2,2,2,2}
|
||||
//--------------------------------------------------------------------
|
||||
fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
|
||||
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
|
||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
|
||||
fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
|
||||
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
|
||||
Coordinate Block({4, 4, 4, 4});
|
||||
Coordinate clatt = GridDefaultLatt();
|
||||
for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
|
||||
Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
|
||||
Coordinate cmpi = GridDefaultMpi();
|
||||
fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
|
||||
clatt[0],clatt[1],clatt[2],clatt[3],
|
||||
csimd[0],csimd[1],csimd[2],csimd[3],
|
||||
cmpi[0],cmpi[1],cmpi[2],cmpi[3],
|
||||
(int)vComplex::Nsimd()); fflush(stderr);
|
||||
|
||||
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
|
||||
fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// RNG + gauge field
|
||||
//--------------------------------------------------------------------
|
||||
fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4});
|
||||
fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
|
||||
GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
|
||||
fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
|
||||
SU<Nc>::HotConfiguration(RNG4, Umu);
|
||||
fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
|
||||
NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
|
||||
fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
|
||||
SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
|
||||
fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Subspace: inverse-iteration near-null vectors
|
||||
//
|
||||
// CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
|
||||
// converging naturally to the low modes of HermOp without needing spectral
|
||||
// bound tuning. Switch to CreateSubspaceChebyshevNew once the spectrum is
|
||||
// well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
|
||||
//--------------------------------------------------------------------
|
||||
typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
|
||||
Subspace Aggregates(Coarse4d, UrbGrid, cb);
|
||||
|
||||
Aggregates.CreateSubspace(RNGrb, HermOp);
|
||||
Aggregates.Orthogonalise();
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Coarse geometry: NextToNearestStencilGeometry4D
|
||||
// hops=2 -> 33 stencil points in 4D
|
||||
//--------------------------------------------------------------------
|
||||
NextToNearestStencilGeometry4D geom(Coarse4d);
|
||||
|
||||
std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Single-RHS coarse operator (used for correctness check below)
|
||||
//--------------------------------------------------------------------
|
||||
typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
|
||||
typedef LittleDiracOp::CoarseVector CoarseVector;
|
||||
|
||||
LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
|
||||
LDO.CoarsenOperator(HermOp, Aggregates);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Correctness check: P M_fine P^T c ≈ M_coarse c
|
||||
//
|
||||
// Promote a random coarse vector into the fine subspace, apply the
|
||||
// fine operator, project back, and compare with the coarse operator
|
||||
// applied directly. Error should be at the level of subspace
|
||||
// approximation quality (smaller = better basis vectors).
|
||||
//--------------------------------------------------------------------
|
||||
{
|
||||
GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
|
||||
CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
|
||||
random(RNGc, c_src);
|
||||
|
||||
LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
|
||||
Aggregates.PromoteFromSubspace(c_src, f_v);
|
||||
HermOp.Op(f_v, f_Mv);
|
||||
Aggregates.ProjectToSubspace(c_proj, f_Mv);
|
||||
|
||||
LDO.M(c_src, c_ldop);
|
||||
|
||||
c_proj -= c_ldop;
|
||||
RealD err = norm2(c_proj) / norm2(c_ldop);
|
||||
std::cout << GridLogMessage
|
||||
<< "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Multi-RHS coarse grid
|
||||
//
|
||||
// The extra leading dimension holds nrhs right-hand sides packed into
|
||||
// SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
|
||||
//--------------------------------------------------------------------
|
||||
const int nrhs = vComplex::Nsimd() * 2;
|
||||
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]});
|
||||
Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
|
||||
Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
|
||||
|
||||
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
|
||||
|
||||
typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
|
||||
MultiCoarseOp mrhs(geom, CoarseMrhs);
|
||||
mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Coarse-grid Lanczos for deflation
|
||||
//--------------------------------------------------------------------
|
||||
typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
|
||||
MrhsHermOp MrhsCoarseOp(mrhs);
|
||||
|
||||
// Estimate spectral bounds for Lanczos Chebyshev filter
|
||||
CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
|
||||
PowerMethod<CoarseVector> cPM;
|
||||
RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
|
||||
// Chebyshev filter window [lo, hi]:
|
||||
// lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
|
||||
// coarse eigenvalues so that only the target modes receive cosh amplification.
|
||||
//
|
||||
// From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
|
||||
// Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746] ~= mass^2
|
||||
// Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
|
||||
// Group 2 (second group): lambda in [0.165, 0.179]
|
||||
//
|
||||
// lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
|
||||
// factor 8x below lambda_24=0.165).
|
||||
// hi = lambda_max_coarse * 1.1 ~= 2.121
|
||||
// y(lambda_0=0.002647) ~ -1.016 -> T_70 ~ 1.7e5 (cosh(70*0.182))
|
||||
// y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
|
||||
// Relative spread across near-null cluster: ~4.3%
|
||||
// y(lambda_24=0.165) ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
|
||||
//
|
||||
// order=71 (degree 70) is needed to give ~4% relative spread across the
|
||||
// near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
|
||||
// gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
|
||||
// Absolute amplification ~1e5; what matters for IRL convergence is the
|
||||
// relative spread, not the absolute value.
|
||||
// lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
|
||||
// lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
|
||||
RealD lambda_lo = 0.02;
|
||||
|
||||
std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
|
||||
<< " hi=" << lambda_max*1.1 << " order=71" << std::endl;
|
||||
|
||||
Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
|
||||
|
||||
// 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
|
||||
// in the first Lanczos restart. The remaining modes (~0.165) are a
|
||||
// second spectral group that needs more Krylov vectors; handle them
|
||||
// separately once the basic HDCG solve is validated.
|
||||
int Nk = 24;
|
||||
int Nm = 48;
|
||||
int Nstop = Nk;
|
||||
|
||||
GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
|
||||
|
||||
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
|
||||
IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
|
||||
Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
|
||||
|
||||
int Nconv;
|
||||
std::vector<RealD> eval(Nm);
|
||||
std::vector<CoarseVector> evec(Nm, Coarse4d); // evec on f_grid (single-RHS coarse)
|
||||
std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
|
||||
for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
|
||||
IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// HDCG solver assembly
|
||||
//--------------------------------------------------------------------
|
||||
MultiRHSDeflation<CoarseVector> MrhsGuesser;
|
||||
MrhsGuesser.ImportEigenBasis(evec, eval);
|
||||
|
||||
// MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
|
||||
MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
|
||||
MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
|
||||
MrhsProjector.ImportBasis(Aggregates.subspace);
|
||||
|
||||
ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
|
||||
DoNothingGuesser<CoarseVector> DoNothing;
|
||||
HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
|
||||
|
||||
// Spectral radius of the fine operator, needed for the smoother shift.
|
||||
// Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
|
||||
LatticeStaggeredFermionD fine_pm_src(UrbGrid);
|
||||
random(RNGrb, fine_pm_src);
|
||||
PowerMethod<LatticeStaggeredFermionD> finePM;
|
||||
RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
|
||||
|
||||
// Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100.
|
||||
//
|
||||
// The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
|
||||
// interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial
|
||||
// focuses entirely on the HIGH-frequency part of the spectrum and leaves
|
||||
// near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
|
||||
//
|
||||
// This is the right target because the coarse-grid correction always introduces
|
||||
// high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
|
||||
// are piecewise constant across coarse cells and therefore have sharp edges at
|
||||
// cell boundaries (like lego-block edges). Smoothness is measured by the
|
||||
// covariant Dirac derivative, so promoting the coarse solution back to the
|
||||
// fine grid inevitably excites high-frequency components — just as a step
|
||||
// function always carries high-frequency Fourier content. The smoother must
|
||||
// repair exactly these high modes.
|
||||
//
|
||||
// The smoother and the coarse-grid correction are applied alternately: together
|
||||
// they both lift the low eigenvalues and pull down the upper eigenvalues of the
|
||||
// composite preconditioned operator, reducing the condition number seen by the
|
||||
// outer HDCG iterations.
|
||||
//
|
||||
// DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
|
||||
// the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
|
||||
RealD smootherShift = fine_lambda_max / 200.0;
|
||||
std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
|
||||
<< fine_lambda_max << "/200 = " << smootherShift << std::endl;
|
||||
ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
|
||||
CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
|
||||
|
||||
TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
|
||||
HDCG(1.0e-8, 500,
|
||||
HermOp,
|
||||
smoother,
|
||||
HPDSolve, // M1 (coarse correction)
|
||||
HPDSolve, // Vstart (initial guess projection)
|
||||
MrhsProjector,
|
||||
MrhsGuesser,
|
||||
CoarseMrhs);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Solve: nrhs right-hand sides simultaneously
|
||||
//--------------------------------------------------------------------
|
||||
std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
|
||||
std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
|
||||
|
||||
GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
|
||||
for (int r = 0; r < nrhs; r++) {
|
||||
random(RNGrb2, src[r]);
|
||||
sol[r] = Zero();
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Baseline: standard single-RHS CG on HermOp (no preconditioning)
|
||||
// Run before HDCG to establish the unpreconditioned iteration count
|
||||
// and wall-clock time for direct comparison.
|
||||
//--------------------------------------------------------------------
|
||||
{
|
||||
ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 50000, false);
|
||||
std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
|
||||
for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
|
||||
|
||||
RealD t0 = usecond();
|
||||
int total_iters = 0;
|
||||
for (int r = 0; r < nrhs; r++) {
|
||||
std::cout << GridLogMessage << "====== CG baseline RHS " << r
|
||||
<< " ======" << std::endl;
|
||||
CG(HermOp, src[r], cg_sol[r]);
|
||||
total_iters += CG.IterationsToComplete;
|
||||
}
|
||||
RealD t1 = usecond();
|
||||
std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
|
||||
<< total_iters << " total iterations, "
|
||||
<< (t1 - t0) / 1.0e6 << " s total, "
|
||||
<< (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// HDCG solve
|
||||
//--------------------------------------------------------------------
|
||||
HDCG(src, sol);
|
||||
|
||||
Grid_finalize();
|
||||
return 0;
|
||||
}
|
||||
@@ -1,373 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: tests/debug/Test_staggered_hdcg.cc
|
||||
|
||||
Authors: Thomas Blum, Peter Boyle
|
||||
|
||||
HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
|
||||
for naive staggered fermions, based on arXiv:2409.03904.
|
||||
|
||||
Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
|
||||
- NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
|
||||
- 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite)
|
||||
- vColourVector fine field type (staggered has colour but no spin)
|
||||
- NextToNearestStencilGeometry4D: 33-point coarse stencil
|
||||
|
||||
Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse
|
||||
shifts have L1-distance <= 2, giving 33 stencil points in 4D:
|
||||
1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
|
||||
NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
|
||||
To extend to ImprovedStaggeredFermion later, use B >= 6.
|
||||
|
||||
Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
|
||||
|
||||
Usage (after build):
|
||||
./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
|
||||
|
||||
*************************************************************************************/
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
// Non-converging CG used as a smoother (fixed number of iterations)
|
||||
template<class Field>
|
||||
class CGSmoother : public LinearFunction<Field>
|
||||
{
|
||||
public:
|
||||
typedef LinearOperatorBase<Field> FineOperator;
|
||||
FineOperator &_Op;
|
||||
int iters;
|
||||
CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
|
||||
void operator()(const Field &in, Field &out)
|
||||
{
|
||||
ConjugateGradient<Field> CG(0.0, iters, false);
|
||||
out = Zero();
|
||||
CG(_Op, in, out);
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
|
||||
Grid_init(&argc, &argv);
|
||||
fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Parameters — tune for production
|
||||
//--------------------------------------------------------------------
|
||||
const int nbasis = 24; // near-null space dimension
|
||||
const int cb = 0; // even checkerboard
|
||||
|
||||
RealD mass = 0.00184;
|
||||
|
||||
// NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
|
||||
// c1 = coefficient of the hopping term (1.0 = standard normalisation).
|
||||
// u0 = tadpole factor (1.0 = no tadpole improvement).
|
||||
RealD c1 = 1.0;
|
||||
RealD u0 = 1.0;
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Grids
|
||||
// Fine: UGrid (4D full), UrbGrid (4D red-black)
|
||||
// Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block
|
||||
//
|
||||
// Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
|
||||
// NaiveStaggeredFermion works with any Block >= {2,2,2,2}
|
||||
//--------------------------------------------------------------------
|
||||
fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
|
||||
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
|
||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
|
||||
fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
|
||||
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
|
||||
Coordinate Block({4, 4, 4, 4});
|
||||
Coordinate clatt = GridDefaultLatt();
|
||||
for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
|
||||
Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
|
||||
Coordinate cmpi = GridDefaultMpi();
|
||||
fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
|
||||
clatt[0],clatt[1],clatt[2],clatt[3],
|
||||
csimd[0],csimd[1],csimd[2],csimd[3],
|
||||
cmpi[0],cmpi[1],cmpi[2],cmpi[3],
|
||||
(int)vComplex::Nsimd()); fflush(stderr);
|
||||
|
||||
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
|
||||
fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// RNG + gauge field
|
||||
//--------------------------------------------------------------------
|
||||
fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4});
|
||||
fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
|
||||
GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
|
||||
fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
int HotStart = 0;
|
||||
if ( HotStart ) {
|
||||
fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
|
||||
SU<Nc>::HotConfiguration(RNG4, Umu);
|
||||
} else {
|
||||
FieldMetaData header;
|
||||
std::string file("./configuration.ildg");
|
||||
IldgReader IR;
|
||||
IR.open(file);
|
||||
IR.readConfiguration(Umu,header);
|
||||
IR.close();
|
||||
}
|
||||
|
||||
fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
|
||||
NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
|
||||
fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
|
||||
SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
|
||||
fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Subspace: inverse-iteration near-null vectors
|
||||
//
|
||||
// CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
|
||||
// converging naturally to the low modes of HermOp without needing spectral
|
||||
// bound tuning. Switch to CreateSubspaceChebyshevNew once the spectrum is
|
||||
// well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
|
||||
//--------------------------------------------------------------------
|
||||
typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
|
||||
Subspace Aggregates(Coarse4d, UrbGrid, cb);
|
||||
|
||||
Aggregates.CreateSubspace(RNGrb, HermOp);
|
||||
Aggregates.Orthogonalise();
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Coarse geometry: NextToNearestStencilGeometry4D
|
||||
// hops=2 -> 33 stencil points in 4D
|
||||
//--------------------------------------------------------------------
|
||||
NextToNearestStencilGeometry4D geom(Coarse4d);
|
||||
|
||||
std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Single-RHS coarse operator (used for correctness check below)
|
||||
//--------------------------------------------------------------------
|
||||
typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
|
||||
typedef LittleDiracOp::CoarseVector CoarseVector;
|
||||
|
||||
LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
|
||||
LDO.CoarsenOperator(HermOp, Aggregates);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Correctness check: P M_fine P^T c ≈ M_coarse c
|
||||
//
|
||||
// Promote a random coarse vector into the fine subspace, apply the
|
||||
// fine operator, project back, and compare with the coarse operator
|
||||
// applied directly. Error should be at the level of subspace
|
||||
// approximation quality (smaller = better basis vectors).
|
||||
//--------------------------------------------------------------------
|
||||
{
|
||||
GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
|
||||
CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
|
||||
random(RNGc, c_src);
|
||||
|
||||
LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
|
||||
Aggregates.PromoteFromSubspace(c_src, f_v);
|
||||
HermOp.Op(f_v, f_Mv);
|
||||
Aggregates.ProjectToSubspace(c_proj, f_Mv);
|
||||
|
||||
LDO.M(c_src, c_ldop);
|
||||
|
||||
c_proj -= c_ldop;
|
||||
RealD err = norm2(c_proj) / norm2(c_ldop);
|
||||
std::cout << GridLogMessage
|
||||
<< "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Multi-RHS coarse grid
|
||||
//
|
||||
// The extra leading dimension holds nrhs right-hand sides packed into
|
||||
// SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
|
||||
//--------------------------------------------------------------------
|
||||
const int nrhs = vComplex::Nsimd() * 2;
|
||||
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]});
|
||||
Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
|
||||
Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
|
||||
|
||||
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
|
||||
|
||||
typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
|
||||
MultiCoarseOp mrhs(geom, CoarseMrhs);
|
||||
mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Coarse-grid Lanczos for deflation
|
||||
//--------------------------------------------------------------------
|
||||
typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
|
||||
MrhsHermOp MrhsCoarseOp(mrhs);
|
||||
|
||||
// Estimate spectral bounds for Lanczos Chebyshev filter
|
||||
CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
|
||||
PowerMethod<CoarseVector> cPM;
|
||||
RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
|
||||
// Chebyshev filter window [lo, hi]:
|
||||
// lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
|
||||
// coarse eigenvalues so that only the target modes receive cosh amplification.
|
||||
//
|
||||
// From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
|
||||
// Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746] ~= mass^2
|
||||
// Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
|
||||
// Group 2 (second group): lambda in [0.165, 0.179]
|
||||
//
|
||||
// lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
|
||||
// factor 8x below lambda_24=0.165).
|
||||
// hi = lambda_max_coarse * 1.1 ~= 2.121
|
||||
// y(lambda_0=0.002647) ~ -1.016 -> T_70 ~ 1.7e5 (cosh(70*0.182))
|
||||
// y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
|
||||
// Relative spread across near-null cluster: ~4.3%
|
||||
// y(lambda_24=0.165) ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
|
||||
//
|
||||
// order=71 (degree 70) is needed to give ~4% relative spread across the
|
||||
// near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
|
||||
// gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
|
||||
// Absolute amplification ~1e5; what matters for IRL convergence is the
|
||||
// relative spread, not the absolute value.
|
||||
// lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
|
||||
// lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
|
||||
RealD lambda_lo = 0.02;
|
||||
|
||||
std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
|
||||
<< " hi=" << lambda_max*1.1 << " order=71" << std::endl;
|
||||
|
||||
Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
|
||||
|
||||
// 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
|
||||
// in the first Lanczos restart. The remaining modes (~0.165) are a
|
||||
// second spectral group that needs more Krylov vectors; handle them
|
||||
// separately once the basic HDCG solve is validated.
|
||||
int Nk = 24;
|
||||
int Nm = 48;
|
||||
int Nstop = Nk;
|
||||
|
||||
GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
|
||||
|
||||
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
|
||||
IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
|
||||
Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
|
||||
|
||||
int Nconv;
|
||||
std::vector<RealD> eval(Nm);
|
||||
std::vector<CoarseVector> evec(Nm, Coarse4d); // evec on f_grid (single-RHS coarse)
|
||||
std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
|
||||
for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
|
||||
IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// HDCG solver assembly
|
||||
//--------------------------------------------------------------------
|
||||
MultiRHSDeflation<CoarseVector> MrhsGuesser;
|
||||
MrhsGuesser.ImportEigenBasis(evec, eval);
|
||||
|
||||
// MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
|
||||
MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
|
||||
MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
|
||||
MrhsProjector.ImportBasis(Aggregates.subspace);
|
||||
|
||||
ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
|
||||
DoNothingGuesser<CoarseVector> DoNothing;
|
||||
HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
|
||||
|
||||
// Spectral radius of the fine operator, needed for the smoother shift.
|
||||
// Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
|
||||
LatticeStaggeredFermionD fine_pm_src(UrbGrid);
|
||||
random(RNGrb, fine_pm_src);
|
||||
PowerMethod<LatticeStaggeredFermionD> finePM;
|
||||
RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
|
||||
|
||||
// Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100.
|
||||
//
|
||||
// The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
|
||||
// interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial
|
||||
// focuses entirely on the HIGH-frequency part of the spectrum and leaves
|
||||
// near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
|
||||
//
|
||||
// This is the right target because the coarse-grid correction always introduces
|
||||
// high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
|
||||
// are piecewise constant across coarse cells and therefore have sharp edges at
|
||||
// cell boundaries (like lego-block edges). Smoothness is measured by the
|
||||
// covariant Dirac derivative, so promoting the coarse solution back to the
|
||||
// fine grid inevitably excites high-frequency components — just as a step
|
||||
// function always carries high-frequency Fourier content. The smoother must
|
||||
// repair exactly these high modes.
|
||||
//
|
||||
// The smoother and the coarse-grid correction are applied alternately: together
|
||||
// they both lift the low eigenvalues and pull down the upper eigenvalues of the
|
||||
// composite preconditioned operator, reducing the condition number seen by the
|
||||
// outer HDCG iterations.
|
||||
//
|
||||
// DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
|
||||
// the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
|
||||
RealD smootherShift = fine_lambda_max / 200.0;
|
||||
std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
|
||||
<< fine_lambda_max << "/200 = " << smootherShift << std::endl;
|
||||
ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
|
||||
CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
|
||||
|
||||
TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
|
||||
HDCG(1.0e-8, 500,
|
||||
HermOp,
|
||||
smoother,
|
||||
HPDSolve, // M1 (coarse correction)
|
||||
HPDSolve, // Vstart (initial guess projection)
|
||||
MrhsProjector,
|
||||
MrhsGuesser,
|
||||
CoarseMrhs);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Solve: nrhs right-hand sides simultaneously
|
||||
//--------------------------------------------------------------------
|
||||
std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
|
||||
std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
|
||||
|
||||
GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
|
||||
for (int r = 0; r < nrhs; r++) {
|
||||
random(RNGrb2, src[r]);
|
||||
sol[r] = Zero();
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Baseline: standard single-RHS CG on HermOp (no preconditioning)
|
||||
// Run before HDCG to establish the unpreconditioned iteration count
|
||||
// and wall-clock time for direct comparison.
|
||||
//--------------------------------------------------------------------
|
||||
{
|
||||
ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 100000, false);
|
||||
std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
|
||||
for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
|
||||
|
||||
RealD t0 = usecond();
|
||||
int total_iters = 0;
|
||||
for (int r = 0; r < nrhs; r++) {
|
||||
std::cout << GridLogMessage << "====== CG baseline RHS " << r
|
||||
<< " ======" << std::endl;
|
||||
CG(HermOp, src[r], cg_sol[r]);
|
||||
total_iters += CG.IterationsToComplete;
|
||||
}
|
||||
RealD t1 = usecond();
|
||||
std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
|
||||
<< total_iters << " total iterations, "
|
||||
<< (t1 - t0) / 1.0e6 << " s total, "
|
||||
<< (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// HDCG solve
|
||||
//--------------------------------------------------------------------
|
||||
HDCG(src, sol);
|
||||
|
||||
Grid_finalize();
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user