1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-06-06 04:04:36 +01:00

Compare commits

..

59 Commits

Author SHA1 Message Date
Peter Boyle 8540b2a85d Test_extended_meson_field: add view_open timers to measure MemoryManager H2D transfers
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 14:04:38 -04:00
Peter Boyle dbd3a0e612 A2ALoopPropagator: fuse outer product sum into single accelerator_for kernel
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 10:39:37 -04:00
Peter Boyle 5b58d1da62 Test_extended_meson_field: add --Ni and --Nj command line options
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 22:46:11 -04:00
Peter Boyle 377db1bc08 Tensor_inner: move scalar innerProductD overloads before norm2 for ADL visibility
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 22:24:52 -04:00
Peter Boyle 699564997e Test_extended_meson_field: use decltype(coalescedRead) for arch-portable kernel types
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 22:18:43 -04:00
Peter Boyle f2750fae09 Test_extended_meson_field: use Grid norm2 instead of std::norm for HIP compatibility
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 22:04:10 -04:00
Peter Boyle ed12fa09c5 Not sure how this old lattice slice sum core
fix didn't propagate
2026-05-27 21:54:24 -04:00
Peter Boyle b914403bbe Better setup Frontier 2026-05-27 21:31:54 -04:00
Peter Boyle c1566fb9a2 Merge branch 'develop' into feature/Kpipi-masaaki-offload 2026-05-27 21:03:16 -04:00
Peter Boyle 905da6f083 Merge branch 'feature/reduction-reorganisation' into develop 2026-05-27 21:01:30 -04:00
Peter Boyle cb199c127c Merge branch 'develop' into feature/Kpipi-masaaki-offload 2026-05-27 20:59:30 -04:00
Peter Boyle 1a932ea33b Merge 2026-05-27 20:45:00 -04:00
Peter Boyle 5822a6599c skills: add GPU/A2A reference skill documents
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 11:12:47 -04:00
Peter Boyle 0eeb334fe0 systems/mac-arm: add MPI configure command and spack sourceme
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 11:12:42 -04:00
Peter Boyle d8d16407e9 A2ASpatialSum: extended meson field kernel and test
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 11:12:29 -04:00
Peter Boyle 12e3499b6d Updated rocm 7 compile for ORNL 2026-05-21 12:28:42 -04:00
Peter Boyle 9576011011 Changed setup for ROCM 7, nasty LD_LIBRARY_PATH issues were committing
evils
2026-05-21 12:28:04 -04:00
Peter Boyle 155b34c1aa File list lost 2026-05-21 12:06:01 -04:00
Peter Boyle 982ffe9ebe Lattice_reduction_gpu: demote timing logs to Debug, disable by default
skills/mpi-heterogeneous: add Bug Class 4 for Frontier GTL/libamdhip64 ABI mismatch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 12:05:36 -04:00
Peter Boyle 0251ecaeab Test_planned_fft: fix PlannedFFT template parameter to use ::vector_object
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 18:13:38 -04:00
Peter Boyle 372a27d645 tests: add Test_planned_fft exercising PlannedFFT<vobj>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 17:59:24 -04:00
Peter Boyle 72b4a061f3 tests/fft: remove PlanDestroy calls (FFT handles plans per-call)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 17:54:41 -04:00
Peter Boyle 29198efabe FFT: add FFTbase, PlannedFFT; factor FFT_dim_execute free function
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 17:53:17 -04:00
Peter Boyle 50aa51f93a debug: add Test_hipfft_repro — reproducer for hipFFT PARSE_ERROR on ROCm 7
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:27:27 -04:00
Peter Boyle 79ccc81a86 tests/debug: add G=4 to hipfft fail reproducer
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:21:52 -04:00
Peter Boyle 3f0fdbb597 tests/debug: test hipMemset variant before cache is populated
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:10:16 -04:00
Peter Boyle ea57bd8f03 tests/debug: extend hipfft fail reproducer with hipMemset and sync variants
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 22:02:02 -04:00
Peter Boyle bdba5b8403 FFT: use host stack buffer in PlanCreate, not deviceVector
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 21:49:06 -04:00
Peter Boyle 58cc6ca9c0 tests/debug: add minimal hipfft ordering bug fail/pass pair
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 21:48:23 -04:00
Peter Boyle e5996b440d tests/debug: test plan-before-malloc vs malloc-before-plan ordering
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 21:40:17 -04:00
Peter Boyle ad9d03fd85 tests/debug: extend hipfft reproducer with Grid-realistic howmany and exec tests
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 19:19:59 -04:00
Peter Boyle 4de160ce20 tests/debug: add minimal hipfft plan-creation reproducer
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 17:52:59 -04:00
Peter Boyle fc8c8ce6e7 FFT HIP: use hipfftCreate+hipfftMakePlanMany instead of hipfftPlanMany 2026-05-19 17:29:28 -04:00
Peter Boyle ddbb7f07c8 FFT: pass nullptr for inembed/onembed in hipfftPlanMany to avoid HIPFFT_PARSE_ERROR 2026-05-19 17:15:21 -04:00
Peter Boyle 1e29c59bcc FFT: cache plans per vobj type across calls
Plans are created lazily on the first FFT_dim call and reused for all
subsequent calls on the same FFT object.  PlanCreate<vobj>() can be
called explicitly to pre-warm the cache.  PlanDestroy() must be called
before switching to a different vobj type; the destructor cleans up any
live plans automatically.

Update Test_fft.cc and Test_fftf.cc to call PlanDestroy() between the
LatticeComplex and LatticeSpinMatrix sections that reuse the same FFT object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 15:12:10 -04:00
Peter Boyle b6abdc3845 Accelerator: lower default accelerator_threads from 16 to 8
Benchmark_dwf_fp32 on MI250X GCD: 1.7 TF/s at nt=8, ~300 GF/s at nt=16.
With Nsimd=8 (fp32, GEN_SIMD_WIDTH=64B), nt=8 gives exactly 64 threads =
one full AMD wavefront. Higher values double register demand per block and
hit a register-pressure cliff for stencil kernels.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 13:41:03 -04:00
Peter Boyle 2fadd8bb62 Accelerator: raise default accelerator_threads from 2 to 16 2026-05-19 10:15:53 -04:00
Peter Boyle 60df2dd5d0 skills: add gpu-memory-performance.md
Documents the acceleratorThreads() default=2 trap, LambdaApply thread
mapping, coalescedRead/Write idiom, when to use __global__ vs
accelerator_for, and fused vs staged HBM access patterns.

Includes observed MI250X numbers from LatticePropagatorD reduction
(50 → 297 → 546 GB/s progression).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 10:03:32 -04:00
Peter Boyle 66b529b345 sumD_gpu_reduce_words: fuse pack+reduce into single packReduceKernel
Replace the two-kernel pack+reduce sequence with a single fused kernel
packReduceKernel<R> that reads R words of each vobj at offset 'base'
and accumulates directly into iVector<iScalar<scalarD>,R>, eliminating
the intermediate bundle buffer entirely.

HBM access per word-group drops from 3x (pack-read + pack-write +
reduce-read) to 1x.  Thread count comes from getNumBlocksAndThreads
(warpSize..256) rather than acceleratorThreads(), so occupancy is
correct regardless of the --accelerator-threads setting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-19 09:46:43 -04:00
Peter Boyle 1304172a93 Modified repack 2026-05-19 08:53:13 -04:00
Peter Boyle 1315d4604d Enable GRID_REDUCTION_TIMING unconditionally
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 22:14:00 -04:00
Peter Boyle a31af31328 Lattice_reduction_gpu: add GRID_REDUCTION_TIMING instrumentation
Uncomment #define GRID_REDUCTION_TIMING to enable per-phase timing output:

  sumD_gpu_reduce_words: pack time (accelerator_for) per R and base
  sumD_gpu_small:        reduceKernel+barrier time and D2H time separately
  sumD_gpu_large:        total wall time across all word groups

This lets us identify whether the large-type bottleneck is in the pack
kernel, the shared-memory reduction kernel, the barrier, or the D2H.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 22:13:30 -04:00
Peter Boyle 26c3c7d8f9 sumD_gpu_large: radix-12 word-bundle reduction replacing radix-1
Replace the word-by-word loop (one kernel launch per scalar word) with
sumD_gpu_reduce_words<R> which packs R consecutive vector_type words per
site into iVector<iScalar<vector>,R>, then calls the existing sumD_gpu_small
shared-memory kernel once for the whole bundle.

Dispatch: radix-12 first, radix-4 for the remainder < 12, radix-1 for
any final < 4 words.  For LatticePropagator (144 words = 12x12), this
reduces the kernel-launch count from 144 to 12 -- a 12x reduction.

Bundle::Nsimd() inherits from vector_type so sumD_gpu_small handles SIMD
lane extraction and double-precision promotion identically to the scalar
word case.  sizeof(Bundle::scalar_objectD) = R*16 <= 192 B; well within
sharedMemPerBlock on all supported devices.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:56:45 -04:00
Peter Boyle 0650d7c7eb Lattice_reduction_sycl: fix double-precision accumulation in sumD_gpu_tensor
Accumulate in sobjD throughout rather than accumulating in sobj and
converting the final sum. For float fields this matters: summing N floats
then casting loses O(N*eps_float) relative precision vs accumulating in
double from the start.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:53:40 -04:00
Peter Boyle 068f95ad2d Revert to hand-rolled reduction; drop Lattice_reduction_gpu_cub.h
Remove the CUB/hipCUB direction entirely. Restore Lattice_reduction_gpu.h,
Lattice_reduction_sycl.h, and Lattice_reduction.h to the state before the
CUB rewrite (commit 969b0a39), recovering the original primary function names
(sumD_gpu_small, sumD_gpu_large, sumD_gpu, sum_gpu, sum_gpu_large) and the
hand-rolled shared-memory reduction kernel.

Delete Lattice_reduction_gpu_cub.h. Update Test_reduction to remove the
old/new comparison sections that depended on sum_gpu_old.

The lesson: CUB DeviceReduce is slower than the hand-rolled kernel for small
types, and the smem sizing problem for the extraction pass has no clean
solution within the accelerator_for abstraction. The right improvement is
a higher radix (12 then 4) in sumD_gpu_large, applied directly to the
existing hand-rolled kernel.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:52:18 -04:00
Peter Boyle f4fbf7c9ca sumD_gpu_direct: revert to per-lane write; CUB handles Nsimd*osites inputs
Benchmarking showed the shared-memory lane-summation approach (843d6497)
was slower than writing each SIMD lane individually and letting CUB reduce
the full nlanes = osites*Nsimd array. CUB's device reduce is more efficient
over the larger input than the smem overhead + serialised lane-0 summation.
The smem approach also required overriding acceleratorThreads() to avoid
the block-size sizing problem. Restore the simpler per-lane path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:23:15 -04:00
Peter Boyle 843d6497b2 sumD_gpu_direct: shared-memory lane reduction with acceleratorThreads(1)
Set acceleratorThreads to 1 before the extraction kernel so that
dim3(nsimd,1,1) blocks give exactly one site group per block and
__shared__ sobjD smem[nsimd] is correctly sized without depending on
the runtime acceleratorThreads() value. threadIdx.x (acceleratorSIMTlane)
indexes the SIMD lane for coalesced reads; lane 0 sums smem[0..nsimd-1]
and writes one sobjD per site. CUB then reduces osites elements instead
of osites*nsimd, reducing both store traffic and CUB work by Nsimd.
acceleratorSynchronise() (warp-level) suffices since nsimd < warpSize.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 21:08:10 -04:00
Peter Boyle 747c167658 sumD_gpu_direct: one thread per SIMD lane using extractLane
Replaces one thread per outer site calling Reduce() (sequential Nsimd-wide
loop) with one thread per lane calling extractLane() — O(1) per thread.
CUB now reduces over osites*Nsimd elements. Avoids serial lane reduction
but leaves the per-lane sobjD store stride as a known remaining concern.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 16:21:50 -04:00
Peter Boyle fca2c5dba0 Lattice_reduction_gpu_cub: define GRID_REDUCTION_TIMING in header
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 14:54:08 -04:00
Peter Boyle e12bc7f07c Lattice_reduction_gpu_cub: add GRID_REDUCTION_TIMING instrumentation
Guards accelerator_for and CUB DeviceReduce calls in sumD_gpu_direct
and sumD_gpu_large with #ifdef GRID_REDUCTION_TIMING to isolate where
time is spent in each path. Large path accumulates across all groups
and prints totals with words/nfull/rem context.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 14:23:44 -04:00
Peter Boyle dc6ae51cab Lattice_reduction_gpu_cub: replace WordBundle4 with iVector<iScalar<scalarD>,4>
WordBundle4 was redundant with Grid's existing tensor infrastructure.
iVector<iScalar<scalarD>,4> already provides accelerator_inline operator+,
zeroit(), and sycl::is_device_copyable — no new type needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 13:55:28 -04:00
Peter Boyle baa70d8ec9 Test_reduction: add timing benchmark for new vs old reduction paths
Reports us/call and GB/s for sum_gpu (CUB/sycl::reduction) and
sum_gpu_old (hand-rolled shared-memory) for each field type, with
5-call warmup and 100-call timed loop.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 12:31:13 -04:00
Peter Boyle c93b338bdd skills: HPC battle-hardening skill files for GPU+MPI correctness
Six skill files encoding expertise for making codebases robust on
problematic HPC systems, covering: correctness verification
(double-run, fingerprinting, flight recorder), hang diagnosis,
GPU runtime correctness (premature barrier, infinite poll),
MPI correctness on heterogeneous systems (device buffer aliasing,
AARCH64 PLT corruption, deterministic reductions),
compiler validation, and communication/computation overlap pipeline
design.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 12:10:44 -04:00
Peter Boyle c0472aa0ec Test_reduction: use separate float and double grids
Float fields require a grid constructed with vComplexF::Nsimd(); using
a double grid causes grid->_gsites to undercount the sites in float
vobjF, making the constant-field expected value wrong.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 12:09:35 -04:00
Peter Boyle 09552cfd73 Rename scalarNorm2 to squaredSum in Test_reduction.cc
The function computes |sum|^2 — the squared magnitude of an aggregate sum —
not a norm. squaredSum makes clear that squaring is applied to the sum, not
to individual site values before summing, distinguishing it from sumOfSquares
(the squared L2 norm).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 23:15:11 -04:00
Peter Boyle 003fec509c Fix Zero() used on thrust::complex in WordBundle4 initialisation
Grid's Zero() sentinel is not assignable to thrust::complex<double>;
use scalarD(0) instead.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 18:10:17 -04:00
Peter Boyle 773a82d87f Reinstate large/small dispatch in CUB reduction path; radix-4 word-bundle for large types
rocPRIM's DeviceReduce requires warpSize(64) threads each holding one element in shared
memory, so sizeof(T)*64 must fit in sharedMemPerBlock.  LatticePropagator::scalar_objectD
is 2304 bytes (64*2304 = 147 KB), exceeding the budget and triggering a compile-time
static_assert in limit_block_size.

Introduce sumD_gpu_direct (the original direct-CUB path, safe for small types) and a new
sumD_gpu_large that groups the vobj's vector_type words in bundles of 4, reducing each
bundle as WordBundle4<scalarD> (64 bytes, 64*64 = 4 KB — always within budget).  If
words % 4 != 0, the final partial bundle is zero-padded.  sumD_gpu dispatches at compile
time via if constexpr on sizeof(sobjD) > 512.

For LatticePropagator (144 words) this gives 36 CUB launches instead of 144.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 16:55:58 -04:00
Peter Boyle 286c29d6fb Add Test_reduction to tests/debug
Tests the new CUB/hipCUB/SYCL lattice reduction (sum_gpu) against the
preserved hand-rolled implementation (sum_gpu_old) for LatticeComplexF/D,
LatticeColourMatrixF/D and LatticePropagatorF/D.

Part a) gaussian random field: checks that old and new agree to within
float/double roundoff tolerance.
Part b) constant field (= 1.0, identity-matrix init): verifies
innerProduct(sum, sum) = Ncomp * V^2 where Ncomp counts the nonzero
diagonal scalar components per site (1 / Nc / Ns*Nc respectively).

Make.inc is auto-generated by scripts/filelist on bootstrap and is not
tracked; the new .cc file is all that is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 14:31:33 -04:00
Peter Boyle 969b0a3922 Rewrite lattice GPU reduction to use CUB, hipCUB, and SYCL reduction
Replace hand-rolled shared-memory reduction kernels (reduceBlock/reduceBlocks/
reduceKernel) and the global device variable retirementCount with a unified
CUB/hipCUB DeviceReduce::Reduce path for CUDA/HIP and sycl::reduction for SYCL.
No small/large split is needed: both CUB and sycl::reduction handle arbitrary
object sizes internally.

Old implementations preserved as sum_gpu_old / sumD_gpu_old etc. in the
original files for regression testing on GPU hardware.

Also add CLAUDE.md with build, test, and architecture guidance.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 13:41:56 -04:00
26 changed files with 1931 additions and 1169 deletions
-17
View File
@@ -30,9 +30,6 @@ Key configure options:
| `--enable-Nc=` | `3` (default), `2`, `4`, `5` |
| `--with-gmp=`, `--with-mpfr=`, `--with-fftw=`, `--with-lime=` | paths to libs |
| `--enable-hdf5`, `--enable-mkl`, `--enable-lapack` | optional features |
| `--enable-debug=yes` | adds `-g`, removes `-O3` |
Use `make V=1` for verbose compiler output (shows full flags; required for bug reports).
Platform recipes from `README.md`:
- **KNL**: `--enable-simd=KNL --enable-comms=mpi3-auto --enable-mkl`
@@ -99,17 +96,3 @@ GPU support is injected via macros (`accelerator_for`, `accelerator_for2dNB`). T
- The `RealD`/`RealF`/`ComplexD`/`ComplexF` typedefs are used everywhere; avoid raw `double`/`float`.
- Logging uses `Grid_log`, `Grid_error` macros (from `Grid/log/`); performance-critical paths use the `GRID_TRACE` / timer macros from `Grid/perfmon/`.
- Reductions across MPI ranks go through `GridBase::GlobalSum` / `GlobalMax`; never reduce with bare MPI calls inside library code.
## Skills
`skills/` contains seven user-invocable Claude Code skills encoding deep domain knowledge for HPC work in this repo. Invoke with `/skill-name` or ask Claude to use them by name:
| Skill | When to use |
|-------|-------------|
| `gpu-memory-performance` | Bandwidth/occupancy problems — `acceleratorThreads()` pitfalls, `coalescedRead/Write`, fused vs staged HBM patterns |
| `gpu-runtime-correctness` | Wrong answers, non-deterministic results, premature `q.wait()` returns |
| `communication-overlap` | Designing GPU+MPI overlap pipelines; replacing broken accelerator-aware MPI paths with host-staged 7-phase pipeline |
| `mpi-heterogeneous` | Collective hangs, buffer aliasing in `MPI_Sendrecv`, heterogeneous topology bugs |
| `hang-diagnosis` | Distinguishing ioctl hangs, infinite poll loops, collective deadlocks, and silent wrong-answer races |
| `correctness-verification` | Reproducibility checksums, double-wait testing, bisecting non-deterministic failures |
| `compiler-validation` | Confirming compiler/optimisation flags are safe before production runs |
+1
View File
@@ -53,6 +53,7 @@ NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
// Not really deflation, but useful
#include <Grid/algorithms/blas/MomentumProject.h>
#include <Grid/algorithms/blas/A2ASpatialSum.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
+213
View File
@@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid/algorithms/blas/A2ASpatialSum.h
Copyright (C) 2025
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
A2ASpatialSum
Replaces the scalar spatial accumulation loop in A2A extended meson field
contractions with a batched GEMM over local time slices, enabling GPU offload.
Given:
leftv[N_i][osite] — conjugated left SpinColourVectors (SIMD-packed)
loopRight[N_j][osite]— type-contracted right SpinColourVectors (SIMD-packed)
Computes:
EMF[i,j,t] = sum_{x,s,c} leftv[i][x,t,s,c] * loopRight[j][x,t,s,c]
via batched GEMM over nt local time slices, then GlobalSumVector across MPI.
Memory layout (all C row-major):
W_buf [nt][N_i][nxyz*Nsc] — W[t][i][x*Nsc+sc] = leftv[i] at (x,t)
LR_buf [nt][N_j][nxyz*Nsc] — LR[t][j][x*Nsc+sc] = loopRight[j] at (x,t)
EMF_buf[nt][N_j][N_i] — column-major result; EMF[i,j,t] = EMF_buf[t][j][i]
BLAS call (column-major, OP_T on A so A is read as W[i][k]):
C = A^T * B where A=W[N_i×K C-row], B=LR[N_j×K C-row], C=[N_j×N_i C-row]
→ C[i,j] = sum_k W[i][k] * LR[j][k] = EMF[i,j] ✓
*/
template<class vobj>
class A2ASpatialSum
{
public:
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_object sobj;
GridBase *grid;
int N_i, N_j;
int nt, nxyz, Nsc;
deviceVector<scalar> W_buf;
deviceVector<scalar> LR_buf;
deviceVector<scalar> EMF_buf;
deviceVector<scalar *> W_ptrs;
deviceVector<scalar *> LR_ptrs;
deviceVector<scalar *> EMF_ptrs;
A2ASpatialSum() : grid(nullptr), N_i(0), N_j(0), nt(0), nxyz(0), Nsc(0) {}
void Allocate(int _N_i, int _N_j, GridBase *_grid)
{
grid = _grid;
N_i = _N_i;
N_j = _N_j;
Coordinate ldims = grid->LocalDimensions();
nt = ldims[grid->Nd() - 1];
nxyz = grid->lSites() / nt;
Nsc = sizeof(sobj) / sizeof(scalar);
W_buf.resize(nt * N_i * nxyz * Nsc);
LR_buf.resize(nt * N_j * nxyz * Nsc);
EMF_buf.resize(nt * N_j * N_i);
// Build persistent batch pointer arrays
W_ptrs.resize(nt);
LR_ptrs.resize(nt);
EMF_ptrs.resize(nt);
scalar *Wh = &W_buf[0];
scalar *LRh = &LR_buf[0];
scalar *EMFh = &EMF_buf[0];
int lN_i = N_i, lN_j = N_j, lnxyz = nxyz, lNsc = Nsc;
for (int t = 0; t < nt; t++) {
acceleratorPut(W_ptrs[t], Wh + t * lN_i * lnxyz * lNsc);
acceleratorPut(LR_ptrs[t], LRh + t * lN_j * lnxyz * lNsc);
acceleratorPut(EMF_ptrs[t], EMFh + t * lN_j * lN_i);
}
}
void PackLeft(const std::vector<Lattice<vobj>> &leftv)
{
GRID_ASSERT((int)leftv.size() == N_i);
PackVectors(leftv, &W_buf[0], N_i);
}
void PackRight(const std::vector<Lattice<vobj>> &loopRight)
{
GRID_ASSERT((int)loopRight.size() == N_j);
PackVectors(loopRight, &LR_buf[0], N_j);
}
private:
// Pack vecs[N] lattice fields into buf[nt][N][nxyz*Nsc], extracting all SIMD lanes.
void PackVectors(const std::vector<Lattice<vobj>> &vecs, scalar *buf, int N)
{
int nd = grid->_ndimension;
int osites = grid->oSites();
int Nsimd = vobj::Nsimd();
int lN = N;
int lNsc = Nsc;
int lnxyz = nxyz;
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims = grid->LocalDimensions();
Coordinate simd = grid->_simd_layout;
for (int n = 0; n < N; n++) {
autoView(src_v, vecs[n], AcceleratorRead);
accelerator_for(sf, osites, Nsimd, {
#ifdef GRID_SIMT
{
int lane = acceleratorSIMTlane(Nsimd);
#else
for (int lane = 0; lane < Nsimd; lane++) {
#endif
Coordinate icoor(nd), ocoor(nd), lcoor(nd);
Lexicographic::CoorFromIndex(icoor, lane, simd);
Lexicographic::CoorFromIndex(ocoor, sf, rdimensions);
for (int d = 0; d < nd; d++)
lcoor[d] = rdimensions[d] * icoor[d] + ocoor[d];
int l_t = lcoor[nd - 1];
Coordinate xyz_coor = lcoor;
xyz_coor[nd - 1] = 0;
int64_t l_xyz;
Lexicographic::IndexFromCoor(xyz_coor, l_xyz, ldims);
sobj data = extractLane(lane, src_v[sf]);
scalar *data_s = (scalar *)&data;
int64_t base = (int64_t)l_t * lN * lnxyz * lNsc
+ (int64_t)n * lnxyz * lNsc
+ l_xyz * lNsc;
for (int sc = 0; sc < lNsc; sc++)
buf[base + sc] = data_s[sc];
}
});
}
}
public:
// Batched GEMM + MPI reduction → result[nt_global][N_i][N_j]
//
// BLAS (column-major, OP_T on A):
// C[N_j×N_i] = A^T[N_i×K] * B[N_j×K] with K=nxyz*Nsc
// reading A as C row-major [N_i][K] and B as C row-major [N_j][K]
// → C[i,j] = sum_k W[i,k] * LR[j,k] = EMF[i,j] ✓
void Sum(Eigen::Tensor<ComplexD, 3> &result)
{
GridBLAS BLAS;
int K = nxyz * Nsc;
BLAS.gemmBatched(GridBLAS_OP_T, GridBLAS_OP_N,
N_i, N_j, K,
scalar(1.0),
W_ptrs,
LR_ptrs,
scalar(0.0),
EMF_ptrs);
BLAS.synchronise();
// Copy from device and distribute into global-t layout
int nt_global = result.dimension(0);
int nd = grid->Nd();
int lt_start = grid->LocalStarts()[nd - 1];
std::vector<scalar> host_emf(nt * N_j * N_i);
acceleratorCopyFromDevice(&EMF_buf[0], host_emf.data(),
nt * N_j * N_i * sizeof(scalar));
// EMF_buf[t][j*N_i + i] = EMF[i,j] for local t
std::vector<scalar> global_emf(nt_global * N_i * N_j, scalar(0.0));
for (int lt = 0; lt < nt; lt++) {
int gt = lt + lt_start;
for (int i = 0; i < N_i; i++)
for (int j = 0; j < N_j; j++)
global_emf[gt * N_i * N_j + i * N_j + j] = host_emf[lt * N_j * N_i + j * N_i + i];
}
grid->GlobalSumVector(global_emf.data(), nt_global * N_i * N_j);
for (int gt = 0; gt < nt_global; gt++)
for (int i = 0; i < N_i; i++)
for (int j = 0; j < N_j; j++)
result(gt, i, j) = global_emf[gt * N_i * N_j + i * N_j + j];
}
};
NAMESPACE_END(Grid);
@@ -288,21 +288,17 @@ public:
//----------------------------------------------------------------------
if ( Nm>Nk ) {
Glog <<" #Ritz values of poly(A) before shift ["<<Nm<<" values]:"<<std::endl;
for(int i=0; i<Nm; ++i){
std::cout.precision(8);
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "Rval = "<<std::setw(16)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
}
// Glog <<" #Apply shifted QR transformations "<<std::endl;
//int k2 = Nk+Nu;
int k2 = Nk;
Eigen::MatrixXcd BTDM = Eigen::MatrixXcd::Identity(Nm,Nm);
Q = Eigen::MatrixXcd::Identity(Nm,Nm);
unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
for(int ip=Nk; ip<Nm; ++ip){
Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
}
@@ -330,11 +326,11 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
_sort.push(eval2,Nk);
Glog << "#Ritz values of poly(A) after shift ["<<Nk<<" values]:"<<std::endl;
// Glog << "#Ritz value after shift: "<< std::endl;
for(int i=0; i<Nk; ++i){
std::cout.precision(8);
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "Rval = "<<std::setw(16)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
// std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
}
}
//----------------------------------------------------------------------
@@ -714,17 +710,11 @@ private:
//lme[0][L] = beta;
for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
GRID_ASSERT (!isnan(norm2(w[u])));
}
// Diagnostic: print alpha (lmd) and beta (lme) block diagonals for this step
{
Glog << " blk b="<<std::setw(3)<<b<<" alpha:";
for (int u=0; u<Nu; ++u)
std::cout << " " << std::setw(10) << std::setprecision(6) << real(lmd[u][L+u]);
std::cout << " |beta|:";
for (int u=0; u<Nu; ++u)
std::cout << " " << std::setw(10) << std::setprecision(6) << abs(lme[u][L+u]);
std::cout << std::endl;
for (int k=L+u; k<R; ++k) {
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
}
}
// Glog << "LinAlg done "<< std::endl;
+28 -23
View File
@@ -32,6 +32,33 @@ Author: Christoph Lehner <christoph@lhnr.de>
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////
// innerProductD scalar overloads must be visible before norm2 is defined
//////////////////////////////////////
accelerator_inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); }
accelerator_inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD(const RealD &l,const RealD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD(const RealF &l,const RealF &r){ return innerProduct(l,r); }
accelerator_inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); }
accelerator_inline vRealD innerProductD(const vRealD &l,const vRealD &r){ return innerProduct(l,r); }
accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r)
{
vComplexD la,lb;
vComplexD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
return innerProduct(la,ra) + innerProduct(lb,rb);
}
accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r)
{
vRealD la,lb;
vRealD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
return innerProduct(la,ra) + innerProduct(lb,rb);
}
///////////////////////////////////////////////////////////////////////////////////////
// innerProduct Scalar x Scalar -> Scalar
// innerProduct Vector x Vector -> Scalar
@@ -138,30 +165,8 @@ auto Reduce (const iScalar<l>& lhs) -> iScalar<decltype(Reduce(lhs._internal))>
//////////////////////////////////////
// innerProductD : if single promote to double and evaluate with sum 2x
// (scalar/vector overloads are declared above norm2 for ADL visibility)
//////////////////////////////////////
accelerator_inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); }
accelerator_inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD(const RealD &l,const RealD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD(const RealF &l,const RealF &r){ return innerProduct(l,r); }
accelerator_inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); }
accelerator_inline vRealD innerProductD(const vRealD &l,const vRealD &r){ return innerProduct(l,r); }
accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r)
{
vComplexD la,lb;
vComplexD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
return innerProduct(la,ra) + innerProduct(lb,rb);
}
accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r)
{
vRealD la,lb;
vRealD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
return innerProduct(la,ra) + innerProduct(lb,rb);
}
// Now do it for vector, matrix, scalar
template<class l,class r,int N> accelerator_inline
+11 -40
View File
@@ -33,6 +33,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
@@ -96,49 +97,19 @@ int main (int argc, char ** argv)
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionD Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
NaiveStaggeredFermionD Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
FermionField src_e(&RBGrid); pickCheckerboard(Even, src_e, src);
FermionField src_o(&RBGrid); pickCheckerboard(Odd, src_o, src);
FermionField res_e(&RBGrid); res_e = Zero();
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;
// ImprovedStaggered Dhop
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
double t0=usecond();
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t1=usecond();
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // 1146 flops/site
std::cout<<GridLogMessage << ncall << " ImprovedStaggered Dhop calls in "<< (t1-t0)<<" us"<<std::endl;
std::cout<<GridLogMessage << "ImprovedStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
// ImprovedStaggered DhopEO
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
t0=usecond();
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
t1=usecond();
flops=(16*(3*(6+8+8)) + 15*3*2)*(volume/2)*ncall;
std::cout<<GridLogMessage << ncall << " ImprovedStaggered DhopEO calls in "<< (t1-t0)<<" us"<<std::endl;
std::cout<<GridLogMessage << "ImprovedStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
// NaiveStaggered Dhop
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
t0=usecond();
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
t1=usecond();
flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
std::cout<<GridLogMessage << ncall << " NaiveStaggered Dhop calls in "<< (t1-t0)<<" us"<<std::endl;
std::cout<<GridLogMessage << "NaiveStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
// NaiveStaggered DhopEO
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
t0=usecond();
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
t1=usecond();
flops=(8*(3*(6+8+8)) + 7*3*2)*(volume/2)*ncall;
std::cout<<GridLogMessage << ncall << " NaiveStaggered DhopEO calls in "<< (t1-t0)<<" us"<<std::endl;
std::cout<<GridLogMessage << "NaiveStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
Grid_finalize();
}
+14 -37
View File
@@ -96,45 +96,22 @@ int main (int argc, char ** argv)
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermionF Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
NaiveStaggeredFermionF Dn(Umu,Grid,RBGrid,mass,c1,u0,params);
FermionField src_e(&RBGrid); pickCheckerboard(Even, src_e, src);
FermionField src_o(&RBGrid); pickCheckerboard(Odd, src_o, src);
FermionField res_e(&RBGrid); res_e = Zero();
int ncall=10000;
// ImprovedStaggered Dhop
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t0=usecond();
for(int i=0;i<ncall;i++) Ds.Dhop(src,result,0);
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t1=usecond();
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // 1146 flops/site
std::cout<<GridLogMessage << "ImprovedStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
// ImprovedStaggered DhopEO
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
t0=usecond();
for(int i=0;i<ncall;i++) Ds.DhopEO(src_o,res_e,0);
t1=usecond();
flops=(16*(3*(6+8+8)) + 15*3*2)*(volume/2)*ncall;
std::cout<<GridLogMessage << "ImprovedStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
// NaiveStaggered Dhop
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
t0=usecond();
for(int i=0;i<ncall;i++) Dn.Dhop(src,result,0);
t1=usecond();
flops=(8*(3*(6+8+8)) + 7*3*2)*volume*ncall;
std::cout<<GridLogMessage << "NaiveStaggered Dhop mflop/s = "<< flops/(t1-t0)<<std::endl;
// NaiveStaggered DhopEO
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
t0=usecond();
for(int i=0;i<ncall;i++) Dn.DhopEO(src_o,res_e,0);
t1=usecond();
flops=(8*(3*(6+8+8)) + 7*3*2)*(volume/2)*ncall;
std::cout<<GridLogMessage << "NaiveStaggered DhopEO mflop/s = "<< flops/(t1-t0)<<std::endl;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
Grid_finalize();
}
+6 -170
View File
@@ -716,161 +716,6 @@ public:
return mflops_best;
}
static double NaiveStaggered(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); GRID_ASSERT(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark NaiveStaggered on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
typedef NaiveStaggeredFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
typename Action::ImplParams params;
Action Ds(Umu,*FGrid,*FrbGrid,mass,c1,u0,params);
///////// Source preparation ////////////
Fermion src (FGrid); random(RNG4,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
const int num_cases = 2;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
};
for(int c=0;c<num_cases;c++) {
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
for(uint64_t j=0;j<ni;j++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=((8*(3*(6+8+8)) + 7*3*2)*1.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
return mflops_best;
}
static double Clover(int L)
{
double mflops;
@@ -1042,7 +887,6 @@ int main (int argc, char ** argv)
std::vector<double> clover;
std::vector<double> dwf4;
std::vector<double> staggered;
std::vector<double> naive_staggered;
int Ls=1;
if (do_dslash){
@@ -1070,21 +914,13 @@ int main (int argc, char ** argv)
staggered.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Naive Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::NaiveStaggered(L_list[l]) ;
naive_staggered.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered \t\t Naive Staggered" <<std::endl;
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<" \t\t "<<naive_staggered[l]<<std::endl;
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
@@ -1094,14 +930,14 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered \t\t NaiveStag \t|\t (GF/s per node)" <<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
fprintf(FP,"Per node summary table\n");
fprintf(FP,"\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, NaiveStag\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
fprintf(FP,"\n");
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<" \t " <<naive_staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.,naive_staggered[l]/NN/1000.);
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
}
fprintf(FP,"\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+61
View File
@@ -0,0 +1,61 @@
---
name: ref_a2a_emf_work
description: "A2A Extended Meson Field GPU offload work — status, file locations, pending task"
metadata:
node_type: memory
type: project
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
---
## What was built
`Grid/algorithms/blas/A2ASpatialSum.h` — batched GEMM spatial sum replacing scalar SIMD accumulation. Included via `Grid/algorithms/Algorithms.h`.
`tests/Test_extended_meson_field.cc` — test with class `A2AExtendedMesonFieldRef` containing:
- CPU reference path (`use_blas=false`)
- BLAS path (`use_blas=true`) using `A2ASpatialSum`
- Per-phase timing with `[ref type=N]` / `[blas type=N]` labels
- 4 contraction types (0-3), all verified at machine precision (~4e-16 rel_err)
## Pending task: GPU offload class
**Goal**: Write `A2AExtendedMesonFieldGPU` in the same test file, replacing all `thread_for` loops with `accelerator_for`-based free function kernels.
The `thread_for` blocks to replace all have the form:
```cpp
thread_for(r, rd, {
int so = r * grid->_ostride[orthogdim];
for (int n = 0; n < e1; n++)
for (int b = 0; b < e2; b++) {
int ss = so + n * stride + b;
// work
}
});
```
Replace with `accelerator_for(ss, grid->oSites(), Nsimd, { ... })`.
**Free functions to write** (each takes `Lattice<T>` args, opens views internally):
- `A2ALoopPropagator` — outerProduct sum (loop build)
- `A2APackLeftConjugated` — conjugate left fermion fields into `Lattice<SpinColourVector_v>`
- `A2ALoopLeftContractionType0/1/2/3` — per-site loop × loop propagator → `tloop`
- `A2ALoopRightContractionType0/1/2/3` — per-site tloop × right → `loopRight[j]`
**Data structure changes required**:
- `tloopv`: `std::vector<SpinColourMatrix_v>``Lattice<SpinColourMatrix_v>` (PropagatorField)
- `leftv[i]`: `std::vector<SpinColourVector_v>``Lattice<SpinColourVector_v>`
- `loopRight[j]`: `std::vector<SpinColourVector_v>``Lattice<SpinColourVector_v>`
**Why**: `std::vector<vobj>` is host memory, not GPU accessible. See [[ref_lattice_vs_vector]].
**`A2ASpatialSum` impact**: `PackLeft`/`PackRight` currently take `std::vector<std::vector<vobj>>`. Once leftv/loopRight become `std::vector<Lattice<vobj>>`, those signatures must change to match.
## Timing on 8.8.8.16 (N_i=N_j=8, Nloop=4, 1 MPI rank)
Dominant costs:
- `loop_build`: 4-6 ms (outerProduct over 4 propagators)
- `pack_loopright`: 0.9-2.2 ms (type-dependent)
- `spatial_sum` (ref): ~1.5 ms
- `A2ASpatialSum TOTAL`: 2.5-4.3 ms (PackLeft+PackRight dominate GEMM on small volume)
## Related
[[ref_accelerator_for]] [[ref_coalesced_views]] [[ref_lattice_vs_vector]] [[ref_grid_simt_pattern]]
+43
View File
@@ -0,0 +1,43 @@
---
name: ref_accelerator_for
description: Grid accelerator_for usage — converting block-strided thread_for to GPU-portable oSites loops
metadata:
node_type: memory
type: reference
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
---
## Pattern: block-strided thread_for → accelerator_for over oSites
Old CPU-only pattern (block-strided over orthog dimension):
```cpp
thread_for(r, rd, {
int so = r * grid->_ostride[orthogdim];
for (int n = 0; n < e1; n++)
for (int b = 0; b < e2; b++) {
int ss = so + n * stride + b;
// work on site ss
}
});
```
GPU-portable replacement:
```cpp
accelerator_for(ss, grid->oSites(), Nsimd, {
// work on site ss — one SIMT thread per (osite, lane) on GPU
// one thread per osite (lane loop implicit via GRID_SIMT) on CPU
});
```
Key rules:
- `accelerator_for(iter, count, Nsimd, body)` — Nsimd is `vobj::Nsimd()` or `grid->Nsimd()`
- On CPU: expands to `thread_for` over count, `acceleratorSIMTlane` always returns 0 — must use `#ifdef GRID_SIMT` pattern if iterating lanes explicitly (see [[ref_grid_simt_pattern]])
- On GPU: one SIMT thread per (iter × lane), `acceleratorSIMTlane(Nsimd)` returns actual lane
- Loop body must capture only scalar/POD by value or via device-accessible pointers; no `std::vector` or host containers inside the body
- `Coordinate` inside `accelerator_for` must be `AcceleratorVector<int, MaxDims>` (stack-allocated, device-safe) — Grid's `Coordinate` typedef already satisfies this
## Where defined
`Grid/threads/Accelerator.h` — CPU path ~line 607; GPU paths in conditional blocks above.
## Model file
`Grid/algorithms/blas/MomentumProject.h``ImportVector` is the canonical example of correct `accelerator_for` + SIMD lane extraction.
+70
View File
@@ -0,0 +1,70 @@
---
name: ref_coalesced_views
description: Grid coalescedRead/coalescedWrite and autoView — GPU-portable field access inside accelerator_for
metadata:
node_type: memory
type: reference
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
---
## View access modes
```cpp
autoView(v, field, AcceleratorRead); // read-only, device-accessible
autoView(v, field, AcceleratorWrite); // write-only, device-accessible
autoView(v, field, AcceleratorReadWrite); // read-write, device-accessible
autoView(v, field, CpuRead); // CPU only (avoids GPU migration)
autoView(v, field, CpuWrite); // CPU only
```
Views must be opened **before** `accelerator_for` and closed (go out of scope) **after**. Never open a view inside the accelerator_for body.
## coalescedRead / coalescedWrite
Inside `accelerator_for(ss, oSites, Nsimd, { ... })`:
```cpp
auto site = coalescedRead(v[ss]); // reads SIMT lane; returns scalar_object on GPU, vobj on CPU
coalescedWrite(v[ss], site); // writes SIMT lane
```
- `coalescedRead(v[ss])` calls `v.operator()(ss)` which on GPU returns `extractLane(lane, v[ss])` — one lane per SIMT thread, contiguous across threads → coalesced
- On CPU returns the full vobj (no lane extraction needed; handled transparently)
- The returned type is `decltype(coalescedRead(v[ss]))` — use `auto` or match with scalar_object
## Typical kernel pattern
```cpp
autoView(out_v, out, AcceleratorWrite);
autoView(in_v, in, AcceleratorRead);
accelerator_for(ss, grid->oSites(), vobj::Nsimd(), {
auto x = coalescedRead(in_v[ss]);
// modify x ...
coalescedWrite(out_v[ss], x);
});
```
## Free function kernel signature
```cpp
template<class vobj>
void MyKernel(Lattice<vobj> &out, const Lattice<vobj> &in)
{
GridBase *grid = in.Grid();
autoView(out_v, out, AcceleratorWrite);
autoView(in_v, in, AcceleratorRead);
accelerator_for(ss, grid->oSites(), vobj::Nsimd(), {
auto x = coalescedRead(in_v[ss]);
coalescedWrite(out_v[ss], x);
});
}
```
## What NOT to do
- Do not access `std::vector` elements inside `accelerator_for` — not device-accessible
- Do not use `CpuRead`/`CpuWrite` views inside `accelerator_for` — GPU will fault
- Do not assign to `v[ss]` directly inside `accelerator_for` — use `coalescedWrite`
- Do not open multiple write views on the same field simultaneously
## Related
[[ref_accelerator_for]] [[ref_lattice_vs_vector]]
+47
View File
@@ -0,0 +1,47 @@
---
name: ref_grid_simt_pattern
description: Grid GRID_SIMT
metadata:
node_type: memory
type: reference
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
---
## The problem
On CPU, `accelerator_for(sf, oSites, Nsimd, {...})` expands to `thread_for(sf, oSites, {...})` — one thread per osite. `acceleratorSIMTlane(Nsimd)` always returns **0** on CPU. If you need to iterate all Nsimd lanes (e.g. to extract SIMD-packed data), you must loop explicitly on CPU.
On GPU, `accelerator_for` launches one SIMT thread per (osite × lane). `acceleratorSIMTlane(Nsimd)` returns the actual lane index [0, Nsimd).
## Correct pattern (from MomentumProject::ImportVector)
```cpp
accelerator_for(sf, osites, Nsimd, {
#ifdef GRID_SIMT
{
int lane = acceleratorSIMTlane(Nsimd);
#else
for (int lane = 0; lane < Nsimd; lane++) {
#endif
// body using lane
}
});
```
- On GPU: `GRID_SIMT` is defined → single-lane body, lane from hardware
- On CPU: `GRID_SIMT` is not defined → explicit lane loop inside the osite thread
## When is this needed?
Only when you explicitly need the lane index, e.g.:
- Extracting scalar data from SIMD-packed `vobj` via `extractLane(lane, src[sf])`
- Computing full local coordinates from (osite, lane) → `Lexicographic::CoorFromIndex(icoor, lane, simd_layout)`
When using `coalescedRead`/`coalescedWrite`, this pattern is **not needed** — those handle lane selection transparently.
## Pitfall that caused a bug
`A2ASpatialSum::PackVectors` originally used `accelerator_for` without the `#ifdef GRID_SIMT` lane loop. On CPU, only lane=0 was extracted, giving wrong norms (~8× too small for `GEN_SIMD_WIDTH=64`, `Nsimd=4`). Fix: add the `#ifdef GRID_SIMT` pattern. See [[ref_accelerator_for]].
## Model file
`Grid/algorithms/blas/MomentumProject.h`, function `ImportVector`, lines ~166-207.
+48
View File
@@ -0,0 +1,48 @@
---
name: ref_lattice_vs_vector
description: When to use Lattice<T> vs std::vector<T> for GPU-portable field storage in Grid
metadata:
node_type: memory
type: reference
originSessionId: 956e80aa-401d-481a-80bb-17f8abe1c131
---
## Rule
Use `Lattice<vobj>` (or `std::vector<Lattice<vobj>>`) for any field that will be read or written inside `accelerator_for`. `std::vector<vobj>` is host memory and is NOT device-accessible.
## Before vs after GPU offload
```cpp
// CPU-only (host memory, not GPU accessible)
std::vector<SpinColourVector_v> tloopv(oSites, Zero());
// accessed directly: tloopv[ss]
// GPU-portable
Lattice<SpinColourVector_v> tloop(grid);
// accessed via view: autoView(tloop_v, tloop, AcceleratorWrite);
// coalescedWrite(tloop_v[ss], val);
```
## Corollary: function signatures
CPU-only version:
```cpp
void PackLeft(const std::vector<std::vector<vobj>> &leftv);
```
GPU-portable version:
```cpp
void PackLeft(const std::vector<Lattice<vobj>> &leftv);
```
## deviceVector for raw device buffers
`deviceVector<T>` (defined in Grid) is like `std::vector<T>` but in device-accessible memory. Use for raw scalar scratch/pack buffers (e.g. GEMM input/output staging). Not for structured lattice data.
## Pointer arrays for batched BLAS
`deviceVector<scalar *>` holds batch pointer arrays. Populate with `acceleratorPut(ptrs[t], base + offset)` — sets device-side pointer from host. See `A2ASpatialSum::Allocate`.
## Related
[[ref_coalesced_views]] [[ref_accelerator_for]]
+63 -78
View File
@@ -1,91 +1,76 @@
Per node summary table
L , Wilson, DWF4, Staggered, NaiveStag
8 , 90, 933, 38, 23
12 , 403, 1688, 178, 113
16 , 188, 1647, 449, 295
24 , 947, 1574, 674, 553
32 , 931, 1371, 718, 643
Memory Bandwidth
Bytes, GB/s per node
786432, 40.271620
12582912, 433.611792
63700992, 905.374321
201326592, 1114.979152
491520000, 1180.241898
6291456, 379.297050
100663296, 3754.674992
509607936, 6521.472413
1610612736, 8513.456479
3932160000, 9018.901766
GEMM
M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 0.564958
16, 16, 16, 256, 243.148058
16, 32, 16, 256, 440.346877
32, 8, 32, 256, 439.194136
32, 16, 32, 256, 847.334141
32, 32, 32, 256, 1430.892623
64, 8, 64, 256, 1242.756741
64, 16, 64, 256, 2196.689493
64, 32, 64, 256, 3697.458072
16, 8, 256, 256, 899.582627
16, 16, 256, 256, 1673.537756
16, 32, 256, 256, 2959.597089
32, 8, 256, 256, 1558.858630
32, 16, 256, 256, 2864.839445
32, 32, 256, 256, 4810.671254
64, 8, 256, 256, 2386.092942
64, 16, 256, 256, 4451.665937
64, 32, 256, 256, 5942.124095
8, 256, 16, 256, 799.867271
16, 256, 16, 256, 1584.624888
32, 256, 16, 256, 1949.422338
8, 256, 32, 256, 1389.417474
16, 256, 32, 256, 2668.344493
32, 256, 32, 256, 3234.162120
8, 256, 64, 256, 2150.925128
16, 256, 64, 256, 4012.488132
32, 256, 64, 256, 5154.785521
Communications
Packet bytes, direction, GB/s per node
4718592, 1, 245.026198
4718592, 2, 251.180996
4718592, 3, 361.110977
4718592, 5, 247.898447
4718592, 6, 249.867523
4718592, 7, 359.033061
15925248, 1, 255.030946
15925248, 2, 264.453890
15925248, 3, 392.949183
15925248, 5, 256.040644
15925248, 6, 264.681896
15925248, 7, 392.102622
37748736, 1, 258.823333
37748736, 2, 268.181577
37748736, 3, 401.478191
37748736, 5, 258.995363
37748736, 6, 268.206586
37748736, 7, 400.397611
GEMM
M, N, K, BATCH, GF/s per rank fp64
16, 8, 16, 4096, 693.316363
16, 12, 16, 4096, 657.277058
16, 16, 16, 4096, 711.992616
32, 8, 32, 4096, 821.084324
32, 12, 32, 4096, 1279.852719
32, 16, 32, 4096, 2647.096674
64, 8, 64, 4096, 2630.192325
64, 12, 64, 4096, 3338.071321
64, 16, 64, 4096, 3950.899281
16, 8, 256, 4096, 1638.362501
16, 12, 256, 4096, 2377.502234
16, 16, 256, 4096, 3048.328833
32, 8, 256, 4096, 2917.384276
32, 12, 256, 4096, 4103.085151
32, 16, 256, 4096, 5102.971860
64, 8, 256, 4096, 3222.258206
64, 12, 256, 4096, 4619.456391
64, 16, 256, 4096, 5847.916650
8, 256, 16, 4096, 1728.073337
12, 256, 16, 4096, 2356.653970
16, 256, 16, 4096, 2676.876038
8, 256, 32, 4096, 2611.531990
12, 256, 32, 4096, 3451.573106
16, 256, 32, 4096, 3966.915301
8, 256, 64, 4096, 3436.248737
12, 256, 64, 4096, 4539.497945
16, 256, 64, 4096, 5307.992323
GEMM
M, N, K, BATCH, GF/s per rank fp32
16, 8, 16, 4096, 499.017445
16, 12, 16, 4096, 731.543385
16, 16, 16, 4096, 958.800786
32, 8, 32, 4096, 1549.813550
32, 12, 32, 4096, 2147.907502
32, 16, 32, 4096, 2601.698596
64, 8, 64, 4096, 3785.446233
64, 12, 64, 4096, 5116.694843
64, 16, 64, 4096, 6109.345016
16, 8, 256, 4096, 1206.627737
16, 12, 256, 4096, 1809.699599
16, 16, 256, 4096, 2412.014053
32, 8, 256, 4096, 2406.114488
32, 12, 256, 4096, 3605.531907
32, 16, 256, 4096, 4798.444037
64, 8, 256, 4096, 4688.711196
64, 12, 256, 4096, 6990.696301
64, 16, 256, 4096, 9214.749925
8, 256, 16, 4096, 2596.307289
12, 256, 16, 4096, 3439.892562
16, 256, 16, 4096, 3907.201036
8, 256, 32, 4096, 3012.752067
12, 256, 32, 4096, 3904.217583
16, 256, 32, 4096, 4599.047092
8, 256, 64, 4096, 3721.999042
12, 256, 64, 4096, 5098.573927
16, 256, 64, 4096, 6159.080872
Per node summary table
L , Wilson, DWF4, Staggered, GF/s per node
8 , 155, 1386, 50
12 , 694, 4208, 230
16 , 1841, 6675, 609
24 , 3934, 8573, 1641
32 , 5083, 9771, 3086
1 Per node summary table Memory Bandwidth
Per node summary table
L , Wilson, DWF4, Staggered, NaiveStag
8 , 90, 933, 38, 23
12 , 403, 1688, 178, 113
16 , 188, 1647, 449, 295
24 , 947, 1574, 674, 553
32 , 931, 1371, 718, 643
Memory Bandwidth
Bytes, GB/s per node
786432, 40.271620
1 12582912, 433.611792 Memory Bandwidth
2 63700992, 905.374321 Bytes, GB/s per node
3 201326592, 1114.979152 6291456, 379.297050
4 491520000, 1180.241898 100663296, 3754.674992
5 Communications 509607936, 6521.472413
6 Packet bytes, direction, GB/s per node 1610612736, 8513.456479
7 GEMM 3932160000, 9018.901766
8 M, N, K, BATCH, GF/s per rank fp64 GEMM
9 M, N, K, BATCH, GF/s per rank
10 16, 8, 16, 256, 0.564958
11 16, 16, 16, 256, 243.148058
12 16, 32, 16, 256, 440.346877
13 32, 8, 32, 256, 439.194136
14 32, 16, 32, 256, 847.334141
15 32, 32, 32, 256, 1430.892623
16 64, 8, 64, 256, 1242.756741
17 64, 16, 64, 256, 2196.689493
18 64, 32, 64, 256, 3697.458072
19 16, 8, 256, 256, 899.582627
20 16, 16, 256, 256, 1673.537756
21 16, 32, 256, 256, 2959.597089
22 32, 8, 256, 256, 1558.858630
23 32, 16, 256, 256, 2864.839445
24 32, 32, 256, 256, 4810.671254
25 64, 8, 256, 256, 2386.092942
26 64, 16, 256, 256, 4451.665937
27 64, 32, 256, 256, 5942.124095
28 8, 256, 16, 256, 799.867271
29 16, 256, 16, 256, 1584.624888
30 32, 256, 16, 256, 1949.422338
31 8, 256, 32, 256, 1389.417474
32 16, 256, 32, 256, 2668.344493
33 32, 256, 32, 256, 3234.162120
34 8, 256, 64, 256, 2150.925128
35 16, 256, 64, 256, 4012.488132
36 32, 256, 64, 256, 5154.785521
37 Communications
38 Packet bytes, direction, GB/s per node
39 4718592, 1, 245.026198
40 4718592, 2, 251.180996
41 4718592, 3, 361.110977
42 16, 8, 16, 4096, 693.316363 4718592, 5, 247.898447
43 16, 12, 16, 4096, 657.277058 4718592, 6, 249.867523
44 16, 16, 16, 4096, 711.992616 4718592, 7, 359.033061
45 32, 8, 32, 4096, 821.084324 15925248, 1, 255.030946
46 32, 12, 32, 4096, 1279.852719 15925248, 2, 264.453890
47 15925248, 3, 392.949183
48 15925248, 5, 256.040644
49 15925248, 6, 264.681896
50 15925248, 7, 392.102622
51 37748736, 1, 258.823333
52 37748736, 2, 268.181577
53 37748736, 3, 401.478191
54 37748736, 5, 258.995363
55 37748736, 6, 268.206586
56 37748736, 7, 400.397611
57 Per node summary table
58 L , Wilson, DWF4, Staggered, GF/s per node
59 8 , 155, 1386, 50
60 12 , 694, 4208, 230
61 16 , 1841, 6675, 609
62 24 , 3934, 8573, 1641
63 32 , 5083, 9771, 3086
64
65 32, 16, 32, 4096, 2647.096674
66 64, 8, 64, 4096, 2630.192325
67 64, 12, 64, 4096, 3338.071321
64, 16, 64, 4096, 3950.899281
16, 8, 256, 4096, 1638.362501
16, 12, 256, 4096, 2377.502234
16, 16, 256, 4096, 3048.328833
32, 8, 256, 4096, 2917.384276
32, 12, 256, 4096, 4103.085151
32, 16, 256, 4096, 5102.971860
64, 8, 256, 4096, 3222.258206
64, 12, 256, 4096, 4619.456391
64, 16, 256, 4096, 5847.916650
8, 256, 16, 4096, 1728.073337
12, 256, 16, 4096, 2356.653970
16, 256, 16, 4096, 2676.876038
8, 256, 32, 4096, 2611.531990
12, 256, 32, 4096, 3451.573106
16, 256, 32, 4096, 3966.915301
8, 256, 64, 4096, 3436.248737
12, 256, 64, 4096, 4539.497945
16, 256, 64, 4096, 5307.992323
GEMM
M, N, K, BATCH, GF/s per rank fp32
16, 8, 16, 4096, 499.017445
16, 12, 16, 4096, 731.543385
16, 16, 16, 4096, 958.800786
32, 8, 32, 4096, 1549.813550
32, 12, 32, 4096, 2147.907502
32, 16, 32, 4096, 2601.698596
64, 8, 64, 4096, 3785.446233
64, 12, 64, 4096, 5116.694843
64, 16, 64, 4096, 6109.345016
16, 8, 256, 4096, 1206.627737
16, 12, 256, 4096, 1809.699599
16, 16, 256, 4096, 2412.014053
32, 8, 256, 4096, 2406.114488
32, 12, 256, 4096, 3605.531907
32, 16, 256, 4096, 4798.444037
64, 8, 256, 4096, 4688.711196
64, 12, 256, 4096, 6990.696301
64, 16, 256, 4096, 9214.749925
8, 256, 16, 4096, 2596.307289
12, 256, 16, 4096, 3439.892562
16, 256, 16, 4096, 3907.201036
8, 256, 32, 4096, 3012.752067
12, 256, 32, 4096, 3904.217583
16, 256, 32, 4096, 4599.047092
8, 256, 64, 4096, 3721.999042
12, 256, 64, 4096, 5098.573927
16, 256, 64, 4096, 6159.080872
68
69
70
71
72
73
74
75
76
-20
View File
@@ -1,20 +0,0 @@
#!/usr/bin/env bash
# Configure Grid from a build directory two levels below the source root,
# e.g.: mkdir -p systems/mac-arm/build && cd systems/mac-arm/build && bash ../config-command-homebrew
#
# Prerequisites: source sourceme-homebrew.sh first.
CXX=mpicxx ../../configure \
--enable-simd=GEN \
--enable-comms=mpi-auto \
--enable-unified=yes \
--prefix="${HOME}/Grid-install" \
--with-lime="${CLIME}" \
--with-openssl="${OPENSSL}" \
--with-gmp="${GMP}" \
--with-mpfr="${MPFR}" \
--with-fftw="${FFTW}" \
--enable-Sp=no \
--disable-fermion-reps \
--disable-gparity \
--disable-debug
+4 -5
View File
@@ -3,15 +3,14 @@
CXX=mpicxx ../../configure \
--enable-simd=GEN \
--enable-comms=mpi-auto \
--enable-Sp=no \
--enable-unified=yes \
--disable-fermion-reps \
--disable-gparity \
--with-fftw=$FFTW \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
--prefix /Users/peterboyle/QCD/Grid-install \
--with-lime=$CLIME \
--with-openssl=$OPENSSL \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-debug
--with-fftw=$FFTW \
--disable-debug
-15
View File
@@ -1,15 +0,0 @@
#!/usr/bin/env bash
# Environment for building Grid on Apple Silicon Mac with Homebrew dependencies.
# Usage: source systems/mac-arm/sourceme-homebrew.sh
HOMEBREW=/opt/homebrew
export GMP=${HOMEBREW}/opt/gmp
export MPFR=${HOMEBREW}/opt/mpfr
export FFTW=${HOMEBREW}/opt/fftw
export OPENSSL=${HOMEBREW}/opt/openssl@3
export CLIME=/usr/local
export PATH="${HOMEBREW}/opt/open-mpi/bin:${HOMEBREW}/bin:${PATH}"
export LDFLAGS="-L${OPENSSL}/lib"
export CPPFLAGS="-I${OPENSSL}/include"
-4
View File
@@ -1,11 +1,7 @@
source /Users/peterboyle/QCD//Spack/spack//share/spack/setup-env.sh
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
#export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
+841
View File
@@ -0,0 +1,841 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: tests/Test_extended_meson_field.cc
Copyright (C) 2015-2025
Author: Peter Boyle <pboyle@bnl.gov>
Author: Masaaki Tomii <masaaki.tomii@uconn.edu> (original Hadrons kernels)
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
#include "disable_tests_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#include <Grid/qcd/utils/A2Autils.h>
using namespace Grid;
typedef WilsonImplD FImpl;
typedef typename FImpl::FermionField FermionField;
typedef typename FImpl::SiteSpinor vobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;
typedef iSpinColourVector<vector_type> SpinColourVector_v;
typedef iSpinMatrix<vector_type> SpinMatrix_v;
typedef iSinglet<vector_type> Scalar_v;
typedef iSinglet<scalar_type> Scalar_s;
typedef Lattice<SpinColourMatrix_v> PropagatorField;
// CPU reference + optionally batched GEMM spatial sum, ported from
// Hadrons/Modules/MContraction/A2AExtendedMesonField.hpp
// (M. Tomii, mtomii/Hadrons:local-2025-edits). Hadrons infrastructure removed.
// thread_for / CpuRead / orthogdim=3 preserved.
class A2AExtendedMesonFieldRef
{
public:
// result is indexed [nt][N_i][N_j].
// use_blas=true replaces the scalar spatial accumulation with A2ASpatialSum.
static void compute(
Eigen::Tensor<ComplexD, 3> &result,
const std::vector<FermionField> &left,
const std::vector<FermionField> &right,
const std::vector<FermionField> &loop1,
const std::vector<FermionField> &loop2,
const std::vector<Gamma::Algebra> &gamma1,
const std::vector<Gamma::Algebra> &gamma2,
int type,
bool use_blas = false)
{
GridBase *grid = left[0].Grid();
const int orthogdim = 3;
int rd = grid->_rdimensions[orthogdim];
int ld = grid->_ldimensions[orthogdim];
int Nd = grid->_ndimension;
int Nsimd = grid->Nsimd();
int nt = result.dimension(0);
int N_i = (int)left.size();
int N_j = (int)right.size();
std::string tag = std::string(use_blas ? "[blas" : "[ref ") + " type=" + std::to_string(type) + "]";
auto Tms = [](double us) { return us * 1e-3; };
double t0;
// ------------------------------------------------------------------
// Loop propagator: sum_k outerProduct(loop1[k], loop2[k])
// ------------------------------------------------------------------
t0 = usecond();
PropagatorField loop(grid);
loop = Zero();
for (unsigned int k = 0; k < loop1.size(); ++k)
loop += outerProduct(loop1[k], loop2[k]);
std::cout << GridLogMessage << tag << " loop_build: " << Tms(usecond()-t0) << " ms\n";
// ------------------------------------------------------------------
// Pack conjugated left vectors
// ------------------------------------------------------------------
t0 = usecond();
std::vector<FermionField> leftv(N_i, grid);
for (int i = 0; i < N_i; i++)
leftv[i] = conjugate(left[i]);
std::cout << GridLogMessage << tag << " pack_left: " << Tms(usecond()-t0) << " ms\n";
// ------------------------------------------------------------------
// Per-site loop contraction into PropagatorField tloop (type-dependent)
// ------------------------------------------------------------------
t0 = usecond();
PropagatorField tloop(grid);
tloop = Zero();
{
autoView(tloopv, tloop, CpuWrite);
autoView(loopv, loop, CpuRead);
if (type == 0) {
thread_for(ss, grid->oSites(), {
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
tloopv[ss]()(s1,s2)(0,0) = loopv[ss]()(s1,s2)(0,0)
+ loopv[ss]()(s1,s2)(1,1)
+ loopv[ss]()(s1,s2)(2,2);
});
}
if (type == 1) {
thread_for(ss, grid->oSites(), {
tloopv[ss] = Zero();
for (int mu = 0; mu < (int)gamma1.size(); ++mu)
tloopv[ss] = tloopv[ss] + Gamma(gamma1[mu]) * loopv[ss] * Gamma(gamma2[mu]);
});
}
if (type == 2) {
thread_for(ss, grid->oSites(), {
tloopv[ss] = Zero();
for (int mu = 0; mu < (int)gamma2.size(); ++mu) {
SpinColourMatrix_v tmp = Gamma(gamma2[mu]) * loopv[ss];
int s1 = mu / Ns;
int s2 = mu % Ns;
for (int c1 = 0; c1 < Nc; ++c1)
for (int c2 = 0; c2 < Nc; ++c2)
tloopv[ss]()(s1,s2)(c1,c2) = tmp()(0,0)(c1,c2) + tmp()(1,1)(c1,c2)
+ tmp()(2,2)(c1,c2) + tmp()(3,3)(c1,c2);
}
});
}
if (type == 3) {
thread_for(ss, grid->oSites(), {
SpinMatrix_v spinLoop = Zero();
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
spinLoop()(s1,s2)() = loopv[ss]()(s1,s2)(0,0)
+ loopv[ss]()(s1,s2)(1,1)
+ loopv[ss]()(s1,s2)(2,2);
tloopv[ss] = Zero();
for (int mu = 0; mu < (int)gamma1.size(); ++mu) {
SpinMatrix_v tmp2 = Gamma(gamma1[mu]) * spinLoop * Gamma(gamma2[mu]);
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
tloopv[ss]()(s1,s2)(0,0) = tloopv[ss]()(s1,s2)(0,0) + tmp2()(s1,s2)();
}
});
}
}
std::cout << GridLogMessage << tag << " tloop: " << Tms(usecond()-t0) << " ms\n";
// Select addLoopRight kernel for this type
std::function<void(SpinColourVector_v &,
const SpinColourMatrix_v &,
const SpinColourVector_v &,
const std::vector<Gamma::Algebra> &,
const std::vector<Gamma::Algebra> &)> addLoopRight;
if (type == 0) {
addLoopRight = [](SpinColourVector_v &lR,
const SpinColourMatrix_v &loopm,
const SpinColourVector_v &rightv,
const std::vector<Gamma::Algebra> &g1,
const std::vector<Gamma::Algebra> &g2) {
SpinMatrix_v spinLoop = Zero();
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
spinLoop()(s1,s2)() = loopm()(s1,s2)(0,0);
for (int mu = 0; mu < (int)g1.size(); ++mu) {
SpinMatrix_v GLoop = Gamma(g2[mu]) * spinLoop;
auto trGLoop = GLoop()(0,0)() + GLoop()(1,1)() + GLoop()(2,2)() + GLoop()(3,3)();
SpinColourVector_v Grightv = Gamma(g1[mu]) * rightv;
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c)
lR()(s)(c) += Grightv()(s)(c) * trGLoop;
}
};
}
if (type == 1) {
addLoopRight = [](SpinColourVector_v &lR,
const SpinColourMatrix_v &loopm,
const SpinColourVector_v &rightv,
const std::vector<Gamma::Algebra> &g1,
const std::vector<Gamma::Algebra> &g2) {
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c) {
lR()(s)(c)
+= loopm()(s,0)(c,0) * rightv()(0)(0)
+ loopm()(s,0)(c,1) * rightv()(0)(1)
+ loopm()(s,0)(c,2) * rightv()(0)(2)
+ loopm()(s,1)(c,0) * rightv()(1)(0)
+ loopm()(s,1)(c,1) * rightv()(1)(1)
+ loopm()(s,1)(c,2) * rightv()(1)(2)
+ loopm()(s,2)(c,0) * rightv()(2)(0)
+ loopm()(s,2)(c,1) * rightv()(2)(1)
+ loopm()(s,2)(c,2) * rightv()(2)(2)
+ loopm()(s,3)(c,0) * rightv()(3)(0)
+ loopm()(s,3)(c,1) * rightv()(3)(1)
+ loopm()(s,3)(c,2) * rightv()(3)(2);
}
};
}
if (type == 2) {
addLoopRight = [](SpinColourVector_v &lR,
const SpinColourMatrix_v &loopm,
const SpinColourVector_v &rightv,
const std::vector<Gamma::Algebra> &g1,
const std::vector<Gamma::Algebra> &g2) {
for (int mu = 0; mu < (int)g1.size(); ++mu) {
int s1 = mu / Ns;
int s2 = mu % Ns;
SpinColourVector_v Grightv = Gamma(g1[mu]) * rightv;
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c)
lR()(s)(c) += loopm()(s1,s2)(c,0) * Grightv()(s)(0)
+ loopm()(s1,s2)(c,1) * Grightv()(s)(1)
+ loopm()(s1,s2)(c,2) * Grightv()(s)(2);
}
};
}
if (type == 3) {
addLoopRight = [](SpinColourVector_v &lR,
const SpinColourMatrix_v &loopm,
const SpinColourVector_v &rightv,
const std::vector<Gamma::Algebra> &g1,
const std::vector<Gamma::Algebra> &g2) {
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c)
lR()(s)(c) += loopm()(s,0)(0,0) * rightv()(0)(c)
+ loopm()(s,1)(0,0) * rightv()(1)(c)
+ loopm()(s,2)(0,0) * rightv()(2)(c)
+ loopm()(s,3)(0,0) * rightv()(3)(c);
};
}
// ------------------------------------------------------------------
// Pack loopRight[j] = type-kernel(tloop, right[j]) per site
// ------------------------------------------------------------------
t0 = usecond();
std::vector<FermionField> loopRight(N_j, grid);
{
autoView(tlv, tloop, CpuRead);
for (int j = 0; j < N_j; j++) {
loopRight[j] = Zero();
autoView(lRv, loopRight[j], CpuWrite);
autoView(rv, right[j], CpuRead);
thread_for(ss, grid->oSites(), {
addLoopRight(lRv[ss], tlv[ss], rv[ss], gamma1, gamma2);
});
}
}
std::cout << GridLogMessage << tag << " pack_loopright: " << Tms(usecond()-t0) << " ms\n";
if (use_blas) {
// ------------------------------------------------------------------
// BLAS path: A2ASpatialSum (Allocate + PackLeft + PackRight + Sum)
// ------------------------------------------------------------------
A2ASpatialSum<SpinColourVector_v> spatial_sum;
double t_blas_start = usecond();
t0 = usecond();
spatial_sum.Allocate(N_i, N_j, grid);
std::cout << GridLogMessage << tag << " Allocate: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
spatial_sum.PackLeft(leftv);
std::cout << GridLogMessage << tag << " PackLeft: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
spatial_sum.PackRight(loopRight);
std::cout << GridLogMessage << tag << " PackRight: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
spatial_sum.Sum(result);
std::cout << GridLogMessage << tag << " Sum (GEMM+MPI): " << Tms(usecond()-t0) << " ms\n";
std::cout << GridLogMessage << tag << " A2ASpatialSum: " << Tms(usecond()-t_blas_start) << " ms [TOTAL]\n";
} else {
// ------------------------------------------------------------------
// Reference path: SIMD spatial sum + scalar extraction
// ------------------------------------------------------------------
int MFrvol = rd * N_i * N_j;
int MFlvol = ld * N_i * N_j;
Vector<Scalar_v> lvSum(MFrvol);
thread_for(r, MFrvol, { lvSum[r] = Zero(); });
t0 = usecond();
{
int e1 = grid->_slice_nblock[orthogdim];
int e2 = grid->_slice_block [orthogdim];
int stride = grid->_slice_stride[orthogdim];
using LView = decltype(leftv[0].View(CpuRead));
using RView = decltype(loopRight[0].View(CpuRead));
std::vector<LView> lv_views;
std::vector<RView> lr_views;
lv_views.reserve(N_i);
lr_views.reserve(N_j);
for (int i = 0; i < N_i; i++) lv_views.push_back(leftv[i].View(CpuRead));
for (int j = 0; j < N_j; j++) lr_views.push_back(loopRight[j].View(CpuRead));
thread_for(r, rd, {
int so = r * grid->_ostride[orthogdim];
int base = N_i * N_j * r;
for (int n = 0; n < e1; n++)
for (int b = 0; b < e2; b++) {
int ss = so + n * stride + b;
for (int ii = 0; ii < N_i; ii++)
for (int jj = 0; jj < N_j; jj++) {
int idx = jj + N_j * ii + base;
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c)
lvSum[idx]()()() += lv_views[ii][ss]()(s)(c) * lr_views[jj][ss]()(s)(c);
}
}
});
for (auto &v : lv_views) v.ViewClose();
for (auto &v : lr_views) v.ViewClose();
}
std::cout << GridLogMessage << tag << " spatial_sum: " << Tms(usecond()-t0) << " ms\n";
Vector<Scalar_s> lsSum(MFlvol);
thread_for(r, MFlvol, { lsSum[r] = scalar_type(0.0); });
t0 = usecond();
thread_for(rt, rd, {
Coordinate icoor(Nd);
ExtractBuffer<Scalar_s> extracted(Nsimd);
for (int ii = 0; ii < N_i; ii++)
for (int jj = 0; jj < N_j; jj++) {
int ij_rdx = jj + N_j * (ii + N_i * rt);
extract(lvSum[ij_rdx], extracted);
for (int idx = 0; idx < Nsimd; idx++) {
grid->iCoorFromIindex(icoor, idx);
int ldx = rt + icoor[orthogdim] * rd;
int ij_ldx = jj + N_j * (ii + N_i * ldx);
lsSum[ij_ldx] = lsSum[ij_ldx] + extracted[idx];
}
}
});
std::cout << GridLogMessage << tag << " simd_extract: " << Tms(usecond()-t0) << " ms\n";
int pd = grid->_processors[orthogdim];
int pc = grid->_processor_coor[orthogdim];
t0 = usecond();
Vector<ComplexD> cache(nt * N_i * N_j, ComplexD(0.0));
for (int lt = 0; lt < ld; lt++)
for (int pt = 0; pt < pd; pt++) {
int t = lt + pt * ld;
for (int ii = 0; ii < N_i; ii++)
for (int jj = 0; jj < N_j; jj++) {
if (pt == pc) {
int ij_ldx = jj + N_j * (ii + N_i * lt);
cache[jj + N_j * (ii + N_i * t)] = lsSum[ij_ldx]()()();
}
}
}
grid->GlobalSumVector(cache.data(), nt * N_i * N_j);
std::cout << GridLogMessage << tag << " globalsum: " << Tms(usecond()-t0) << " ms\n";
for (int t = 0; t < nt; t++)
for (int ii = 0; ii < N_i; ii++)
for (int jj = 0; jj < N_j; jj++)
result(t, ii, jj) = cache[jj + N_j * (ii + N_i * t)];
}
}
};
// ================================================================
// Free-function GPU kernels — accelerator_for, v(ss) reads,
// coalescedWrite writes, vobj-level arithmetic throughout.
// Gamma arrays passed as Vector<Gamma::Algebra> (UVM).
// ================================================================
void A2ALoopPropagator(PropagatorField &loop,
const std::vector<FermionField> &loop1,
const std::vector<FermionField> &loop2)
{
int Nk = (int)loop1.size();
uint64_t oSites = loop.Grid()->oSites();
int Nsimd = SpinColourVector_v::Nsimd();
typedef decltype(loop1[0].View(AcceleratorRead)) View;
std::vector<View> v1, v2;
v1.reserve(Nk); v2.reserve(Nk);
for (int k = 0; k < Nk; k++) {
v1.push_back(loop1[k].View(AcceleratorRead));
v2.push_back(loop2[k].View(AcceleratorRead));
}
deviceVector<SpinColourVector_v *> l1p(Nk), l2p(Nk);
for (int k = 0; k < Nk; k++) {
acceleratorPut(l1p[k], &v1[k][0]);
acceleratorPut(l2p[k], &v2[k][0]);
}
autoView(loopv, loop, AcceleratorWrite);
SpinColourVector_v **l1 = &l1p[0];
SpinColourVector_v **l2 = &l2p[0];
int lNk = Nk;
accelerator_for(ss, oSites, Nsimd, {
auto result = outerProduct(coalescedRead(l1[0][ss]), coalescedRead(l2[0][ss]));
for (int k = 1; k < lNk; k++)
result = result + outerProduct(coalescedRead(l1[k][ss]), coalescedRead(l2[k][ss]));
coalescedWrite(loopv[ss], result);
});
}
void A2APackLeftConjugated(FermionField &out, const FermionField &in)
{
autoView(outv, out, AcceleratorWrite);
autoView(inv, in, AcceleratorRead);
uint64_t Osites = in.Grid()->oSites();
int Nsimd = SpinColourVector_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
coalescedWrite(outv[ss], conjugate(inv(ss)));
});
}
// Type 0: colour-trace stored in (s1,s2)(0,0)
void A2ALoopLeftContractionType0(PropagatorField &tloop, const PropagatorField &loop)
{
autoView(tloopv, tloop, AcceleratorWrite);
autoView(loopv, loop, AcceleratorRead);
uint64_t Osites = loop.Grid()->oSites();
int Nsimd = SpinColourMatrix_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
auto l = loopv(ss);
auto tmp = l; tmp = Zero();
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
tmp()(s1,s2)(0,0) = l()(s1,s2)(0,0) + l()(s1,s2)(1,1) + l()(s1,s2)(2,2);
coalescedWrite(tloopv[ss], tmp);
});
}
// Type 1: tloop = sum_mu Gamma(g1[mu]) * loop * Gamma(g2[mu])
void A2ALoopLeftContractionType1(PropagatorField &tloop, const PropagatorField &loop,
const Vector<Gamma::Algebra> &gamma1,
const Vector<Gamma::Algebra> &gamma2)
{
int ng = (int)gamma1.size();
const Gamma::Algebra *g1 = gamma1.data();
const Gamma::Algebra *g2 = gamma2.data();
autoView(tloopv, tloop, AcceleratorWrite);
autoView(loopv, loop, AcceleratorRead);
uint64_t Osites = loop.Grid()->oSites();
int Nsimd = SpinColourMatrix_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
auto l = loopv(ss);
auto tmp = l; tmp = Zero();
for (int mu = 0; mu < ng; ++mu)
tmp = tmp + Gamma(g1[mu]) * l * Gamma(g2[mu]);
coalescedWrite(tloopv[ss], tmp);
});
}
// Type 2: for mu=[0..ng), s1=mu/Ns, s2=mu%Ns;
// tloop(s1,s2)(c1,c2) = Tr_spin( Gamma(g2[mu]) * loop )(c1,c2)
void A2ALoopLeftContractionType2(PropagatorField &tloop, const PropagatorField &loop,
const Vector<Gamma::Algebra> &gamma2)
{
int ng = (int)gamma2.size();
const Gamma::Algebra *g2 = gamma2.data();
autoView(tloopv, tloop, AcceleratorWrite);
autoView(loopv, loop, AcceleratorRead);
uint64_t Osites = loop.Grid()->oSites();
int Nsimd = SpinColourMatrix_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
auto l = loopv(ss);
auto tmp = l; tmp = Zero();
for (int mu = 0; mu < ng; ++mu) {
auto gtmp = Gamma(g2[mu]) * l;
int s1 = mu / Ns;
int s2 = mu % Ns;
for (int c1 = 0; c1 < Nc; ++c1)
for (int c2 = 0; c2 < Nc; ++c2)
tmp()(s1,s2)(c1,c2) = gtmp()(0,0)(c1,c2) + gtmp()(1,1)(c1,c2)
+ gtmp()(2,2)(c1,c2) + gtmp()(3,3)(c1,c2);
}
coalescedWrite(tloopv[ss], tmp);
});
}
// Type 3: colour-trace → spin matrix → sum_mu G1*spinLoop*G2 stored in (s1,s2)(0,0)
void A2ALoopLeftContractionType3(PropagatorField &tloop, const PropagatorField &loop,
const Vector<Gamma::Algebra> &gamma1,
const Vector<Gamma::Algebra> &gamma2)
{
int ng = (int)gamma1.size();
const Gamma::Algebra *g1 = gamma1.data();
const Gamma::Algebra *g2 = gamma2.data();
autoView(tloopv, tloop, AcceleratorWrite);
autoView(loopv, loop, AcceleratorRead);
uint64_t Osites = loop.Grid()->oSites();
int Nsimd = SpinColourMatrix_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
typedef decltype(coalescedRead(loopv[0])) calcSCMatrix;
typedef iSpinMatrix<typename calcSCMatrix::vector_type> calcSpinMatrix;
auto l = loopv(ss);
calcSpinMatrix spinLoop; spinLoop = Zero();
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
spinLoop()(s1,s2)() = l()(s1,s2)(0,0) + l()(s1,s2)(1,1) + l()(s1,s2)(2,2);
auto tmp = l; tmp = Zero();
for (int mu = 0; mu < ng; ++mu) {
calcSpinMatrix tmp2 = Gamma(g1[mu]) * spinLoop * Gamma(g2[mu]);
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
tmp()(s1,s2)(0,0) = tmp()(s1,s2)(0,0) + tmp2()(s1,s2)();
}
coalescedWrite(tloopv[ss], tmp);
});
}
// Type 0: loopRight = sum_mu Tr(G2*spinLoop) * G1*right
// where spinLoop(s1,s2) = tloop(s1,s2)(0,0)
void A2ALoopRightContractionType0(FermionField &loopRight,
const PropagatorField &tloop,
const FermionField &right,
const Vector<Gamma::Algebra> &gamma1,
const Vector<Gamma::Algebra> &gamma2)
{
int ng = (int)gamma1.size();
const Gamma::Algebra *g1 = gamma1.data();
const Gamma::Algebra *g2 = gamma2.data();
autoView(lRv, loopRight, AcceleratorWrite);
autoView(tlv, tloop, AcceleratorRead);
autoView(rv, right, AcceleratorRead);
uint64_t Osites = right.Grid()->oSites();
int Nsimd = SpinColourVector_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
typedef decltype(coalescedRead(rv[0])) calcSCVector;
typedef decltype(coalescedRead(tlv[0])) calcSCMatrix;
typedef iSpinMatrix<typename calcSCMatrix::vector_type> calcSpinMatrix;
auto loopm = tlv(ss);
auto rightv = rv(ss);
calcSpinMatrix spinLoop; spinLoop = Zero();
for (int s1 = 0; s1 < Ns; ++s1)
for (int s2 = 0; s2 < Ns; ++s2)
spinLoop()(s1,s2)() = loopm()(s1,s2)(0,0);
calcSCVector lR; lR = Zero();
for (int mu = 0; mu < ng; ++mu) {
auto GLoop = Gamma(g2[mu]) * spinLoop;
auto trGLoop = GLoop()(0,0)() + GLoop()(1,1)() + GLoop()(2,2)() + GLoop()(3,3)();
auto Grightv = Gamma(g1[mu]) * rightv;
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c)
lR()(s)(c) = lR()(s)(c) + Grightv()(s)(c) * trGLoop;
}
coalescedWrite(lRv[ss], lR);
});
}
// Type 1: loopRight = tloop * right (SpinColourMatrix * SpinColourVector)
void A2ALoopRightContractionType1(FermionField &loopRight,
const PropagatorField &tloop,
const FermionField &right)
{
autoView(lRv, loopRight, AcceleratorWrite);
autoView(tlv, tloop, AcceleratorRead);
autoView(rv, right, AcceleratorRead);
uint64_t Osites = right.Grid()->oSites();
int Nsimd = SpinColourVector_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
coalescedWrite(lRv[ss], tlv(ss) * rv(ss));
});
}
// Type 2: loopRight(s)(c) = sum_{mu,c'} tloop(s1,s2)(c,c') * (G(g1[mu])*right)(s)(c')
void A2ALoopRightContractionType2(FermionField &loopRight,
const PropagatorField &tloop,
const FermionField &right,
const Vector<Gamma::Algebra> &gamma1)
{
int ng = (int)gamma1.size();
const Gamma::Algebra *g1 = gamma1.data();
autoView(lRv, loopRight, AcceleratorWrite);
autoView(tlv, tloop, AcceleratorRead);
autoView(rv, right, AcceleratorRead);
uint64_t Osites = right.Grid()->oSites();
int Nsimd = SpinColourVector_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
typedef decltype(coalescedRead(rv[0])) calcSCVector;
auto loopm = tlv(ss);
auto rightv = rv(ss);
calcSCVector lR; lR = Zero();
for (int mu = 0; mu < ng; ++mu) {
int s1 = mu / Ns;
int s2 = mu % Ns;
auto Grightv = Gamma(g1[mu]) * rightv;
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c)
lR()(s)(c) = lR()(s)(c)
+ loopm()(s1,s2)(c,0) * Grightv()(s)(0)
+ loopm()(s1,s2)(c,1) * Grightv()(s)(1)
+ loopm()(s1,s2)(c,2) * Grightv()(s)(2);
}
coalescedWrite(lRv[ss], lR);
});
}
// Type 3: loopRight(s)(c) = sum_{s'} tloop(s,s')(0,0) * right(s')(c)
void A2ALoopRightContractionType3(FermionField &loopRight,
const PropagatorField &tloop,
const FermionField &right)
{
autoView(lRv, loopRight, AcceleratorWrite);
autoView(tlv, tloop, AcceleratorRead);
autoView(rv, right, AcceleratorRead);
uint64_t Osites = right.Grid()->oSites();
int Nsimd = SpinColourVector_v::Nsimd();
accelerator_for(ss, Osites, Nsimd, {
typedef decltype(coalescedRead(rv[0])) calcSCVector;
auto loopm = tlv(ss);
auto rightv = rv(ss);
calcSCVector lR; lR = Zero();
for (int s = 0; s < Ns; ++s)
for (int c = 0; c < Nc; ++c)
lR()(s)(c) = loopm()(s,0)(0,0) * rightv()(0)(c)
+ loopm()(s,1)(0,0) * rightv()(1)(c)
+ loopm()(s,2)(0,0) * rightv()(2)(c)
+ loopm()(s,3)(0,0) * rightv()(3)(c);
coalescedWrite(lRv[ss], lR);
});
}
// ================================================================
// GPU-offloaded extended meson field: accelerator_for contractions
// + A2ASpatialSum GEMM spatial reduction.
// ================================================================
class A2AExtendedMesonFieldGPU
{
public:
static void compute(
Eigen::Tensor<ComplexD, 3> &result,
const std::vector<FermionField> &left,
const std::vector<FermionField> &right,
const std::vector<FermionField> &loop1,
const std::vector<FermionField> &loop2,
const std::vector<Gamma::Algebra> &gamma1_in,
const std::vector<Gamma::Algebra> &gamma2_in,
int type)
{
GridBase *grid = left[0].Grid();
int N_i = (int)left.size();
int N_j = (int)right.size();
std::string tag = std::string("[gpu type=") + std::to_string(type) + "]";
auto Tms = [](double us) { return us * 1e-3; };
double t0;
Vector<Gamma::Algebra> gamma1(gamma1_in.begin(), gamma1_in.end());
Vector<Gamma::Algebra> gamma2(gamma2_in.begin(), gamma2_in.end());
t0 = usecond();
for (auto &f : loop1) { autoView(v, f, AcceleratorRead); }
for (auto &f : loop2) { autoView(v, f, AcceleratorRead); }
std::cout << GridLogMessage << tag << " view_open_loop: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
PropagatorField loop(grid);
A2ALoopPropagator(loop, loop1, loop2);
std::cout << GridLogMessage << tag << " loop_build: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
for (int i = 0; i < N_i; i++) { autoView(v, left[i], AcceleratorRead); }
std::cout << GridLogMessage << tag << " view_open_left: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
std::vector<FermionField> leftv(N_i, grid);
for (int i = 0; i < N_i; i++)
A2APackLeftConjugated(leftv[i], left[i]);
std::cout << GridLogMessage << tag << " pack_left: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
PropagatorField tloop(grid);
tloop = Zero();
switch (type) {
case 0: A2ALoopLeftContractionType0(tloop, loop); break;
case 1: A2ALoopLeftContractionType1(tloop, loop, gamma1, gamma2); break;
case 2: A2ALoopLeftContractionType2(tloop, loop, gamma2); break;
case 3: A2ALoopLeftContractionType3(tloop, loop, gamma1, gamma2); break;
}
std::cout << GridLogMessage << tag << " tloop: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
{ autoView(tlv, tloop, AcceleratorRead); }
for (int j = 0; j < N_j; j++) { autoView(rv, right[j], AcceleratorRead); }
std::cout << GridLogMessage << tag << " view_open_right: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
std::vector<FermionField> loopRight(N_j, grid);
for (int j = 0; j < N_j; j++) {
switch (type) {
case 0: A2ALoopRightContractionType0(loopRight[j], tloop, right[j], gamma1, gamma2); break;
case 1: A2ALoopRightContractionType1(loopRight[j], tloop, right[j]); break;
case 2: A2ALoopRightContractionType2(loopRight[j], tloop, right[j], gamma1); break;
case 3: A2ALoopRightContractionType3(loopRight[j], tloop, right[j]); break;
}
}
std::cout << GridLogMessage << tag << " pack_loopright: " << Tms(usecond()-t0) << " ms\n";
A2ASpatialSum<SpinColourVector_v> spatial_sum;
double t_blas = usecond();
t0 = usecond();
spatial_sum.Allocate(N_i, N_j, grid);
std::cout << GridLogMessage << tag << " Allocate: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
spatial_sum.PackLeft(leftv);
std::cout << GridLogMessage << tag << " PackLeft: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
spatial_sum.PackRight(loopRight);
std::cout << GridLogMessage << tag << " PackRight: " << Tms(usecond()-t0) << " ms\n";
t0 = usecond();
spatial_sum.Sum(result);
std::cout << GridLogMessage << tag << " Sum (GEMM+MPI): " << Tms(usecond()-t0) << " ms\n";
std::cout << GridLogMessage << tag << " A2ASpatialSum: " << Tms(usecond()-t_blas) << " ms [TOTAL]\n";
}
};
int main(int argc, char *argv[])
{
Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout = GridDefaultSimd(4, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
GridCartesian grid(latt_size, simd_layout, mpi_layout);
int Nt = latt_size[Tp];
int N_i = 8;
int N_j = 8;
int Nloop = 4;
if (GridCmdOptionExists(argv, argv+argc, "--Ni"))
N_i = std::stoi(GridCmdOptionPayload(argv, argv+argc, "--Ni"));
if (GridCmdOptionExists(argv, argv+argc, "--Nj"))
N_j = std::stoi(GridCmdOptionPayload(argv, argv+argc, "--Nj"));
GridParallelRNG pRNG(&grid);
pRNG.SeedFixedIntegers({1, 2, 3, 4});
std::vector<FermionField> left(N_i, &grid);
std::vector<FermionField> right(N_j, &grid);
std::vector<FermionField> loop1(Nloop, &grid);
std::vector<FermionField> loop2(Nloop, &grid);
for (auto &f : left) random(pRNG, f);
for (auto &f : right) random(pRNG, f);
for (auto &f : loop1) random(pRNG, f);
for (auto &f : loop2) random(pRNG, f);
std::vector<Gamma::Algebra> GammaMU = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Eigen::Tensor<ComplexD, 3> result_ref(Nt, N_i, N_j);
Eigen::Tensor<ComplexD, 3> result_blas(Nt, N_i, N_j);
Eigen::Tensor<ComplexD, 3> result_gpu(Nt, N_i, N_j);
double t_ref = 0, t_blas = 0, t_gpu = 0, start, stop;
for (int type = 0; type < 4; type++) {
result_ref.setZero();
start = usecond();
A2AExtendedMesonFieldRef::compute(result_ref, left, right, loop1, loop2,
GammaMU, GammaMU, type, false);
stop = usecond(); t_ref = stop - start;
result_blas.setZero();
start = usecond();
A2AExtendedMesonFieldRef::compute(result_blas, left, right, loop1, loop2,
GammaMU, GammaMU, type, true);
stop = usecond(); t_blas = stop - start;
result_gpu.setZero();
start = usecond();
A2AExtendedMesonFieldGPU::compute(result_gpu, left, right, loop1, loop2,
GammaMU, GammaMU, type);
stop = usecond(); t_gpu = stop - start;
double norm2_ref = 0.0, norm2_blas = 0.0, norm2_gpu = 0.0;
double norm2_diff_blas = 0.0, norm2_diff_gpu = 0.0;
for (int t = 0; t < Nt; t++)
for (int ii = 0; ii < N_i; ii++)
for (int jj = 0; jj < N_j; jj++) {
norm2_ref += norm2(result_ref(t, ii, jj));
norm2_blas += norm2(result_blas(t, ii, jj));
norm2_gpu += norm2(result_gpu(t, ii, jj));
ComplexD diff_blas = result_ref(t, ii, jj) - result_blas(t, ii, jj);
ComplexD diff_gpu = result_ref(t, ii, jj) - result_gpu(t, ii, jj);
norm2_diff_blas += norm2(diff_blas);
norm2_diff_gpu += norm2(diff_gpu);
}
double rel_blas = (norm2_ref > 0) ? std::sqrt(norm2_diff_blas / norm2_ref) : 0.0;
double rel_gpu = (norm2_ref > 0) ? std::sqrt(norm2_diff_gpu / norm2_ref) : 0.0;
std::cout << GridLogMessage
<< "type=" << type
<< " norm2_ref=" << norm2_ref
<< " norm2_blas=" << norm2_blas
<< " norm2_gpu=" << norm2_gpu
<< " rel_blas=" << rel_blas
<< " rel_gpu=" << rel_gpu
<< " t_ref=" << t_ref * 1e-6 << "s"
<< " t_blas=" << t_blas * 1e-6 << "s"
<< " t_gpu=" << t_gpu * 1e-6 << "s"
<< std::endl;
GRID_ASSERT(rel_blas < 1e-10);
GRID_ASSERT(rel_gpu < 1e-10);
}
std::cout << GridLogMessage << "All types passed A2ASpatialSum and GPU regression." << std::endl;
Grid_finalize();
return EXIT_SUCCESS;
}
#endif
+2 -2
View File
@@ -36,8 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
//gridblasHandle_t GridBLAS::gridblasHandle;
//int GridBLAS::gridblasInit;
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
///////////////////////
// Tells little dirac op to use MdagM as the .Op()
+76
View File
@@ -0,0 +1,76 @@
/*
* Isolating the hipfft HIPFFT_PARSE_ERROR on ROCm 7 / hipFFT 1.0.20.
*
* Tests three orderings with an empty rocFFT cache to find which GPU
* operation before plan creation triggers the failure:
* A) hipMalloc only — hypothesis: passes (no async GPU work)
* B) hipMalloc + hipMemset — hypothesis: fails (async GPU work in flight)
* C) hipMalloc + hipMemset — hypothesis: passes (work completed before plan)
* + hipDeviceSynchronize
*
* Compile:
* hipcc -o Test_hipfft_bug_fail Test_hipfft_bug_fail.cc -lhipfft
*
* Run with empty cache:
* rm -rf ~/.cache/
* ./Test_hipfft_bug_fail
*/
#include <cstdio>
#include <hipfft/hipfft.h>
#include <hip/hip_runtime.h>
static const char *res(hipfftResult rv) {
return rv == HIPFFT_SUCCESS ? "SUCCESS" : "PARSE_ERROR";
}
static hipfftResult makePlan(int G, int howmany) {
int n[] = {G};
hipfftHandle p;
size_t workSize = 0;
hipfftCreate(&p);
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
nullptr, 1, G, nullptr, 1, G,
HIPFFT_Z2Z, howmany, &workSize);
hipfftDestroy(p);
return rv;
}
int main(void) {
hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, 0);
printf("Device: %s\n", prop.name);
#ifdef hipfftVersionMinor
printf("hipFFT version: %d.%d.%d\n\n",
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
#endif
for (int G : {4, 8, 16, 32}) {
int howmany = 512;
long nelems = (long)G * howmany;
hipfftDoubleComplex *buf = nullptr;
hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
// Tests ordered so each runs before a prior success can populate the cache.
// B first: hipMalloc + hipMemset (async GPU work in flight)
// If this fails, A (no hipMemset) will pass, confirming hipMemset is the trigger.
hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
hipfftResult rvB = makePlan(G, howmany);
printf("G=%-4d B) hipMalloc + hipMemset : %s\n", G, res(rvB));
// C: hipMalloc + hipMemset + sync — does syncing before plan creation fix it?
hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
hipDeviceSynchronize();
hipfftResult rvC = makePlan(G, howmany);
printf("G=%-4d C) hipMalloc + hipMemset + sync: %s\n", G, res(rvC));
// A last: hipMalloc only, no async GPU work — should always pass
hipfftResult rvA = makePlan(G, howmany);
printf("G=%-4d A) hipMalloc only : %s\n\n", G, res(rvA));
hipFree(buf);
}
return 0;
}
+61
View File
@@ -0,0 +1,61 @@
/*
* Minimal program demonstrating the workaround for the hipfft ROCm 7 bug.
*
* Workaround: create the hipfft plan BEFORE any hipMalloc. Plan creation
* for G < 32 then succeeds even with an empty rocFFT cache.
*
* Compile:
* hipcc -o Test_hipfft_bug_pass Test_hipfft_bug_pass.cc -lhipfft
*
* Run:
* rm -rf ~/.cache/rocfft
* ./Test_hipfft_bug_pass
*
* Expected: all G values succeed.
* Compare with Test_hipfft_bug_fail.cc which uses the opposite ordering.
*/
#include <cstdio>
#include <hipfft/hipfft.h>
#include <hip/hip_runtime.h>
int main(void) {
hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, 0);
printf("Device: %s\n", prop.name);
#ifdef hipfftVersionMinor
printf("hipFFT version: %d.%d.%d\n\n",
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
#endif
for (int G : {8, 16, 32}) {
int howmany = 512;
int n[] = {G};
long nelems = (long)G * howmany;
// Plan created BEFORE hipMalloc — succeeds for all G
hipfftHandle p;
size_t workSize = 0;
hipfftCreate(&p);
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
nullptr, 1, G, nullptr, 1, G,
HIPFFT_Z2Z, howmany, &workSize);
printf("G=%-4d plan-then-hipMalloc: %d (%s)\n",
G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "HIPFFT_PARSE_ERROR");
if (rv == HIPFFT_SUCCESS) {
hipfftDoubleComplex *buf = nullptr;
hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex));
hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex));
rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD);
hipDeviceSynchronize();
printf("G=%-4d execFwd: %d (%s)\n",
G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "FAILED");
hipFree(buf);
}
hipfftDestroy(p);
}
return 0;
}
+161
View File
@@ -0,0 +1,161 @@
/*
* Minimal reproducer for hipfftMakePlanMany / hipfftPlanMany failures.
*
* Compile on Frontier (no Grid headers needed):
* hipcc -o Test_hipfft_minimal Test_hipfft_minimal.cc -lhipfft
*
* Run:
* ./Test_hipfft_minimal
*/
#include <cstdio>
#include <cstdlib>
#include <hipfft/hipfft.h>
#include <hip/hip_runtime.h>
static const char *hipfftResultString(hipfftResult r) {
switch (r) {
case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS";
case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN";
case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED";
case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE";
case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE";
case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR";
case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED";
case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED";
case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE";
case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST";
case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE";
case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR";
case HIPFFT_NO_WORKSPACE: return "HIPFFT_NO_WORKSPACE";
case HIPFFT_NOT_IMPLEMENTED: return "HIPFFT_NOT_IMPLEMENTED";
case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED";
default: return "UNKNOWN";
}
}
// Plan creation + execution for (G, howmany).
// Tests two orderings to isolate whether a prior hipMalloc poisons hipfft
// plan creation for small G on ROCm 7:
// A) plan BEFORE hipMalloc — hypothesis: succeeds
// B) hipMalloc BEFORE plan — hypothesis: fails for G < 32
static void tryPlanAndExec(int G, long howmany) {
int n[] = {G};
long nelems = (long)G * howmany;
printf("--- G=%-4d howmany=%-10ld total_elems=%-12ld ---\n",
G, howmany, nelems);
// Allocate device buffer (hipfftDoubleComplex = 16 bytes each)
hipfftDoubleComplex *dbuf = nullptr;
hipError_t herr = hipMalloc(&dbuf, nelems * sizeof(hipfftDoubleComplex));
if (herr != hipSuccess) {
printf(" hipMalloc failed (%d) for %ld elems — skipping\n\n", (int)herr, nelems);
return;
}
hipMemset(dbuf, 0, nelems * sizeof(hipfftDoubleComplex));
// 1. hipfftPlanMany (one-step, nullptr embed) — current Grid path
{
hipfftHandle p;
hipfftResult rv = hipfftPlanMany(&p, 1, n,
nullptr, 1, G,
nullptr, 1, G,
HIPFFT_Z2Z, (int)howmany);
printf(" hipfftPlanMany create : %d (%s)\n", (int)rv, hipfftResultString(rv));
if (rv == HIPFFT_SUCCESS) {
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
hipDeviceSynchronize();
printf(" hipfftPlanMany execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
hipfftDestroy(p);
}
}
// 2. hipfftCreate + hipfftMakePlanMany (two-step) — also current Grid path
{
hipfftHandle p;
size_t workSize = 0;
hipfftResult rc = hipfftCreate(&p);
if (rc == HIPFFT_SUCCESS) {
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
nullptr, 1, G,
nullptr, 1, G,
HIPFFT_Z2Z, (int)howmany, &workSize);
printf(" hipfftMakePlanMany : %d (%s) workSize=%zu\n",
(int)rv, hipfftResultString(rv), workSize);
if (rv == HIPFFT_SUCCESS) {
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
hipDeviceSynchronize();
printf(" hipfftMakePlanMany exec : %d (%s)\n", (int)rv, hipfftResultString(rv));
}
hipfftDestroy(p);
} else {
printf(" hipfftCreate : %d (%s)\n", (int)rc, hipfftResultString(rc));
}
}
// 3. hipfftPlan1d (simplest API, batch = howmany)
{
hipfftHandle p;
hipfftResult rv = hipfftPlan1d(&p, G, HIPFFT_Z2Z, (int)howmany);
printf(" hipfftPlan1d create : %d (%s)\n", (int)rv, hipfftResultString(rv));
if (rv == HIPFFT_SUCCESS) {
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
hipDeviceSynchronize();
printf(" hipfftPlan1d execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
hipfftDestroy(p);
}
}
hipFree(dbuf);
printf("\n");
}
int main(void) {
// Print HIP device info
int device = 0;
hipGetDevice(&device);
hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, device);
printf("Device %d: %s warpSize=%d\n\n", device, prop.name, prop.warpSize);
#ifdef hipfftVersionMinor
printf("hipFFT version: %d.%d.%d\n\n",
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
#endif
// Original sweep with small howmany (these passed first time)
printf("=== Small howmany (original sweep) ===\n\n");
for (int G : {4, 8, 12, 16, 24, 32, 48, 64})
tryPlanAndExec(G, 512);
// Grid-realistic howmany values derived from actual lattice geometries.
// howmany = Ncomp * product(ldimensions[d] for d != dim)
// For LatticeComplexD: Ncomp=1.
printf("=== Grid-realistic parameters ===\n\n");
// --grid 16.16.16.16 4D FFT (KNOWN TO FAIL in Grid)
// Each dim: G=16, Nperp=16^3=4096
tryPlanAndExec(16, 4096);
// --grid 32.32.32.32 4D FFT (KNOWN TO SUCCEED in Grid)
// Each dim: G=32, Nperp=32^3=32768
tryPlanAndExec(32, 32768);
// --grid 32.32.32.32 Ls=8 5D DWF FFT (KNOWN TO FAIL on dim 0 in Grid)
// dim 0: G=8, Nperp=32^4=1048576
tryPlanAndExec(8, 1048576);
// dim 1-4: G=32, Nperp=8*32^3=262144
tryPlanAndExec(32, 262144);
// Extra intermediate cases to bracket the failure
tryPlanAndExec(16, 1024);
tryPlanAndExec(16, 2048);
tryPlanAndExec(16, 8192);
tryPlanAndExec(8, 4096);
tryPlanAndExec(8, 65536);
tryPlanAndExec(8, 262144);
return 0;
}
+168
View File
@@ -0,0 +1,168 @@
/*
* Reproducer for HIPFFT_PARSE_ERROR (error 12) from hipfftMakePlanMany on
* ROCm 7 / hipFFT 1.0.20 (Frontier, MI210 login and MI250X compute nodes).
*
* Observed failure: G < 32 returns HIPFFT_PARSE_ERROR from all three plan
* creation APIs (hipfftPlanMany, hipfftMakePlanMany, hipfftPlan1d) when a
* device buffer is allocated and zeroed with hipMalloc+hipMemset before the
* plan creation call. G >= 32 succeeds.
*
* Contrast with Test_hipfft_minimal.cc (plan-first ordering) which passes
* for all G even with an empty rocFFT cache.
*
* Compile on Frontier (no Grid headers needed):
* hipcc -o Test_hipfft_repro Test_hipfft_repro.cc -lhipfft
*
* Run with empty cache to reproduce the failure:
* rm -rf ~/.cache/rocfft
* ./Test_hipfft_repro
*/
#include <cstdio>
#include <cstdlib>
#include <hipfft/hipfft.h>
#include <hip/hip_runtime.h>
static const char *hipfftResultString(hipfftResult r) {
switch (r) {
case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS";
case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN";
case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED";
case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE";
case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE";
case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR";
case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED";
case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED";
case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE";
case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA";
case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST";
case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE";
case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR";
case HIPFFT_NO_WORKSPACE: return "HIPFFT_NO_WORKSPACE";
case HIPFFT_NOT_IMPLEMENTED: return "HIPFFT_NOT_IMPLEMENTED";
case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED";
default: return "UNKNOWN";
}
}
// Plan creation + execution for (G, howmany) using hipfftCreate+hipfftMakePlanMany.
// This is the path Grid's FFT.h now uses.
static void tryPlanAndExec(int G, long howmany) {
int n[] = {G};
long nelems = (long)G * howmany;
printf("--- G=%-4d howmany=%-10ld total_elems=%-12ld ---\n",
G, howmany, nelems);
// Allocate device buffer (hipfftDoubleComplex = 16 bytes each)
hipfftDoubleComplex *dbuf = nullptr;
hipError_t herr = hipMalloc(&dbuf, nelems * sizeof(hipfftDoubleComplex));
if (herr != hipSuccess) {
printf(" hipMalloc failed (%d) for %ld elems — skipping\n\n", (int)herr, nelems);
return;
}
hipMemset(dbuf, 0, nelems * sizeof(hipfftDoubleComplex));
// 1. hipfftPlanMany (one-step, nullptr embed) — current Grid path
{
hipfftHandle p;
hipfftResult rv = hipfftPlanMany(&p, 1, n,
nullptr, 1, G,
nullptr, 1, G,
HIPFFT_Z2Z, (int)howmany);
printf(" hipfftPlanMany create : %d (%s)\n", (int)rv, hipfftResultString(rv));
if (rv == HIPFFT_SUCCESS) {
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
hipDeviceSynchronize();
printf(" hipfftPlanMany execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
hipfftDestroy(p);
}
}
// 2. hipfftCreate + hipfftMakePlanMany (two-step) — also current Grid path
{
hipfftHandle p;
size_t workSize = 0;
hipfftResult rc = hipfftCreate(&p);
if (rc == HIPFFT_SUCCESS) {
hipfftResult rv = hipfftMakePlanMany(p, 1, n,
nullptr, 1, G,
nullptr, 1, G,
HIPFFT_Z2Z, (int)howmany, &workSize);
printf(" hipfftMakePlanMany : %d (%s) workSize=%zu\n",
(int)rv, hipfftResultString(rv), workSize);
if (rv == HIPFFT_SUCCESS) {
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
hipDeviceSynchronize();
printf(" hipfftMakePlanMany exec : %d (%s)\n", (int)rv, hipfftResultString(rv));
}
hipfftDestroy(p);
} else {
printf(" hipfftCreate : %d (%s)\n", (int)rc, hipfftResultString(rc));
}
}
// 3. hipfftPlan1d (simplest API, batch = howmany)
{
hipfftHandle p;
hipfftResult rv = hipfftPlan1d(&p, G, HIPFFT_Z2Z, (int)howmany);
printf(" hipfftPlan1d create : %d (%s)\n", (int)rv, hipfftResultString(rv));
if (rv == HIPFFT_SUCCESS) {
rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD);
hipDeviceSynchronize();
printf(" hipfftPlan1d execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv));
hipfftDestroy(p);
}
}
hipFree(dbuf);
printf("\n");
}
int main(void) {
// Print HIP device info
int device = 0;
hipGetDevice(&device);
hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, device);
printf("Device %d: %s warpSize=%d\n\n", device, prop.name, prop.warpSize);
#ifdef hipfftVersionMinor
printf("hipFFT version: %d.%d.%d\n\n",
hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch);
#endif
// Original sweep with small howmany (these passed first time)
printf("=== Small howmany (original sweep) ===\n\n");
for (int G : {4, 8, 12, 16, 24, 32, 48, 64})
tryPlanAndExec(G, 512);
// Grid-realistic howmany values derived from actual lattice geometries.
// howmany = Ncomp * product(ldimensions[d] for d != dim)
// For LatticeComplexD: Ncomp=1.
printf("=== Grid-realistic parameters ===\n\n");
// --grid 16.16.16.16 4D FFT (KNOWN TO FAIL in Grid)
// Each dim: G=16, Nperp=16^3=4096
tryPlanAndExec(16, 4096);
// --grid 32.32.32.32 4D FFT (KNOWN TO SUCCEED in Grid)
// Each dim: G=32, Nperp=32^3=32768
tryPlanAndExec(32, 32768);
// --grid 32.32.32.32 Ls=8 5D DWF FFT (KNOWN TO FAIL on dim 0 in Grid)
// dim 0: G=8, Nperp=32^4=1048576
tryPlanAndExec(8, 1048576);
// dim 1-4: G=32, Nperp=8*32^3=262144
tryPlanAndExec(32, 262144);
// Extra intermediate cases to bracket the failure
tryPlanAndExec(16, 1024);
tryPlanAndExec(16, 2048);
tryPlanAndExec(16, 8192);
tryPlanAndExec(8, 4096);
tryPlanAndExec(8, 65536);
tryPlanAndExec(8, 262144);
return 0;
}
-362
View File
@@ -1,362 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: tests/debug/Test_staggered_hdcg.cc
Authors: Thomas Blum, Peter Boyle
HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
for naive staggered fermions, based on arXiv:2409.03904.
Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
- NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
- 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite)
- vColourVector fine field type (staggered has colour but no spin)
- NextToNearestStencilGeometry4D: 33-point coarse stencil
Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse
shifts have L1-distance <= 2, giving 33 stencil points in 4D:
1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
To extend to ImprovedStaggeredFermion later, use B >= 6.
Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
Usage (after build):
./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
*************************************************************************************/
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
using namespace Grid;
// Non-converging CG used as a smoother (fixed number of iterations)
template<class Field>
class CGSmoother : public LinearFunction<Field>
{
public:
typedef LinearOperatorBase<Field> FineOperator;
FineOperator &_Op;
int iters;
CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
void operator()(const Field &in, Field &out)
{
ConjugateGradient<Field> CG(0.0, iters, false);
out = Zero();
CG(_Op, in, out);
}
};
int main(int argc, char **argv)
{
fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
Grid_init(&argc, &argv);
fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
//--------------------------------------------------------------------
// Parameters — tune for production
//--------------------------------------------------------------------
const int nbasis = 24; // near-null space dimension
const int cb = 0; // even checkerboard
RealD mass = 0.05;
// NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
// c1 = coefficient of the hopping term (1.0 = standard normalisation).
// u0 = tadpole factor (1.0 = no tadpole improvement).
RealD c1 = 1.0;
RealD u0 = 1.0;
//--------------------------------------------------------------------
// Grids
// Fine: UGrid (4D full), UrbGrid (4D red-black)
// Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block
//
// Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
// NaiveStaggeredFermion works with any Block >= {2,2,2,2}
//--------------------------------------------------------------------
fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
Coordinate Block({4, 4, 4, 4});
Coordinate clatt = GridDefaultLatt();
for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
Coordinate cmpi = GridDefaultMpi();
fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
clatt[0],clatt[1],clatt[2],clatt[3],
csimd[0],csimd[1],csimd[2],csimd[3],
cmpi[0],cmpi[1],cmpi[2],cmpi[3],
(int)vComplex::Nsimd()); fflush(stderr);
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
//--------------------------------------------------------------------
// RNG + gauge field
//--------------------------------------------------------------------
fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4});
fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
LatticeGaugeField Umu(UGrid);
fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
SU<Nc>::HotConfiguration(RNG4, Umu);
fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
//--------------------------------------------------------------------
// Subspace: inverse-iteration near-null vectors
//
// CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
// converging naturally to the low modes of HermOp without needing spectral
// bound tuning. Switch to CreateSubspaceChebyshevNew once the spectrum is
// well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
//--------------------------------------------------------------------
typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
Subspace Aggregates(Coarse4d, UrbGrid, cb);
Aggregates.CreateSubspace(RNGrb, HermOp);
Aggregates.Orthogonalise();
//--------------------------------------------------------------------
// Coarse geometry: NextToNearestStencilGeometry4D
// hops=2 -> 33 stencil points in 4D
//--------------------------------------------------------------------
NextToNearestStencilGeometry4D geom(Coarse4d);
std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
//--------------------------------------------------------------------
// Single-RHS coarse operator (used for correctness check below)
//--------------------------------------------------------------------
typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
typedef LittleDiracOp::CoarseVector CoarseVector;
LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
LDO.CoarsenOperator(HermOp, Aggregates);
//--------------------------------------------------------------------
// Correctness check: P M_fine P^T c ≈ M_coarse c
//
// Promote a random coarse vector into the fine subspace, apply the
// fine operator, project back, and compare with the coarse operator
// applied directly. Error should be at the level of subspace
// approximation quality (smaller = better basis vectors).
//--------------------------------------------------------------------
{
GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
random(RNGc, c_src);
LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
Aggregates.PromoteFromSubspace(c_src, f_v);
HermOp.Op(f_v, f_Mv);
Aggregates.ProjectToSubspace(c_proj, f_Mv);
LDO.M(c_src, c_ldop);
c_proj -= c_ldop;
RealD err = norm2(c_proj) / norm2(c_ldop);
std::cout << GridLogMessage
<< "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
}
//--------------------------------------------------------------------
// Multi-RHS coarse grid
//
// The extra leading dimension holds nrhs right-hand sides packed into
// SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
//--------------------------------------------------------------------
const int nrhs = vComplex::Nsimd() * 2;
Coordinate mpi = GridDefaultMpi();
Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]});
Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
MultiCoarseOp mrhs(geom, CoarseMrhs);
mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
//--------------------------------------------------------------------
// Coarse-grid Lanczos for deflation
//--------------------------------------------------------------------
typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
MrhsHermOp MrhsCoarseOp(mrhs);
// Estimate spectral bounds for Lanczos Chebyshev filter
CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
PowerMethod<CoarseVector> cPM;
RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
// Chebyshev filter window [lo, hi]:
// lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
// coarse eigenvalues so that only the target modes receive cosh amplification.
//
// From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
// Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746] ~= mass^2
// Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
// Group 2 (second group): lambda in [0.165, 0.179]
//
// lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
// factor 8x below lambda_24=0.165).
// hi = lambda_max_coarse * 1.1 ~= 2.121
// y(lambda_0=0.002647) ~ -1.016 -> T_70 ~ 1.7e5 (cosh(70*0.182))
// y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
// Relative spread across near-null cluster: ~4.3%
// y(lambda_24=0.165) ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
//
// order=71 (degree 70) is needed to give ~4% relative spread across the
// near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
// gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
// Absolute amplification ~1e5; what matters for IRL convergence is the
// relative spread, not the absolute value.
// lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
// lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
RealD lambda_lo = 0.02;
std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
<< " hi=" << lambda_max*1.1 << " order=71" << std::endl;
Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
// 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
// in the first Lanczos restart. The remaining modes (~0.165) are a
// second spectral group that needs more Krylov vectors; handle them
// separately once the basic HDCG solve is validated.
int Nk = 24;
int Nm = 48;
int Nstop = Nk;
GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
int Nconv;
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm, Coarse4d); // evec on f_grid (single-RHS coarse)
std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
//--------------------------------------------------------------------
// HDCG solver assembly
//--------------------------------------------------------------------
MultiRHSDeflation<CoarseVector> MrhsGuesser;
MrhsGuesser.ImportEigenBasis(evec, eval);
// MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
MrhsProjector.ImportBasis(Aggregates.subspace);
ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
DoNothingGuesser<CoarseVector> DoNothing;
HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
// Spectral radius of the fine operator, needed for the smoother shift.
// Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
LatticeStaggeredFermionD fine_pm_src(UrbGrid);
random(RNGrb, fine_pm_src);
PowerMethod<LatticeStaggeredFermionD> finePM;
RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
// Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100.
//
// The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
// interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial
// focuses entirely on the HIGH-frequency part of the spectrum and leaves
// near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
//
// This is the right target because the coarse-grid correction always introduces
// high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
// are piecewise constant across coarse cells and therefore have sharp edges at
// cell boundaries (like lego-block edges). Smoothness is measured by the
// covariant Dirac derivative, so promoting the coarse solution back to the
// fine grid inevitably excites high-frequency components — just as a step
// function always carries high-frequency Fourier content. The smoother must
// repair exactly these high modes.
//
// The smoother and the coarse-grid correction are applied alternately: together
// they both lift the low eigenvalues and pull down the upper eigenvalues of the
// composite preconditioned operator, reducing the condition number seen by the
// outer HDCG iterations.
//
// DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
// the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
RealD smootherShift = fine_lambda_max / 200.0;
std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
<< fine_lambda_max << "/200 = " << smootherShift << std::endl;
ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
HDCG(1.0e-8, 500,
HermOp,
smoother,
HPDSolve, // M1 (coarse correction)
HPDSolve, // Vstart (initial guess projection)
MrhsProjector,
MrhsGuesser,
CoarseMrhs);
//--------------------------------------------------------------------
// Solve: nrhs right-hand sides simultaneously
//--------------------------------------------------------------------
std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
for (int r = 0; r < nrhs; r++) {
random(RNGrb2, src[r]);
sol[r] = Zero();
}
//--------------------------------------------------------------------
// Baseline: standard single-RHS CG on HermOp (no preconditioning)
// Run before HDCG to establish the unpreconditioned iteration count
// and wall-clock time for direct comparison.
//--------------------------------------------------------------------
{
ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 50000, false);
std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
RealD t0 = usecond();
int total_iters = 0;
for (int r = 0; r < nrhs; r++) {
std::cout << GridLogMessage << "====== CG baseline RHS " << r
<< " ======" << std::endl;
CG(HermOp, src[r], cg_sol[r]);
total_iters += CG.IterationsToComplete;
}
RealD t1 = usecond();
std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
<< total_iters << " total iterations, "
<< (t1 - t0) / 1.0e6 << " s total, "
<< (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
}
//--------------------------------------------------------------------
// HDCG solve
//--------------------------------------------------------------------
HDCG(src, sol);
Grid_finalize();
return 0;
}
-373
View File
@@ -1,373 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: tests/debug/Test_staggered_hdcg.cc
Authors: Thomas Blum, Peter Boyle
HDCG (Hierarchical Deflation Conjugate Gradient) multigrid solver
for naive staggered fermions, based on arXiv:2409.03904.
Adapts the DWF HDCG infrastructure (Test_general_coarse_hdcg_phys48.cc) to:
- NaiveStaggeredFermion (nearest-neighbour only, no Naik 3-hop term)
- 4D SchurStaggeredOperator: Mpc = m^2 - D_oe * D_eo (hermitian, positive-definite)
- vColourVector fine field type (staggered has colour but no spin)
- NextToNearestStencilGeometry4D: 33-point coarse stencil
Stencil count: D_oe*D_eo has 2-hop fine range. With blocking B >= 2 the coarse
shifts have L1-distance <= 2, giving 33 stencil points in 4D:
1 (identity) + 8 (+-e_mu) + 24 (+-e_mu +- e_nu).
NaiveStaggeredFermion has no Naik term, so any B >= 2 suffices.
To extend to ImprovedStaggeredFermion later, use B >= 6.
Reference: arXiv:2409.03904 (mrhs hermitian multigrid for DWF).
Usage (after build):
./Test_staggered_hdcg --grid 16.16.16.16 --mpi 1.1.1.1
*************************************************************************************/
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
using namespace Grid;
// Non-converging CG used as a smoother (fixed number of iterations)
template<class Field>
class CGSmoother : public LinearFunction<Field>
{
public:
typedef LinearOperatorBase<Field> FineOperator;
FineOperator &_Op;
int iters;
CGSmoother(int _iters, FineOperator &Op) : _Op(Op), iters(_iters) {}
void operator()(const Field &in, Field &out)
{
ConjugateGradient<Field> CG(0.0, iters, false);
out = Zero();
CG(_Op, in, out);
}
};
int main(int argc, char **argv)
{
fprintf(stderr, "TRACE: entering main\n"); fflush(stderr);
Grid_init(&argc, &argv);
fprintf(stderr, "TRACE: Grid_init done\n"); fflush(stderr);
//--------------------------------------------------------------------
// Parameters — tune for production
//--------------------------------------------------------------------
const int nbasis = 24; // near-null space dimension
const int cb = 0; // even checkerboard
RealD mass = 0.00184;
// NaiveStaggeredFermion: nearest-neighbour hop only (no Naik term).
// c1 = coefficient of the hopping term (1.0 = standard normalisation).
// u0 = tadpole factor (1.0 = no tadpole improvement).
RealD c1 = 1.0;
RealD u0 = 1.0;
//--------------------------------------------------------------------
// Grids
// Fine: UGrid (4D full), UrbGrid (4D red-black)
// Coarse: Coarse4d with dimensions = GridDefaultLatt() / Block
//
// Recommended: GridDefaultLatt() >= 16^4, Block = {4,4,4,4}
// NaiveStaggeredFermion works with any Block >= {2,2,2,2}
//--------------------------------------------------------------------
fprintf(stderr, "TRACE: making UGrid\n"); fflush(stderr);
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
fprintf(stderr, "TRACE: making UrbGrid\n"); fflush(stderr);
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
Coordinate Block({4, 4, 4, 4});
Coordinate clatt = GridDefaultLatt();
for (int d = 0; d < clatt.size(); d++) clatt[d] /= Block[d];
Coordinate csimd = GridDefaultSimd(Nd, vComplex::Nsimd());
Coordinate cmpi = GridDefaultMpi();
fprintf(stderr, "TRACE: making Coarse4d clatt=%d %d %d %d simd=%d %d %d %d mpi=%d %d %d %d Nsimd=%d\n",
clatt[0],clatt[1],clatt[2],clatt[3],
csimd[0],csimd[1],csimd[2],csimd[3],
cmpi[0],cmpi[1],cmpi[2],cmpi[3],
(int)vComplex::Nsimd()); fflush(stderr);
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, csimd, cmpi);
fprintf(stderr, "TRACE: Coarse4d made\n"); fflush(stderr);
//--------------------------------------------------------------------
// RNG + gauge field
//--------------------------------------------------------------------
fprintf(stderr, "TRACE: RNG4\n"); fflush(stderr);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1,2,3,4});
fprintf(stderr, "TRACE: RNGrb\n"); fflush(stderr);
GridParallelRNG RNGrb(UGrid); RNGrb.SeedFixedIntegers({5,6,7,8}); // must use full grid, not UrbGrid
fprintf(stderr, "TRACE: Umu\n"); fflush(stderr);
LatticeGaugeField Umu(UGrid);
int HotStart = 0;
if ( HotStart ) {
fprintf(stderr, "TRACE: HotConfig\n"); fflush(stderr);
SU<Nc>::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
std::string file("./configuration.ildg");
IldgReader IR;
IR.open(file);
IR.readConfiguration(Umu,header);
IR.close();
}
fprintf(stderr, "TRACE: NaiveStaggeredFermionD\n"); fflush(stderr);
NaiveStaggeredFermionD Ds(Umu, *UGrid, *UrbGrid, mass, c1, u0);
fprintf(stderr, "TRACE: SchurStaggeredOperator\n"); fflush(stderr);
SchurStaggeredOperator<NaiveStaggeredFermionD, LatticeStaggeredFermionD> HermOp(Ds);
fprintf(stderr, "TRACE: HermOp done\n"); fflush(stderr);
//--------------------------------------------------------------------
// Subspace: inverse-iteration near-null vectors
//
// CreateSubspace applies CG (4 solves, tol=1e-4) to random noise vectors,
// converging naturally to the low modes of HermOp without needing spectral
// bound tuning. Switch to CreateSubspaceChebyshevNew once the spectrum is
// well characterised (hi ~ 5.0 for naive staggered SchurStaggeredOperator).
//--------------------------------------------------------------------
typedef Aggregation<vColourVector, vTComplex, nbasis> Subspace;
Subspace Aggregates(Coarse4d, UrbGrid, cb);
Aggregates.CreateSubspace(RNGrb, HermOp);
Aggregates.Orthogonalise();
//--------------------------------------------------------------------
// Coarse geometry: NextToNearestStencilGeometry4D
// hops=2 -> 33 stencil points in 4D
//--------------------------------------------------------------------
NextToNearestStencilGeometry4D geom(Coarse4d);
std::cout << GridLogMessage << "Coarse stencil: " << geom.npoint << " points" << std::endl;
//--------------------------------------------------------------------
// Single-RHS coarse operator (used for correctness check below)
//--------------------------------------------------------------------
typedef GeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> LittleDiracOp;
typedef LittleDiracOp::CoarseVector CoarseVector;
LittleDiracOp LDO(geom, UrbGrid, Coarse4d);
LDO.CoarsenOperator(HermOp, Aggregates);
//--------------------------------------------------------------------
// Correctness check: P M_fine P^T c ≈ M_coarse c
//
// Promote a random coarse vector into the fine subspace, apply the
// fine operator, project back, and compare with the coarse operator
// applied directly. Error should be at the level of subspace
// approximation quality (smaller = better basis vectors).
//--------------------------------------------------------------------
{
GridParallelRNG RNGc(Coarse4d); RNGc.SeedFixedIntegers({9,10,11,12});
CoarseVector c_src(Coarse4d), c_ldop(Coarse4d), c_proj(Coarse4d);
random(RNGc, c_src);
LatticeStaggeredFermionD f_v(UrbGrid), f_Mv(UrbGrid);
Aggregates.PromoteFromSubspace(c_src, f_v);
HermOp.Op(f_v, f_Mv);
Aggregates.ProjectToSubspace(c_proj, f_Mv);
LDO.M(c_src, c_ldop);
c_proj -= c_ldop;
RealD err = norm2(c_proj) / norm2(c_ldop);
std::cout << GridLogMessage
<< "Coarsen check |P*M_fine - M_coarse| / |M_coarse| = " << err << std::endl;
}
//--------------------------------------------------------------------
// Multi-RHS coarse grid
//
// The extra leading dimension holds nrhs right-hand sides packed into
// SIMD lanes, matching the pattern of Test_general_coarse_hdcg_phys48.
//--------------------------------------------------------------------
const int nrhs = vComplex::Nsimd() * 2;
Coordinate mpi = GridDefaultMpi();
Coordinate rhMpi ({1, mpi[0], mpi[1], mpi[2], mpi[3]});
Coordinate rhLatt({nrhs, clatt[0], clatt[1], clatt[2], clatt[3]});
Coordinate rhSimd({vComplex::Nsimd(), 1, 1, 1, 1});
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt, rhSimd, rhMpi);
typedef MultiGeneralCoarsenedMatrix<vColourVector, vTComplex, nbasis> MultiCoarseOp;
MultiCoarseOp mrhs(geom, CoarseMrhs);
mrhs.CoarsenOperator(HermOp, Aggregates, Coarse4d);
//--------------------------------------------------------------------
// Coarse-grid Lanczos for deflation
//--------------------------------------------------------------------
typedef HermitianLinearOperator<MultiCoarseOp, CoarseVector> MrhsHermOp;
MrhsHermOp MrhsCoarseOp(mrhs);
// Estimate spectral bounds for Lanczos Chebyshev filter
CoarseVector pm_src(CoarseMrhs); pm_src = ComplexD(1.0);
PowerMethod<CoarseVector> cPM;
RealD lambda_max = cPM(MrhsCoarseOp, pm_src);
// Chebyshev filter window [lo, hi]:
// lo must sit in the spectral gap between the Nstop-th and (Nstop+1)-th
// coarse eigenvalues so that only the target modes receive cosh amplification.
//
// From a pilot run (16^4 fine, 4^4 coarse, mass=0.05, hot config):
// Group 1 (near-null, 24 modes): lambda in [0.002647, 0.002746] ~= mass^2
// Spectral gap: factor 60 (lambda_24/lambda_23 = 0.165/0.00275)
// Group 2 (second group): lambda in [0.165, 0.179]
//
// lo = 0.02 sits in the spectral gap (factor 7x above lambda_23=0.00275,
// factor 8x below lambda_24=0.165).
// hi = lambda_max_coarse * 1.1 ~= 2.121
// y(lambda_0=0.002647) ~ -1.016 -> T_70 ~ 1.7e5 (cosh(70*0.182))
// y(lambda_23=0.002746) ~ -1.015 -> T_70 ~ 1.6e5
// Relative spread across near-null cluster: ~4.3%
// y(lambda_24=0.165) ~ -0.862 -> inside [lo,hi] -> |T_70| <= 1
//
// order=71 (degree 70) is needed to give ~4% relative spread across the
// near-null cluster of 24 nearly-degenerate eigenvalues; order=31 (tried)
// gave only ~1.7% spread, insufficient for Nk=24/Nm=48 to converge.
// Absolute amplification ~1e5; what matters for IRL convergence is the
// relative spread, not the absolute value.
// lo=0.005 failed (T_70~53, 0/24 modes in 10 restarts).
// lo=0.01 worked but needed 2 restarts (13/24 then 24/24); lo=0.02 converges in 1.
RealD lambda_lo = 0.02;
std::cout << GridLogMessage << "Chebyshev filter: lo=" << lambda_lo
<< " hi=" << lambda_max*1.1 << " order=71" << std::endl;
Chebyshev<CoarseVector> IRLCheby(lambda_lo, lambda_max * 1.1, 71);
// 24 near-null modes (eigenvalues ~mass^2) converge to resid^2~1e-28
// in the first Lanczos restart. The remaining modes (~0.165) are a
// second spectral group that needs more Krylov vectors; handle them
// separately once the basic HDCG solve is validated.
int Nk = 24;
int Nm = 48;
int Nstop = Nk;
GridParallelRNG CRNG(Coarse4d); CRNG.SeedFixedIntegers({13,14,15,16});
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector>
IRL(MrhsCoarseOp, Coarse4d, CoarseMrhs, nrhs, IRLCheby,
Nstop, /*conv_test_interval*/1, nrhs, Nk, Nm, 1.0e-5, 10);
int Nconv;
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm, Coarse4d); // evec on f_grid (single-RHS coarse)
std::vector<CoarseVector> c_srcs(nrhs, Coarse4d); // src on same grid as evec
for (int r = 0; r < nrhs; r++) random(CRNG, c_srcs[r]);
IRL.calc(eval, evec, c_srcs, Nconv, LanczosType::irbl);
//--------------------------------------------------------------------
// HDCG solver assembly
//--------------------------------------------------------------------
MultiRHSDeflation<CoarseVector> MrhsGuesser;
MrhsGuesser.ImportEigenBasis(evec, eval);
// MrhsProjector maps between fine (UrbGrid) and coarse (Coarse4d) spaces
MultiRHSBlockProject<LatticeStaggeredFermionD> MrhsProjector;
MrhsProjector.Allocate(nbasis, UrbGrid, Coarse4d);
MrhsProjector.ImportBasis(Aggregates.subspace);
ConjugateGradient<CoarseVector> CoarseCG(5.0e-2, 5000, false);
DoNothingGuesser<CoarseVector> DoNothing;
HPDSolver<CoarseVector> HPDSolve(MrhsCoarseOp, CoarseCG, DoNothing);
// Spectral radius of the fine operator, needed for the smoother shift.
// Use a random checkerboard vector (UrbGrid) as starting guess for PowerMethod.
LatticeStaggeredFermionD fine_pm_src(UrbGrid);
random(RNGrb, fine_pm_src);
PowerMethod<LatticeStaggeredFermionD> finePM;
RealD fine_lambda_max = finePM(HermOp, fine_pm_src);
// Shifted smoother: CG on (HermOp + shift*I) with shift = lambda_max / 100.
//
// The O(8) CG polynomial has 8 roots. With this shift all 8 roots lie in the
// interval [shift, lambda_max + shift] ~ [0.046, 4.65], so the polynomial
// focuses entirely on the HIGH-frequency part of the spectrum and leaves
// near-null modes (lambda << shift) essentially untouched (polynomial ~ 1 there).
//
// This is the right target because the coarse-grid correction always introduces
// high-frequency spectral leakage: the blocked coarse-grid degrees of freedom
// are piecewise constant across coarse cells and therefore have sharp edges at
// cell boundaries (like lego-block edges). Smoothness is measured by the
// covariant Dirac derivative, so promoting the coarse solution back to the
// fine grid inevitably excites high-frequency components — just as a step
// function always carries high-frequency Fourier content. The smoother must
// repair exactly these high modes.
//
// The smoother and the coarse-grid correction are applied alternately: together
// they both lift the low eigenvalues and pull down the upper eigenvalues of the
// composite preconditioned operator, reducing the condition number seen by the
// outer HDCG iterations.
//
// DWF HDCG convention; using mass^2 = 0.0025 was far too small: it scattered
// the 8 roots over [0.005, 4.6] and diluted their effect on the high modes.
RealD smootherShift = fine_lambda_max / 200.0;
std::cout << GridLogMessage << "Smoother shift: lambda_max_fine/200 = "
<< fine_lambda_max << "/200 = " << smootherShift << std::endl;
ShiftedHermOpLinearOperator<LatticeStaggeredFermionD> ShiftedOp(HermOp, smootherShift);
CGSmoother<LatticeStaggeredFermionD> smoother(8, ShiftedOp);
TwoLevelADEF2mrhs<LatticeStaggeredFermionD, CoarseVector>
HDCG(1.0e-8, 500,
HermOp,
smoother,
HPDSolve, // M1 (coarse correction)
HPDSolve, // Vstart (initial guess projection)
MrhsProjector,
MrhsGuesser,
CoarseMrhs);
//--------------------------------------------------------------------
// Solve: nrhs right-hand sides simultaneously
//--------------------------------------------------------------------
std::vector<LatticeStaggeredFermionD> src(nrhs, UrbGrid);
std::vector<LatticeStaggeredFermionD> sol(nrhs, UrbGrid);
GridParallelRNG RNGrb2(UGrid); RNGrb2.SeedFixedIntegers({17,18,19,20}); // must use full grid, not UrbGrid
for (int r = 0; r < nrhs; r++) {
random(RNGrb2, src[r]);
sol[r] = Zero();
}
//--------------------------------------------------------------------
// Baseline: standard single-RHS CG on HermOp (no preconditioning)
// Run before HDCG to establish the unpreconditioned iteration count
// and wall-clock time for direct comparison.
//--------------------------------------------------------------------
{
ConjugateGradient<LatticeStaggeredFermionD> CG(1.0e-8, 100000, false);
std::vector<LatticeStaggeredFermionD> cg_sol(nrhs, UrbGrid);
for (int r = 0; r < nrhs; r++) cg_sol[r] = Zero();
RealD t0 = usecond();
int total_iters = 0;
for (int r = 0; r < nrhs; r++) {
std::cout << GridLogMessage << "====== CG baseline RHS " << r
<< " ======" << std::endl;
CG(HermOp, src[r], cg_sol[r]);
total_iters += CG.IterationsToComplete;
}
RealD t1 = usecond();
std::cout << GridLogMessage << "CG baseline: " << nrhs << " RHS, "
<< total_iters << " total iterations, "
<< (t1 - t0) / 1.0e6 << " s total, "
<< (t1 - t0) / 1.0e6 / nrhs << " s/RHS" << std::endl;
}
//--------------------------------------------------------------------
// HDCG solve
//--------------------------------------------------------------------
HDCG(src, sol);
Grid_finalize();
return 0;
}