diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index da5eb90f..27388062 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { // Possibly promote to double and sum ///////////////////////////////////////////////////////////////////////////////////////////////////////// -#define GRID_REDUCTION_TIMING +#undef GRID_REDUCTION_TIMING template inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) @@ -230,7 +230,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); #ifdef GRID_REDUCTION_TIMING t_d2h += usecond(); - std::cout << GridLogMessage << " sumD_gpu_small" + std::cout << GridLogDebug << " sumD_gpu_small" << " sizeof(sobj)=" << sizeof(sobj) << " blocks=" << numBlocks << " threads=" << numThreads << " kernel+barrier=" << t_kernel << " us" @@ -362,7 +362,7 @@ inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites, acceleratorCopyFromDevice(buffer_v, &result, sizeof(result)); #ifdef GRID_REDUCTION_TIMING t_d2h += usecond(); - std::cout << GridLogMessage << " sumD_gpu_reduce_words R=" << R + std::cout << GridLogDebug << " sumD_gpu_reduce_words R=" << R << " base=" << base << " kernel=" << t_kernel << " D2H=" << t_d2h << " us" << std::endl; #endif @@ -391,7 +391,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi while (w < words) { sumD_gpu_reduce_words< 1>(lat, osites, ret_p, w); w += 1; } #ifdef GRID_REDUCTION_TIMING t_large += usecond(); - std::cout << GridLogMessage << "sumD_gpu_large" + std::cout << GridLogDebug << "sumD_gpu_large" << " sizeof(sobjD)=" << sizeof(sobjD) << " words=" << words << " total=" << t_large << " us" << std::endl; #endif diff --git a/skills/mpi-heterogeneous.md b/skills/mpi-heterogeneous.md index 9ad45523..e71a54c0 100644 --- a/skills/mpi-heterogeneous.md +++ b/skills/mpi-heterogeneous.md @@ -1,6 +1,6 @@ --- name: mpi-heterogeneous -description: Diagnose and work around MPI correctness bugs on heterogeneous (CPU+GPU) systems — device buffer aliasing in MPI_Sendrecv, AARCH64 PLT corruption from libfabric, topology-dependent allreduce hangs, and deterministic point-to-point reduction trees as a replacement for MPI_Allreduce. +description: Diagnose and work around MPI correctness bugs on heterogeneous (CPU+GPU) systems — device buffer aliasing in MPI_Sendrecv, AARCH64 PLT corruption from libfabric, topology-dependent allreduce hangs, mixed-ABI HIP runtime from wrong GTL library (Frontier/ROCm), and deterministic point-to-point reduction trees as a replacement for MPI_Allreduce. user-invocable: true allowed-tools: - Read @@ -110,6 +110,51 @@ void GlobalSumP2P(double *data, int count, MPI_Comm comm) { Grid reference: `USE_GRID_REDUCTION` macro in `Grid/communicator/Communicator_mpi3.cc`. +## Bug Class 4: Mixed HIP ABI from Wrong GTL Library (Frontier / ROCm) + +**Symptom**: `HIPFFT_PARSE_ERROR` (error code 12) returned by `hipfftPlanMany` / `hipfftMakePlanMany` / `hipfftPlan1d` for FFT sizes G < 32, but G ≥ 32 succeeds. The failure only occurs with an empty rocFFT kernel cache (`~/.cache/rocfft`); a warm cache may mask it. Host-side operations and GPU kernels that do not invoke rocFFT JIT work correctly. + +**Root cause — mixed HIP ABI**: rocFFT uses JIT compilation (via `libamd_comgr`) for small transforms (G < 32); for G ≥ 32 it uses pre-compiled device code bundled in the library, so the JIT path is never exercised. When two HIP runtime versions are loaded in the same process — e.g. `libamdhip64.so.7` (ROCm 7) and `libamdhip64.so.6` (ROCm 6) — the rocFFT JIT cannot complete successfully. + +The hidden source of the old library is the Cray MPI GPU Transport Layer. On Frontier, `cray-mpich`'s `libmpi_gtl_hsa.so` may be compiled against `libamdhip64.so.6` (ROCm 6 ABI) even when the loaded ROCm module is 7.0.2. Because `LD_LIBRARY_PATH` picks up the GTL directory before the ROCm 7 library directory, `libamdhip64.so.6` is pulled in first, and both ABI versions end up resident in the process. + +**Diagnosis**: +```bash +# Check which libamdhip64 versions are actually linked into your binary at runtime +ldd --verbose ./your_binary 2>&1 | grep amdhip +# Bad output — two different .so versions: +# libamdhip64.so.6 => /opt/rocm-6.4.2/lib/libamdhip64.so.6 +# libamdhip64.so.7 => /opt/rocm-7.0.2/lib/libamdhip64.so.7 +# Good output — only one: +# libamdhip64.so.7 => /opt/rocm-7.0.2/lib/libamdhip64.so.7 +``` + +If two versions appear, the problem is the GTL/LD_LIBRARY_PATH ordering. + +**Fix — correct module stack and LD_LIBRARY_PATH ordering (Frontier)**: +```bash +module load cce/21.0.0 +module load cpe/26.03 +module load rocm/7.0.2 +# Prepend CRAY_LD_LIBRARY_PATH so the ROCm-7-aware GTL is found first +export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH +# Ensure ROCm 7 LLVM libs (needed by libamd_comgr JIT) are on the path +export LD_LIBRARY_PATH=/opt/rocm-7.0.2/lib/llvm/lib/:$LD_LIBRARY_PATH +``` + +The critical step is prepending `CRAY_LD_LIBRARY_PATH`: this ensures the GTL library built against the ROCm 7 ABI is resolved before any older version that may appear further down `LD_LIBRARY_PATH`. Without this step, a stale symlink or directory ordering can silently load the wrong `libmpi_gtl_hsa.so`. + +**Reproducer**: `tests/debug/Test_hipfft_repro.cc` — standalone hipFFT test (no Grid headers) that sweeps G and howmany values matching realistic Grid lattice geometries. Compile with: +```bash +hipcc -o Test_hipfft_repro Test_hipfft_repro.cc -lhipfft +rm -rf ~/.cache/rocfft # empty cache required to trigger JIT path +./Test_hipfft_repro +``` + +**Reference**: `systems/WorkArounds.txt`, Frontier section — GPU mapping, XPMEM, and `FI_MR_CACHE_MONITOR=disabled` settings for Frontier are documented there. + +**Systems affected**: Frontier (ORNL, MI250X). Likely applies to any Cray PE system where the loaded `cray-mpich` GTL was compiled against an older ROCm ABI than the runtime ROCm module. LumiG (CSC, MI250X) uses the same Cray PE and may exhibit the same issue. + ## Compile-Time Guard Structure Recommended macro structure to switch between the workaround paths: