diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index da5eb90f..27388062 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-#define GRID_REDUCTION_TIMING
+#undef GRID_REDUCTION_TIMING
 
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
@@ -230,7 +230,7 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 #ifdef GRID_REDUCTION_TIMING
   t_d2h += usecond();
-  std::cout << GridLogMessage << "  sumD_gpu_small"
+  std::cout << GridLogDebug << "  sumD_gpu_small"
             << " sizeof(sobj)=" << sizeof(sobj)
             << " blocks=" << numBlocks << " threads=" << numThreads
             << " kernel+barrier=" << t_kernel << " us"
@@ -362,7 +362,7 @@ inline void sumD_gpu_reduce_words(const vobj *lat, Integer osites,
   acceleratorCopyFromDevice(buffer_v, &result, sizeof(result));
 #ifdef GRID_REDUCTION_TIMING
   t_d2h += usecond();
-  std::cout << GridLogMessage << " sumD_gpu_reduce_words R=" << R
+  std::cout << GridLogDebug << " sumD_gpu_reduce_words R=" << R
             << " base=" << base
             << " kernel=" << t_kernel << " D2H=" << t_d2h << " us" << std::endl;
 #endif
@@ -391,7 +391,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
   while (w      <  words) { sumD_gpu_reduce_words< 1>(lat, osites, ret_p, w); w +=  1; }
 #ifdef GRID_REDUCTION_TIMING
   t_large += usecond();
-  std::cout << GridLogMessage << "sumD_gpu_large"
+  std::cout << GridLogDebug << "sumD_gpu_large"
             << " sizeof(sobjD)=" << sizeof(sobjD)
             << " words=" << words << " total=" << t_large << " us" << std::endl;
 #endif
diff --git a/skills/mpi-heterogeneous.md b/skills/mpi-heterogeneous.md
index 9ad45523..e71a54c0 100644
--- a/skills/mpi-heterogeneous.md
+++ b/skills/mpi-heterogeneous.md
@@ -1,6 +1,6 @@
 ---
 name: mpi-heterogeneous
-description: Diagnose and work around MPI correctness bugs on heterogeneous (CPU+GPU) systems — device buffer aliasing in MPI_Sendrecv, AARCH64 PLT corruption from libfabric, topology-dependent allreduce hangs, and deterministic point-to-point reduction trees as a replacement for MPI_Allreduce.
+description: Diagnose and work around MPI correctness bugs on heterogeneous (CPU+GPU) systems — device buffer aliasing in MPI_Sendrecv, AARCH64 PLT corruption from libfabric, topology-dependent allreduce hangs, mixed-ABI HIP runtime from wrong GTL library (Frontier/ROCm), and deterministic point-to-point reduction trees as a replacement for MPI_Allreduce.
 user-invocable: true
 allowed-tools:
   - Read
@@ -110,6 +110,51 @@ void GlobalSumP2P(double *data, int count, MPI_Comm comm) {
 
 Grid reference: `USE_GRID_REDUCTION` macro in `Grid/communicator/Communicator_mpi3.cc`.
 
+## Bug Class 4: Mixed HIP ABI from Wrong GTL Library (Frontier / ROCm)
+
+**Symptom**: `HIPFFT_PARSE_ERROR` (error code 12) returned by `hipfftPlanMany` / `hipfftMakePlanMany` / `hipfftPlan1d` for FFT sizes G < 32, but G ≥ 32 succeeds. The failure only occurs with an empty rocFFT kernel cache (`~/.cache/rocfft`); a warm cache may mask it. Host-side operations and GPU kernels that do not invoke rocFFT JIT work correctly.
+
+**Root cause — mixed HIP ABI**: rocFFT uses JIT compilation (via `libamd_comgr`) for small transforms (G < 32); for G ≥ 32 it uses pre-compiled device code bundled in the library, so the JIT path is never exercised. When two HIP runtime versions are loaded in the same process — e.g. `libamdhip64.so.7` (ROCm 7) and `libamdhip64.so.6` (ROCm 6) — the rocFFT JIT cannot complete successfully.
+
+The hidden source of the old library is the Cray MPI GPU Transport Layer. On Frontier, `cray-mpich`'s `libmpi_gtl_hsa.so` may be compiled against `libamdhip64.so.6` (ROCm 6 ABI) even when the loaded ROCm module is 7.0.2. Because `LD_LIBRARY_PATH` picks up the GTL directory before the ROCm 7 library directory, `libamdhip64.so.6` is pulled in first, and both ABI versions end up resident in the process.
+
+**Diagnosis**:
+```bash
+# Check which libamdhip64 versions are actually linked into your binary at runtime
+ldd --verbose ./your_binary 2>&1 | grep amdhip
+# Bad output — two different .so versions:
+#   libamdhip64.so.6 => /opt/rocm-6.4.2/lib/libamdhip64.so.6
+#   libamdhip64.so.7 => /opt/rocm-7.0.2/lib/libamdhip64.so.7
+# Good output — only one:
+#   libamdhip64.so.7 => /opt/rocm-7.0.2/lib/libamdhip64.so.7
+```
+
+If two versions appear, the problem is the GTL/LD_LIBRARY_PATH ordering.
+
+**Fix — correct module stack and LD_LIBRARY_PATH ordering (Frontier)**:
+```bash
+module load cce/21.0.0
+module load cpe/26.03
+module load rocm/7.0.2
+# Prepend CRAY_LD_LIBRARY_PATH so the ROCm-7-aware GTL is found first
+export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
+# Ensure ROCm 7 LLVM libs (needed by libamd_comgr JIT) are on the path
+export LD_LIBRARY_PATH=/opt/rocm-7.0.2/lib/llvm/lib/:$LD_LIBRARY_PATH
+```
+
+The critical step is prepending `CRAY_LD_LIBRARY_PATH`: this ensures the GTL library built against the ROCm 7 ABI is resolved before any older version that may appear further down `LD_LIBRARY_PATH`. Without this step, a stale symlink or directory ordering can silently load the wrong `libmpi_gtl_hsa.so`.
+
+**Reproducer**: `tests/debug/Test_hipfft_repro.cc` — standalone hipFFT test (no Grid headers) that sweeps G and howmany values matching realistic Grid lattice geometries. Compile with:
+```bash
+hipcc -o Test_hipfft_repro Test_hipfft_repro.cc -lhipfft
+rm -rf ~/.cache/rocfft   # empty cache required to trigger JIT path
+./Test_hipfft_repro
+```
+
+**Reference**: `systems/WorkArounds.txt`, Frontier section — GPU mapping, XPMEM, and `FI_MR_CACHE_MONITOR=disabled` settings for Frontier are documented there.
+
+**Systems affected**: Frontier (ORNL, MI250X). Likely applies to any Cray PE system where the loaded `cray-mpich` GTL was compiled against an older ROCm ABI than the runtime ROCm module. LumiG (CSC, MI250X) uses the same Cray PE and may exhibit the same issue.
+
 ## Compile-Time Guard Structure
 
 Recommended macro structure to switch between the workaround paths: