Remove accelerator_inline on CPU only code

Update booster compiule
Make disable accelerator aware mpi compile for CUDA
2026-06-18 18:03:44 +01:00 · 2026-06-17 20:47:15 +02:00 · 2026-06-17 20:46:53 +02:00 · 2026-06-17 20:46:14 +02:00 · 2026-06-17 20:45:32 +02:00 · 2026-06-17 20:45:04 +02:00
13 changed files with 317 additions and 22 deletions
@@ -63,12 +63,10 @@ void MemoryManager::PrintBytes(void)
  std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
  std::cout << " MemoryManager : "<<(total_host>>20)  <<" cpu         Mbytes "<<std::endl;
  uint64_t cacheBytes;
-  cacheBytes = CacheBytes[Cpu];
-  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
-  cacheBytes = CacheBytes[Acc];
-  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
-  cacheBytes = CacheBytes[Shared];
-  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
+  cacheBytes = HostCacheBytes();
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu alloc cache Mbytes "<<std::endl;
+  cacheBytes = DeviceCacheBytes();
+  std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc alloc cache Mbytes "<<std::endl;
  
 #ifdef GRID_CUDA
  cuda_mem();
@@ -289,7 +289,7 @@ public:
  ///////////////////////////////////////////
  // move constructor
  ///////////////////////////////////////////
-  Lattice(Lattice && r){ 
+  Lattice(Lattice && r) noexcept { 
    this->_grid = r.Grid();
    this->_odata      = r._odata;
    this->_odata_size = r._odata_size;
@@ -330,7 +330,7 @@ public:
  ///////////////////////////////////////////
  // Move assignment possible if same type
  ///////////////////////////////////////////
-  inline Lattice<vobj> & operator = (Lattice<vobj> && r){
+  inline Lattice<vobj> & operator = (Lattice<vobj> && r) noexcept {

    resize(0); // deletes if appropriate
    this->_grid       = r.Grid();
@@ -438,5 +438,11 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
  result = sumD_gpu_large(lat,osites);
  return result;
 }
+template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
+{
+  Word w;
+  bzero(&w,sizeof(w));
+  return w;
+}

 NAMESPACE_END(Grid);
@@ -1,7 +1,6 @@
 #pragma once

 #if defined(GRID_CUDA)
-
 #include <cub/cub.cuh>
 #define gpucub cub
 #define gpuError_t cudaError_t
@@ -57,8 +56,13 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  //copy offsets to device
  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
  
+#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
+  #define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
+#else
+  #define GRID_CUB_SUM_OP ::gpucub::Sum()
+#endif
  
-  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
+  gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
@@ -82,12 +86,14 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
  });
  
  //issue segmented reductions in computeStream
-  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
+  gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
  if (gpuErr!=gpuSuccess) {
    std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
    exit(EXIT_FAILURE);
  }

+#undef GRID_CUB_SUM_OP
+  
  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
  
  //sync after copy
@@ -51,8 +51,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #endif
 #ifdef __x86_64__
 #ifdef GRID_CUDA
-//accelerator_inline uint64_t __rdtsc(void) {  return 0; }
-//accelerator_inline uint64_t __rdpmc(int ) {  return 0; }
+accelerator_inline uint64_t __rdtsc(void) {  return 0; }
+accelerator_inline uint64_t __rdpmc(int ) {  return 0; }
 #else
 #include <x86intrin.h>
 #endif
@@ -93,8 +93,7 @@ inline uint64_t cyclecount(void){
 }
 #elif defined __x86_64__
 inline uint64_t cyclecount(void){ 
-  uint64_t ret = __rdtsc();
-  return (uint64_t)ret;
+  return (uint64_t)0;
 }
 #else

@@ -113,6 +113,14 @@ accelerator_inline RealD    adj(const RealD  & r){ return r; }
 accelerator_inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
 accelerator_inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }

+#if defined(GRID_CUDA) || defined(GRID_HIP)
+//Provide for convenience
+inline std::complex<double> conjugate(const std::complex<double>& r){ return(conj(r)); }
+inline std::complex<float>  conjugate(const std::complex<float>& r) { return(conj(r)); }
+inline std::complex<double> adj(const std::complex<double>& r)      { return(conj(r)); }
+inline std::complex<float>  adj(const std::complex<float>& r)       { return(conj(r)); }
+#endif
+
 accelerator_inline RealF real(const RealF  & r){ return r; }
 accelerator_inline RealD real(const RealD  & r){ return r; }
 accelerator_inline RealF real(const ComplexF  & r){ return r.real(); }
@@ -96,7 +96,9 @@ void     acceleratorInit(void);

 #ifdef GRID_CUDA

+NAMESPACE_END(Grid);
 #include <cuda.h>
+NAMESPACE_BEGIN(Grid);

 #ifdef __CUDA_ARCH__
 #define GRID_SIMT
@@ -8,7 +8,6 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
    --disable-gparity \
    --disable-fermion-reps \
    --with-lime=$LIME \
-    --enable-accelerator-cshift \
    --disable-unified \
    CXX=nvcc \
    LDFLAGS="-cudart shared " \
@@ -1,12 +1,12 @@
 DIR=`pwd`
-PREFIX=$HOME/DDHMC/Grid/systems/Prerequisites/install/
+
 ../../configure \
    --enable-comms=mpi \
    --enable-simd=GPU \
    --enable-shm=nvlink \
    --enable-gen-simd-width=64 \
-    --with-gmp=$PREFIX \
-    --with-mpfr=$PREFIX \
+    --with-gmp=$GMP \
+    --with-mpfr=$MPFR \
    --enable-accelerator=cuda \
    --disable-fermion-reps \
    --disable-unified \
@@ -1,4 +1,6 @@
-
 export CRAY_ACCEL_TARGET=nvidia80
+source /global/homes/p/pboyle/spack/share/spack/setup-env.sh
+export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
+export GMP=`spack find --paths gmp   | grep gmp  | cut -c 12-`

-module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4
+module load PrgEnv-gnu cpe-cuda cudatoolkit/12.0
@@ -3,7 +3,10 @@
 CXX=mpicxx ../../configure \
 	   --enable-simd=GEN \
 	   --enable-comms=mpi-auto \
-	   --enable-Sp=yes \
+	   --enable-Sp=no \
+	   --disable-fermion-reps \
+	   --disable-gparity \
+	   --with-fftw=$FFTW \
 	   --enable-unified=yes \
 	   --prefix /Users/peterboyle/QCD/vtk/Grid/install \
 	   --with-lime=$CLIME \
@@ -0,0 +1,11 @@
+source /Users/peterboyle/QCD//Spack/spack//share/spack/setup-env.sh
+
+export FFTW=`spack find --paths fftw    | grep ^fftw   | awk '{print $2}' `
+#export HDF5=`spack find --paths hdf5+cxx   | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
+export MPFR=`spack find --paths mpfr    | grep ^mpfr  | awk '{print $2}' `
+export OPENSSL=`spack find --paths openssl | grep openssl | awk  '{print $2}' `
+export GMP=`spack find --paths gmp      | grep ^gmp | awk '{print $2}' `
+
+export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
@@ -0,0 +1,261 @@
+/*************************************************************************************
+
+ Test_fft_memory.cc
+
+ Memory growth test for PlannedFFT on a spin-colour matrix (propagator) field.
+
+ The test creates a single PlannedFFT object (which allocates FFTW plans once),
+ then repeatedly applies FFT_all_dim to the same propagator 400 times.
+
+ If PlannedFFT is working correctly the RSS should remain flat after the first
+ iteration — no new plans, no new deviceVector allocations beyond the per-call
+ pencil buffer which is freed at the end of each FFT_dim_execute call.
+
+ Build exactly like any other Grid test, e.g.:
+   make Test_fft_memory
+ or compile manually:
+   $(CXX) $(CXXFLAGS) Test_fft_memory.cc -o Test_fft_memory $(LDFLAGS)
+
+*************************************************************************************/
+
+#include <Grid/Grid.h>
+using namespace Grid;
+
+// --------------------------------------------------------------------------
+// Helper: read RSS (resident set size) in kB from /proc/self/status.
+// Returns 0 on platforms where /proc is unavailable.
+// --------------------------------------------------------------------------
+static long getCPURSSKb()
+{
+  long rss = 0;
+  FILE *fp = fopen("/proc/self/status", "r");
+  if (!fp) return -1;
+  char line[256];
+  while (fgets(line, sizeof(line), fp)) {
+    if (strncmp(line, "VmRSS:", 6) == 0) {
+      sscanf(line + 6, "%ld", &rss);
+      break;
+    }
+  }
+  fclose(fp);
+  return rss;
+}
+
+static long getGPUUsedMb()
+{
+#if defined(GRID_CUDA)
+  size_t free_bytes  = 0;
+  size_t total_bytes = 0;
+  cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes);
+  if (err != cudaSuccess) return -1;
+  return (long)((total_bytes - free_bytes) / (1024 * 1024));
+ 
+#elif defined(GRID_HIP)
+  size_t free_bytes  = 0;
+  size_t total_bytes = 0;
+  hipError_t err = hipMemGetInfo(&free_bytes, &total_bytes);
+  if (err != hipSuccess) return -1;
+  return (long)((total_bytes - free_bytes) / (1024 * 1024));
+ 
+#else
+  return -1;   // CPU-only build: no GPU to query
+#endif
+}
+
+// ============================================================
+//  Convenience struct — one snapshot of both sides
+// ============================================================
+struct MemSnapshot {
+  long cpu_rss_kb;   // host RSS in kB  (-1 if unavailable)
+  long gpu_used_mb;  // device used in MB (-1 if no GPU)
+};
+ 
+static MemSnapshot takeSnapshot()
+{
+  MemSnapshot s;
+  s.cpu_rss_kb  = getCPURSSKb();
+  s.gpu_used_mb = getGPUUsedMb();
+  return s;
+}
+ 
+// ============================================================
+//  Pretty-print one row of the monitoring table
+// ============================================================
+static void printRow(int iter,
+                     const MemSnapshot &now,
+                     const MemSnapshot &prev)
+{
+  long cpu_delta = (now.cpu_rss_kb  >= 0 && prev.cpu_rss_kb  >= 0)
+                 ? now.cpu_rss_kb  - prev.cpu_rss_kb  : 0;
+  long gpu_delta = (now.gpu_used_mb >= 0 && prev.gpu_used_mb >= 0)
+                 ? now.gpu_used_mb - prev.gpu_used_mb : 0;
+ 
+  // Sign prefix so deltas are unambiguous
+  auto sign = [](long v) -> const char* { return v >= 0 ? "+" : ""; };
+ 
+  std::cout << GridLogMessage
+            << std::setw(6)  << iter
+            << "  CPU: " << std::setw(10) << now.cpu_rss_kb  << " kB"
+            << "  (" << sign(cpu_delta) << std::setw(7) << cpu_delta << " kB)"
+            << "  GPU: " << std::setw(7)  << now.gpu_used_mb << " MB"
+            << "  (" << sign(gpu_delta) << std::setw(5) << gpu_delta << " MB)"
+            << "\n";
+}
+
+// ============================================================
+
+int main(int argc, char **argv)
+{
+  Grid_init(&argc, &argv);
+
+  int threads = GridThread::GetThreads();
+  std::cout << GridLogMessage
+            << "Grid is setup to use " << threads << " threads" << std::endl;
+
+  // ------------------------------------------------------------------
+  // Grid setup — use whatever lattice/mpi/simd was passed on the CLI,
+  // e.g.  --grid 8.8.8.8 --mpi 1.1.1.1
+  // ------------------------------------------------------------------
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  GridCartesian GRID(latt_size, simd_layout, mpi_layout);
+
+  int vol = 1;
+  for (int d = 0; d < (int)latt_size.size(); d++) vol *= latt_size[d];
+
+  std::cout << GridLogMessage << "Lattice : ";
+  for (int d = 0; d < Nd; d++) std::cout << latt_size[d] << " ";
+  std::cout << std::endl;
+
+  // ------------------------------------------------------------------
+  // Propagator field: SpinColourMatrix = 12x12 complex, i.e.
+  // LatticePropagatorD (= Lattice<iSpinColourMatrix<vComplexD>>).
+  // This is the standard QCD quark propagator type.
+  // ------------------------------------------------------------------
+  LatticePropagatorD prop(&GRID);
+  
+  // ------------------------------------------------------------------
+  // Fill the propagator with a momentum-space plane wave,
+  // following the pattern from Test_fft.cc.
+  // We set each spin-colour component (a,b) to exp(i * sum_mu p_mu x_mu)
+  // with a fixed momentum p = (1,2,1,2).
+  // ------------------------------------------------------------------
+  Coordinate pvec({1, 2, 1, 2});
+
+  LatticeComplexD phase(&GRID);
+  LatticeComplexD coor(&GRID);
+  ComplexD ci(0.0, 1.0);
+
+  phase = Zero();
+  for (int mu = 0; mu < Nd; mu++) {
+    RealD TwoPiL = M_PI * 2.0 / latt_size[mu];
+    LatticeCoordinate(coor, mu);
+    phase = phase + (TwoPiL * pvec[mu]) * coor;
+  }
+  phase = exp(phase * ci);   // e^{i p.x}
+
+  // Broadcast the phase into every spin-colour matrix entry
+  prop = Zero();
+  prop = prop + phase;
+
+  std::cout << GridLogMessage
+            << "Propagator norm2 = " << norm2(prop) << std::endl;
+
+  // ------------------------------------------------------------------
+  // Baseline snapshot BEFORE PlannedFFT construction
+  // ------------------------------------------------------------------
+  MemSnapshot snap_before_plan = takeSnapshot();
+  std::cout << GridLogMessage
+            << "[mem] Before PlannedFFT construction"
+            << "  CPU: " << snap_before_plan.cpu_rss_kb  << " kB"
+            << "  GPU: " << snap_before_plan.gpu_used_mb << " MB"
+            << std::endl;
+ 
+  // ------------------------------------------------------------------
+  // Create the PlannedFFT — plans are allocated here ONCE for all
+  // dimensions and stored inside the object.
+  // ------------------------------------------------------------------
+  PlannedFFT<iSpinColourMatrix<vComplexD>> plannedFFT(&GRID);
+
+  // ------------------------------------------------------------------
+  // Snapshot AFTER plan construction — this is the true baseline
+  // for the loop, because cufftPlanMany itself grabs device memory.
+  // ------------------------------------------------------------------
+  MemSnapshot snap_after_plan = takeSnapshot();
+  std::cout << GridLogMessage
+            << "[mem] After  PlannedFFT construction"
+            << "  CPU: " << snap_after_plan.cpu_rss_kb  << " kB"
+            << "  GPU: " << snap_after_plan.gpu_used_mb << " MB"
+            << "  (plan overhead:"
+            << "  CPU +" << snap_after_plan.cpu_rss_kb - snap_before_plan.cpu_rss_kb << " kB"
+            << "  GPU +" << snap_after_plan.gpu_used_mb - snap_before_plan.gpu_used_mb << " MB)"
+            << std::endl;
+
+  MemoryManager::Print();
+  // ------------------------------------------------------------------
+  // 400-iteration loop.
+  // Each iteration computes the full 4d forward FFT of `prop`.
+  // We deliberately do NOT cache the result — we always start from
+  // the same `prop` so the FFT is recomputed identically each time.
+  // The point is to watch memory, not correctness.
+  // ------------------------------------------------------------------
+  const int Niter = 40;
+  const int Niter2 = 32;
+
+  // Print header for the memory table
+  std::cout << GridLogMessage
+              << "\n"
+              << std::setw(6)  << "iter"
+              << "  CPU: " << std::setw(10) << "RSS[kB]"
+              << "  (  delta  )"
+              << "  GPU: " << std::setw(7)  << "used[MB]"
+              << "  (delta)"
+              << "\n";
+ 
+  MemSnapshot snap_prev = snap_after_plan;
+
+  for (int i = 0; i < Niter; i++) {
+    std::vector<LatticePropagatorD> G;
+
+    for (int j = 0; j < Niter2; j++) {
+      LatticePropagatorD prop_fft(&GRID);
+
+      // Full 4d forward FFT using the pre-built plans
+      plannedFFT.FFT_all_dim(prop_fft, prop, FFT::forward);
+
+      G.push_back(prop_fft);
+    }
+ 
+    // cudaMemGetInfo reflects the state *after* any pooled frees have
+    // been committed, so this is accurate without an explicit sync —
+    // FFT_dim_execute already calls accelerator_barrier() internally.
+    MemSnapshot snap_now = takeSnapshot();
+    printRow(i, snap_now, snap_prev);
+    MemoryManager::Print();
+    snap_prev = snap_now;
+  }
+
+  // ------------------------------------------------------------------
+  // Summary
+  // ------------------------------------------------------------------
+  MemSnapshot snap_final = takeSnapshot();
+ 
+  long cpu_growth = snap_final.cpu_rss_kb  - snap_after_plan.cpu_rss_kb;
+  long gpu_growth = snap_final.gpu_used_mb - snap_after_plan.gpu_used_mb;
+ 
+  std::cout << GridLogMessage
+            << "\n==== Memory summary (baseline = after plan construction) ====\n"
+            << "  CPU RSS growth over " << Niter << " FFTs : "
+            << cpu_growth << " kB"
+            << (cpu_growth == 0 ? "  OK" : "  *** GROWING ***") << "\n"
+            << "  GPU used growth over " << Niter << " FFTs : "
+            << gpu_growth << " MB"
+            << (gpu_growth == 0 ? "  OK" : "  *** GROWING ***") << "\n"
+            << "  Note: first-call watermark from pool fill is expected and benign.\n"
+            << "        A leak shows as continuous growth beyond iter ~2-3.\n";
+ 
+  Grid_finalize();
+  return 0;
+}
Author	SHA1	Message	Date
Peter Boyle	f11ba18df2	Remove accelerator_inline on CPU only code	2026-06-17 20:47:15 +02:00
Peter Boyle	cf8587e401	Update booster compiule	2026-06-17 20:46:53 +02:00
Peter Boyle	7dd35ef749	Make disable accelerator aware mpi compile for CUDA	2026-06-17 20:46:14 +02:00
Peter Boyle	41e570ddce	Annoying old CPU perfmon code should be removed or deprecated as not worth maintaining	2026-06-17 20:45:32 +02:00
Peter Boyle	a452131b50	Print improvement	2026-06-17 20:45:04 +02:00
Peter Boyle	4e49ca55ab	Merge branch 'develop' of https://github.com/paboyle/Grid into develop	2026-06-16 11:20:12 -07:00
Peter Boyle	c3f4474401	Adding Mattia's memory leak test	2026-06-16 11:19:36 -07:00
Peter Boyle	3d3eff86f3	Modify move assignment operator to be noexcept Add noexcept specifier to move assignment operator.	2026-06-11 09:44:24 -04:00
Peter Boyle	fc9f154ac1	Modify Lattice move constructor to be noexcept Add noexcept specifier to move constructor for Lattice class.	2026-06-11 09:40:06 -04:00
Peter Boyle	4aa0bca4dc	Change sum operation to use gpucub mistake in PR from Chris Updated the sum operation definition for GPU reduction to use gpucub instead of cub.	2026-06-01 14:12:25 -04:00
Peter Boyle	905da6f083	Merge branch 'feature/reduction-reorganisation' into develop	2026-05-27 21:01:30 -04:00
Peter Boyle	86c7f29183	Config command update	2026-05-27 16:19:33 -04:00
Peter Boyle	b0c99f876e	Configure on mac update	2026-05-27 16:16:55 -04:00
Peter Boyle	bf5fcdc860	Ease of use for std::complex interchangable with thrust	2026-05-27 16:05:37 -04:00
Peter Boyle	b58a1508fa	Perlmutter cuda version update	2026-05-21 13:25:13 -07:00
Peter Boyle	4d527e81fa	Remove hip specific files	2026-05-21 12:34:30 -04:00
Peter Boyle	7803580aa6	Lattice_reduction_gpu: demote timing logs to Debug, disable by default skills/mpi-heterogeneous: add Bug Class 4 for Frontier GTL/libamdhip64 ABI mismatch Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	32654db366	Test_planned_fft: fix PlannedFFT template parameter to use ::vector_object Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	cd340cfab3	tests: add Test_planned_fft exercising PlannedFFT<vobj> Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	f32866b2ff	tests/fft: remove PlanDestroy calls (FFT handles plans per-call) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	1cd1dc091e	FFT: add FFTbase, PlannedFFT; factor FFT_dim_execute free function Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	0493656e86	debug: add Test_hipfft_repro — reproducer for hipFFT PARSE_ERROR on ROCm 7 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	66fd504c4d	tests/debug: add G=4 to hipfft fail reproducer Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	be4dd2b52f	tests/debug: test hipMemset variant before cache is populated Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	707d059766	tests/debug: extend hipfft fail reproducer with hipMemset and sync variants Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	f08c755ae6	FFT: use host stack buffer in PlanCreate, not deviceVector Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	dbbfdd4e4b	tests/debug: add minimal hipfft ordering bug fail/pass pair Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	f967fb40bf	tests/debug: test plan-before-malloc vs malloc-before-plan ordering Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	74e0f846cb	tests/debug: extend hipfft reproducer with Grid-realistic howmany and exec tests Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	303a4d26e5	tests/debug: add minimal hipfft plan-creation reproducer Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	119888653c	FFT HIP: use hipfftCreate+hipfftMakePlanMany instead of hipfftPlanMany	2026-05-21 12:34:30 -04:00
Peter Boyle	a9f42c08f9	FFT: pass nullptr for inembed/onembed in hipfftPlanMany to avoid HIPFFT_PARSE_ERROR	2026-05-21 12:34:30 -04:00
Peter Boyle	e79adc9d31	FFT: cache plans per vobj type across calls Plans are created lazily on the first FFT_dim call and reused for all subsequent calls on the same FFT object. PlanCreate<vobj>() can be called explicitly to pre-warm the cache. PlanDestroy() must be called before switching to a different vobj type; the destructor cleans up any live plans automatically. Update Test_fft.cc and Test_fftf.cc to call PlanDestroy() between the LatticeComplex and LatticeSpinMatrix sections that reuse the same FFT object. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	5a9056cd93	Accelerator: lower default accelerator_threads from 16 to 8 Benchmark_dwf_fp32 on MI250X GCD: 1.7 TF/s at nt=8, ~300 GF/s at nt=16. With Nsimd=8 (fp32, GEN_SIMD_WIDTH=64B), nt=8 gives exactly 64 threads = one full AMD wavefront. Higher values double register demand per block and hit a register-pressure cliff for stencil kernels. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	012c36ab5a	Accelerator: raise default accelerator_threads from 2 to 16	2026-05-21 12:34:30 -04:00
Peter Boyle	5c4574f9aa	skills: add gpu-memory-performance.md Documents the acceleratorThreads() default=2 trap, LambdaApply thread mapping, coalescedRead/Write idiom, when to use __global__ vs accelerator_for, and fused vs staged HBM access patterns. Includes observed MI250X numbers from LatticePropagatorD reduction (50 → 297 → 546 GB/s progression). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	a424775884	sumD_gpu_reduce_words: fuse pack+reduce into single packReduceKernel Replace the two-kernel pack+reduce sequence with a single fused kernel packReduceKernel<R> that reads R words of each vobj at offset 'base' and accumulates directly into iVector<iScalar<scalarD>,R>, eliminating the intermediate bundle buffer entirely. HBM access per word-group drops from 3x (pack-read + pack-write + reduce-read) to 1x. Thread count comes from getNumBlocksAndThreads (warpSize..256) rather than acceleratorThreads(), so occupancy is correct regardless of the --accelerator-threads setting. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	d6b1388741	Modified repack	2026-05-21 12:34:30 -04:00
Peter Boyle	796c6cae4e	Enable GRID_REDUCTION_TIMING unconditionally Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	1a8064d6d9	Lattice_reduction_gpu: add GRID_REDUCTION_TIMING instrumentation Uncomment #define GRID_REDUCTION_TIMING to enable per-phase timing output: sumD_gpu_reduce_words: pack time (accelerator_for) per R and base sumD_gpu_small: reduceKernel+barrier time and D2H time separately sumD_gpu_large: total wall time across all word groups This lets us identify whether the large-type bottleneck is in the pack kernel, the shared-memory reduction kernel, the barrier, or the D2H. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	43648924c3	sumD_gpu_large: radix-12 word-bundle reduction replacing radix-1 Replace the word-by-word loop (one kernel launch per scalar word) with sumD_gpu_reduce_words<R> which packs R consecutive vector_type words per site into iVector<iScalar<vector>,R>, then calls the existing sumD_gpu_small shared-memory kernel once for the whole bundle. Dispatch: radix-12 first, radix-4 for the remainder < 12, radix-1 for any final < 4 words. For LatticePropagator (144 words = 12x12), this reduces the kernel-launch count from 144 to 12 -- a 12x reduction. Bundle::Nsimd() inherits from vector_type so sumD_gpu_small handles SIMD lane extraction and double-precision promotion identically to the scalar word case. sizeof(Bundle::scalar_objectD) = R*16 <= 192 B; well within sharedMemPerBlock on all supported devices. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	bf2140e74d	Lattice_reduction_sycl: fix double-precision accumulation in sumD_gpu_tensor Accumulate in sobjD throughout rather than accumulating in sobj and converting the final sum. For float fields this matters: summing N floats then casting loses O(N*eps_float) relative precision vs accumulating in double from the start. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	a1119266c1	Revert to hand-rolled reduction; drop Lattice_reduction_gpu_cub.h Remove the CUB/hipCUB direction entirely. Restore Lattice_reduction_gpu.h, Lattice_reduction_sycl.h, and Lattice_reduction.h to the state before the CUB rewrite (commit `969b0a39`), recovering the original primary function names (sumD_gpu_small, sumD_gpu_large, sumD_gpu, sum_gpu, sum_gpu_large) and the hand-rolled shared-memory reduction kernel. Delete Lattice_reduction_gpu_cub.h. Update Test_reduction to remove the old/new comparison sections that depended on sum_gpu_old. The lesson: CUB DeviceReduce is slower than the hand-rolled kernel for small types, and the smem sizing problem for the extraction pass has no clean solution within the accelerator_for abstraction. The right improvement is a higher radix (12 then 4) in sumD_gpu_large, applied directly to the existing hand-rolled kernel. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	a0f00c0eca	sumD_gpu_direct: revert to per-lane write; CUB handles Nsimdosites inputs Benchmarking showed the shared-memory lane-summation approach (`843d6497`) was slower than writing each SIMD lane individually and letting CUB reduce the full nlanes = ositesNsimd array. CUB's device reduce is more efficient over the larger input than the smem overhead + serialised lane-0 summation. The smem approach also required overriding acceleratorThreads() to avoid the block-size sizing problem. Restore the simpler per-lane path. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	d358954a84	sumD_gpu_direct: shared-memory lane reduction with acceleratorThreads(1) Set acceleratorThreads to 1 before the extraction kernel so that dim3(nsimd,1,1) blocks give exactly one site group per block and __shared__ sobjD smem[nsimd] is correctly sized without depending on the runtime acceleratorThreads() value. threadIdx.x (acceleratorSIMTlane) indexes the SIMD lane for coalesced reads; lane 0 sums smem[0..nsimd-1] and writes one sobjD per site. CUB then reduces osites elements instead of osites*nsimd, reducing both store traffic and CUB work by Nsimd. acceleratorSynchronise() (warp-level) suffices since nsimd < warpSize. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	aee00bdfb5	sumD_gpu_direct: one thread per SIMD lane using extractLane Replaces one thread per outer site calling Reduce() (sequential Nsimd-wide loop) with one thread per lane calling extractLane() — O(1) per thread. CUB now reduces over osites*Nsimd elements. Avoids serial lane reduction but leaves the per-lane sobjD store stride as a known remaining concern. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	cf324b0fa1	Lattice_reduction_gpu_cub: define GRID_REDUCTION_TIMING in header Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	b314dc224d	Lattice_reduction_gpu_cub: add GRID_REDUCTION_TIMING instrumentation Guards accelerator_for and CUB DeviceReduce calls in sumD_gpu_direct and sumD_gpu_large with #ifdef GRID_REDUCTION_TIMING to isolate where time is spent in each path. Large path accumulates across all groups and prints totals with words/nfull/rem context. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	1bbd62498e	Lattice_reduction_gpu_cub: replace WordBundle4 with iVector<iScalar<scalarD>,4> WordBundle4 was redundant with Grid's existing tensor infrastructure. iVector<iScalar<scalarD>,4> already provides accelerator_inline operator+, zeroit(), and sycl::is_device_copyable — no new type needed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	f3c3b1c04b	Test_reduction: add timing benchmark for new vs old reduction paths Reports us/call and GB/s for sum_gpu (CUB/sycl::reduction) and sum_gpu_old (hand-rolled shared-memory) for each field type, with 5-call warmup and 100-call timed loop. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	069f98b253	skills: HPC battle-hardening skill files for GPU+MPI correctness Six skill files encoding expertise for making codebases robust on problematic HPC systems, covering: correctness verification (double-run, fingerprinting, flight recorder), hang diagnosis, GPU runtime correctness (premature barrier, infinite poll), MPI correctness on heterogeneous systems (device buffer aliasing, AARCH64 PLT corruption, deterministic reductions), compiler validation, and communication/computation overlap pipeline design. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	dfd0503eae	Test_reduction: use separate float and double grids Float fields require a grid constructed with vComplexF::Nsimd(); using a double grid causes grid->_gsites to undercount the sites in float vobjF, making the constant-field expected value wrong. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	c629b2e87e	Rename scalarNorm2 to squaredSum in Test_reduction.cc The function computes \|sum\|^2 — the squared magnitude of an aggregate sum — not a norm. squaredSum makes clear that squaring is applied to the sum, not to individual site values before summing, distinguishing it from sumOfSquares (the squared L2 norm). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	7c8462abd1	Fix Zero() used on thrust::complex in WordBundle4 initialisation Grid's Zero() sentinel is not assignable to thrust::complex<double>; use scalarD(0) instead. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	95a6a0bde7	Reinstate large/small dispatch in CUB reduction path; radix-4 word-bundle for large types rocPRIM's DeviceReduce requires warpSize(64) threads each holding one element in shared memory, so sizeof(T)64 must fit in sharedMemPerBlock. LatticePropagator::scalar_objectD is 2304 bytes (642304 = 147 KB), exceeding the budget and triggering a compile-time static_assert in limit_block_size. Introduce sumD_gpu_direct (the original direct-CUB path, safe for small types) and a new sumD_gpu_large that groups the vobj's vector_type words in bundles of 4, reducing each bundle as WordBundle4<scalarD> (64 bytes, 64*64 = 4 KB — always within budget). If words % 4 != 0, the final partial bundle is zero-padded. sumD_gpu dispatches at compile time via if constexpr on sizeof(sobjD) > 512. For LatticePropagator (144 words) this gives 36 CUB launches instead of 144. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	bba328fac5	Add Test_reduction to tests/debug Tests the new CUB/hipCUB/SYCL lattice reduction (sum_gpu) against the preserved hand-rolled implementation (sum_gpu_old) for LatticeComplexF/D, LatticeColourMatrixF/D and LatticePropagatorF/D. Part a) gaussian random field: checks that old and new agree to within float/double roundoff tolerance. Part b) constant field (= 1.0, identity-matrix init): verifies innerProduct(sum, sum) = Ncomp * V^2 where Ncomp counts the nonzero diagonal scalar components per site (1 / Nc / Ns*Nc respectively). Make.inc is auto-generated by scripts/filelist on bootstrap and is not tracked; the new .cc file is all that is needed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	41362349f3	Rewrite lattice GPU reduction to use CUB, hipCUB, and SYCL reduction Replace hand-rolled shared-memory reduction kernels (reduceBlock/reduceBlocks/ reduceKernel) and the global device variable retirementCount with a unified CUB/hipCUB DeviceReduce::Reduce path for CUDA/HIP and sycl::reduction for SYCL. No small/large split is needed: both CUB and sycl::reduction handle arbitrary object sizes internally. Old implementations preserved as sum_gpu_old / sumD_gpu_old etc. in the original files for regression testing on GPU hardware. Also add CLAUDE.md with build, test, and architecture guidance. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-21 12:34:30 -04:00
Peter Boyle	a5a04929fb	Merge pull request #492 from giltirn/develop Fixes to support CUDA > 13	2026-05-19 15:26:58 -04:00
Christopher Kelly	77b8657fcc	Fixes to support CUDA > 13. Specifically, the CUDA header is no longer accidentally included within Grid's namespace, and the breaking change to cub::Sum() -> ::cuda::std::plus<>{} in CUDA-13 has been worked around	2026-05-19 12:22:14 -04:00
Peter Boyle	f8b2eacf99	File list issue (Ed Bennets pull request?)	2026-05-15 12:57:42 -04:00
Peter Boyle	6140ac6864	Hip Happy	2026-05-15 12:13:01 -04:00