diff --git a/tests/Test_fft_memory.cc b/tests/Test_fft_memory.cc new file mode 100644 index 00000000..40cd6085 --- /dev/null +++ b/tests/Test_fft_memory.cc @@ -0,0 +1,261 @@ +/************************************************************************************* + + Test_fft_memory.cc + + Memory growth test for PlannedFFT on a spin-colour matrix (propagator) field. + + The test creates a single PlannedFFT object (which allocates FFTW plans once), + then repeatedly applies FFT_all_dim to the same propagator: Niter outer + iterations of Niter2 FFTs each (currently 40 x 32 = 1280 calls). + + If PlannedFFT is working correctly the RSS should remain flat after the first + iteration — no new plans, no new deviceVector allocations beyond the per-call + pencil buffer which is freed at the end of each FFT_dim_execute call. + + Build exactly like any other Grid test, e.g.: + make Test_fft_memory + or compile manually: + $(CXX) $(CXXFLAGS) Test_fft_memory.cc -o Test_fft_memory $(LDFLAGS) + +*************************************************************************************/ + +#include +using namespace Grid; + +// -------------------------------------------------------------------------- +// Helper: read RSS (resident set size) in kB from /proc/self/status. +// Returns -1 when /proc/self/status cannot be opened (e.g. platforms without /proc). 
+// -------------------------------------------------------------------------- +static long getCPURSSKb() +{ + long rss = 0; + FILE *fp = fopen("/proc/self/status", "r"); + if (!fp) return -1; + char line[256]; + while (fgets(line, sizeof(line), fp)) { + if (strncmp(line, "VmRSS:", 6) == 0) { + sscanf(line + 6, "%ld", &rss); + break; + } + } + fclose(fp); + return rss; +} + +static long getGPUUsedMb() +{ +#if defined(GRID_CUDA) + size_t free_bytes = 0; + size_t total_bytes = 0; + cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes); + if (err != cudaSuccess) return -1; + return (long)((total_bytes - free_bytes) / (1024 * 1024)); + +#elif defined(GRID_HIP) + size_t free_bytes = 0; + size_t total_bytes = 0; + hipError_t err = hipMemGetInfo(&free_bytes, &total_bytes); + if (err != hipSuccess) return -1; + return (long)((total_bytes - free_bytes) / (1024 * 1024)); + +#else + return -1; // CPU-only build: no GPU to query +#endif +} + +// ============================================================ +// Convenience struct — one snapshot of both sides +// ============================================================ +struct MemSnapshot { + long cpu_rss_kb; // host RSS in kB (-1 if unavailable) + long gpu_used_mb; // device used in MB (-1 if no GPU) +}; + +static MemSnapshot takeSnapshot() +{ + MemSnapshot s; + s.cpu_rss_kb = getCPURSSKb(); + s.gpu_used_mb = getGPUUsedMb(); + return s; +} + +// ============================================================ +// Pretty-print one row of the monitoring table +// ============================================================ +static void printRow(int iter, + const MemSnapshot &now, + const MemSnapshot &prev) +{ + long cpu_delta = (now.cpu_rss_kb >= 0 && prev.cpu_rss_kb >= 0) + ? now.cpu_rss_kb - prev.cpu_rss_kb : 0; + long gpu_delta = (now.gpu_used_mb >= 0 && prev.gpu_used_mb >= 0) + ? now.gpu_used_mb - prev.gpu_used_mb : 0; + + // Sign prefix so deltas are unambiguous + auto sign = [](long v) -> const char* { return v >= 0 ? 
"+" : ""; }; + + std::cout << GridLogMessage + << std::setw(6) << iter + << " CPU: " << std::setw(10) << now.cpu_rss_kb << " kB" + << " (" << sign(cpu_delta) << std::setw(7) << cpu_delta << " kB)" + << " GPU: " << std::setw(7) << now.gpu_used_mb << " MB" + << " (" << sign(gpu_delta) << std::setw(5) << gpu_delta << " MB)" + << "\n"; +} + +// ============================================================ + +int main(int argc, char **argv) +{ + Grid_init(&argc, &argv); + + int threads = GridThread::GetThreads(); + std::cout << GridLogMessage + << "Grid is setup to use " << threads << " threads" << std::endl; + + // ------------------------------------------------------------------ + // Grid setup — use whatever lattice/mpi/simd was passed on the CLI, + // e.g. --grid 8.8.8.8 --mpi 1.1.1.1 + // ------------------------------------------------------------------ + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + GridCartesian GRID(latt_size, simd_layout, mpi_layout); + + int vol = 1; + for (int d = 0; d < (int)latt_size.size(); d++) vol *= latt_size[d]; + + std::cout << GridLogMessage << "Lattice : "; + for (int d = 0; d < Nd; d++) std::cout << latt_size[d] << " "; + std::cout << std::endl; + + // ------------------------------------------------------------------ + // Propagator field: SpinColourMatrix = 12x12 complex, i.e. + // LatticePropagatorD (= Lattice>). + // This is the standard QCD quark propagator type. + // ------------------------------------------------------------------ + LatticePropagatorD prop(&GRID); + + // ------------------------------------------------------------------ + // Fill the propagator with a momentum-space plane wave, + // following the pattern from Test_fft.cc. + // We set each spin-colour component (a,b) to exp(i * sum_mu p_mu x_mu) + // with a fixed momentum p = (1,2,1,2). 
+ // ------------------------------------------------------------------ + Coordinate pvec({1, 2, 1, 2}); + + LatticeComplexD phase(&GRID); + LatticeComplexD coor(&GRID); + ComplexD ci(0.0, 1.0); + + phase = Zero(); + for (int mu = 0; mu < Nd; mu++) { + RealD TwoPiL = M_PI * 2.0 / latt_size[mu]; + LatticeCoordinate(coor, mu); + phase = phase + (TwoPiL * pvec[mu]) * coor; + } + phase = exp(phase * ci); // e^{i p.x} + + // Broadcast the phase into every spin-colour matrix entry + prop = Zero(); + prop = prop + phase; + + std::cout << GridLogMessage + << "Propagator norm2 = " << norm2(prop) << std::endl; + + // ------------------------------------------------------------------ + // Baseline snapshot BEFORE PlannedFFT construction + // ------------------------------------------------------------------ + MemSnapshot snap_before_plan = takeSnapshot(); + std::cout << GridLogMessage + << "[mem] Before PlannedFFT construction" + << " CPU: " << snap_before_plan.cpu_rss_kb << " kB" + << " GPU: " << snap_before_plan.gpu_used_mb << " MB" + << std::endl; + + // ------------------------------------------------------------------ + // Create the PlannedFFT — plans are allocated here ONCE for all + // dimensions and stored inside the object. + // ------------------------------------------------------------------ + PlannedFFT> plannedFFT(&GRID); + + // ------------------------------------------------------------------ + // Snapshot AFTER plan construction — this is the true baseline + // for the loop, because cufftPlanMany itself grabs device memory. 
+ // ------------------------------------------------------------------ + MemSnapshot snap_after_plan = takeSnapshot(); + std::cout << GridLogMessage + << "[mem] After PlannedFFT construction" + << " CPU: " << snap_after_plan.cpu_rss_kb << " kB" + << " GPU: " << snap_after_plan.gpu_used_mb << " MB" + << " (plan overhead:" + << " CPU +" << snap_after_plan.cpu_rss_kb - snap_before_plan.cpu_rss_kb << " kB" + << " GPU +" << snap_after_plan.gpu_used_mb - snap_before_plan.gpu_used_mb << " MB)" + << std::endl; + + MemoryManager::Print(); + // ------------------------------------------------------------------ + // 400-iteration loop. + // Each iteration computes the full 4d forward FFT of `prop`. + // We deliberately do NOT cache the result — we always start from + // the same `prop` so the FFT is recomputed identically each time. + // The point is to watch memory, not correctness. + // ------------------------------------------------------------------ + const int Niter = 40; + const int Niter2 = 32; + + // Print header for the memory table + std::cout << GridLogMessage + << "\n" + << std::setw(6) << "iter" + << " CPU: " << std::setw(10) << "RSS[kB]" + << " ( delta )" + << " GPU: " << std::setw(7) << "used[MB]" + << " (delta)" + << "\n"; + + MemSnapshot snap_prev = snap_after_plan; + + for (int i = 0; i < Niter; i++) { + std::vector G; + + for (int j = 0; j < Niter2; j++) { + LatticePropagatorD prop_fft(&GRID); + + // Full 4d forward FFT using the pre-built plans + plannedFFT.FFT_all_dim(prop_fft, prop, FFT::forward); + + G.push_back(prop_fft); + } + + // cudaMemGetInfo reflects the state *after* any pooled frees have + // been committed, so this is accurate without an explicit sync — + // FFT_dim_execute already calls accelerator_barrier() internally. 
+ MemSnapshot snap_now = takeSnapshot(); + printRow(i, snap_now, snap_prev); + MemoryManager::Print(); + snap_prev = snap_now; + } + + // ------------------------------------------------------------------ + // Summary + // ------------------------------------------------------------------ + MemSnapshot snap_final = takeSnapshot(); + + long cpu_growth = snap_final.cpu_rss_kb - snap_after_plan.cpu_rss_kb; + long gpu_growth = snap_final.gpu_used_mb - snap_after_plan.gpu_used_mb; + + std::cout << GridLogMessage + << "\n==== Memory summary (baseline = after plan construction) ====\n" + << " CPU RSS growth over " << Niter << " FFTs : " + << cpu_growth << " kB" + << (cpu_growth == 0 ? " OK" : " *** GROWING ***") << "\n" + << " GPU used growth over " << Niter << " FFTs : " + << gpu_growth << " MB" + << (gpu_growth == 0 ? " OK" : " *** GROWING ***") << "\n" + << " Note: first-call watermark from pool fill is expected and benign.\n" + << " A leak shows as continuous growth beyond iter ~2-3.\n"; + + Grid_finalize(); + return 0; +}