/*************************************************************************************

    Test_fft_memory.cc

    Memory growth test for PlannedFFT on a spin-colour matrix (propagator) field.

    The test creates a single PlannedFFT object (which allocates FFTW plans once),
    then repeatedly applies FFT_all_dim to the same propagator
    (Niter x Niter2 calls in total; see main()).

    If PlannedFFT is working correctly the RSS should remain flat after the
    first iteration — no new plans, no new deviceVector allocations beyond
    the per-call pencil buffer which is freed at the end of each
    FFT_dim_execute call.

    Build exactly like any other Grid test, e.g.:
        make Test_fft_memory
    or compile manually:
        $(CXX) $(CXXFLAGS) Test_fft_memory.cc -o Test_fft_memory $(LDFLAGS)

*************************************************************************************/
// NOTE(review): the include target was missing from this line; <Grid/Grid.h> is
// the umbrella header Grid tests normally use — confirm it also provides PlannedFFT.
#include <Grid/Grid.h>

#include <cstdio>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace Grid;

// --------------------------------------------------------------------------
// Helper: read RSS (resident set size) in kB from /proc/self/status.
// Returns -1 on platforms where /proc is unavailable.
// -------------------------------------------------------------------------- static long getCPURSSKb() { long rss = 0; FILE *fp = fopen("/proc/self/status", "r"); if (!fp) return -1; char line[256]; while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "VmRSS:", 6) == 0) { sscanf(line + 6, "%ld", &rss); break; } } fclose(fp); return rss; } static long getGPUUsedMb() { #if defined(GRID_CUDA) size_t free_bytes = 0; size_t total_bytes = 0; cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes); if (err != cudaSuccess) return -1; return (long)((total_bytes - free_bytes) / (1024 * 1024)); #elif defined(GRID_HIP) size_t free_bytes = 0; size_t total_bytes = 0; hipError_t err = hipMemGetInfo(&free_bytes, &total_bytes); if (err != hipSuccess) return -1; return (long)((total_bytes - free_bytes) / (1024 * 1024)); #else return -1; // CPU-only build: no GPU to query #endif } // ============================================================ // Convenience struct — one snapshot of both sides // ============================================================ struct MemSnapshot { long cpu_rss_kb; // host RSS in kB (-1 if unavailable) long gpu_used_mb; // device used in MB (-1 if no GPU) }; static MemSnapshot takeSnapshot() { MemSnapshot s; s.cpu_rss_kb = getCPURSSKb(); s.gpu_used_mb = getGPUUsedMb(); return s; } // ============================================================ // Pretty-print one row of the monitoring table // ============================================================ static void printRow(int iter, const MemSnapshot &now, const MemSnapshot &prev) { long cpu_delta = (now.cpu_rss_kb >= 0 && prev.cpu_rss_kb >= 0) ? now.cpu_rss_kb - prev.cpu_rss_kb : 0; long gpu_delta = (now.gpu_used_mb >= 0 && prev.gpu_used_mb >= 0) ? now.gpu_used_mb - prev.gpu_used_mb : 0; // Sign prefix so deltas are unambiguous auto sign = [](long v) -> const char* { return v >= 0 ? 
"+" : ""; }; std::cout << GridLogMessage << std::setw(6) << iter << " CPU: " << std::setw(10) << now.cpu_rss_kb << " kB" << " (" << sign(cpu_delta) << std::setw(7) << cpu_delta << " kB)" << " GPU: " << std::setw(7) << now.gpu_used_mb << " MB" << " (" << sign(gpu_delta) << std::setw(5) << gpu_delta << " MB)" << "\n"; } // ============================================================ int main(int argc, char **argv) { Grid_init(&argc, &argv); int threads = GridThread::GetThreads(); std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; // ------------------------------------------------------------------ // Grid setup — use whatever lattice/mpi/simd was passed on the CLI, // e.g. --grid 8.8.8.8 --mpi 1.1.1.1 // ------------------------------------------------------------------ Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); GridCartesian GRID(latt_size, simd_layout, mpi_layout); int vol = 1; for (int d = 0; d < (int)latt_size.size(); d++) vol *= latt_size[d]; std::cout << GridLogMessage << "Lattice : "; for (int d = 0; d < Nd; d++) std::cout << latt_size[d] << " "; std::cout << std::endl; // ------------------------------------------------------------------ // Propagator field: SpinColourMatrix = 12x12 complex, i.e. // LatticePropagatorD (= Lattice>). // This is the standard QCD quark propagator type. // ------------------------------------------------------------------ LatticePropagatorD prop(&GRID); // ------------------------------------------------------------------ // Fill the propagator with a momentum-space plane wave, // following the pattern from Test_fft.cc. // We set each spin-colour component (a,b) to exp(i * sum_mu p_mu x_mu) // with a fixed momentum p = (1,2,1,2). 
// ------------------------------------------------------------------ Coordinate pvec({1, 2, 1, 2}); LatticeComplexD phase(&GRID); LatticeComplexD coor(&GRID); ComplexD ci(0.0, 1.0); phase = Zero(); for (int mu = 0; mu < Nd; mu++) { RealD TwoPiL = M_PI * 2.0 / latt_size[mu]; LatticeCoordinate(coor, mu); phase = phase + (TwoPiL * pvec[mu]) * coor; } phase = exp(phase * ci); // e^{i p.x} // Broadcast the phase into every spin-colour matrix entry prop = Zero(); prop = prop + phase; std::cout << GridLogMessage << "Propagator norm2 = " << norm2(prop) << std::endl; // ------------------------------------------------------------------ // Baseline snapshot BEFORE PlannedFFT construction // ------------------------------------------------------------------ MemSnapshot snap_before_plan = takeSnapshot(); std::cout << GridLogMessage << "[mem] Before PlannedFFT construction" << " CPU: " << snap_before_plan.cpu_rss_kb << " kB" << " GPU: " << snap_before_plan.gpu_used_mb << " MB" << std::endl; // ------------------------------------------------------------------ // Create the PlannedFFT — plans are allocated here ONCE for all // dimensions and stored inside the object. // ------------------------------------------------------------------ PlannedFFT> plannedFFT(&GRID); // ------------------------------------------------------------------ // Snapshot AFTER plan construction — this is the true baseline // for the loop, because cufftPlanMany itself grabs device memory. 
// ------------------------------------------------------------------ MemSnapshot snap_after_plan = takeSnapshot(); std::cout << GridLogMessage << "[mem] After PlannedFFT construction" << " CPU: " << snap_after_plan.cpu_rss_kb << " kB" << " GPU: " << snap_after_plan.gpu_used_mb << " MB" << " (plan overhead:" << " CPU +" << snap_after_plan.cpu_rss_kb - snap_before_plan.cpu_rss_kb << " kB" << " GPU +" << snap_after_plan.gpu_used_mb - snap_before_plan.gpu_used_mb << " MB)" << std::endl; MemoryManager::Print(); // ------------------------------------------------------------------ // 400-iteration loop. // Each iteration computes the full 4d forward FFT of `prop`. // We deliberately do NOT cache the result — we always start from // the same `prop` so the FFT is recomputed identically each time. // The point is to watch memory, not correctness. // ------------------------------------------------------------------ const int Niter = 40; const int Niter2 = 32; // Print header for the memory table std::cout << GridLogMessage << "\n" << std::setw(6) << "iter" << " CPU: " << std::setw(10) << "RSS[kB]" << " ( delta )" << " GPU: " << std::setw(7) << "used[MB]" << " (delta)" << "\n"; MemSnapshot snap_prev = snap_after_plan; for (int i = 0; i < Niter; i++) { std::vector G; for (int j = 0; j < Niter2; j++) { LatticePropagatorD prop_fft(&GRID); // Full 4d forward FFT using the pre-built plans plannedFFT.FFT_all_dim(prop_fft, prop, FFT::forward); G.push_back(prop_fft); } // cudaMemGetInfo reflects the state *after* any pooled frees have // been committed, so this is accurate without an explicit sync — // FFT_dim_execute already calls accelerator_barrier() internally. 
MemSnapshot snap_now = takeSnapshot(); printRow(i, snap_now, snap_prev); MemoryManager::Print(); snap_prev = snap_now; } // ------------------------------------------------------------------ // Summary // ------------------------------------------------------------------ MemSnapshot snap_final = takeSnapshot(); long cpu_growth = snap_final.cpu_rss_kb - snap_after_plan.cpu_rss_kb; long gpu_growth = snap_final.gpu_used_mb - snap_after_plan.gpu_used_mb; std::cout << GridLogMessage << "\n==== Memory summary (baseline = after plan construction) ====\n" << " CPU RSS growth over " << Niter << " FFTs : " << cpu_growth << " kB" << (cpu_growth == 0 ? " OK" : " *** GROWING ***") << "\n" << " GPU used growth over " << Niter << " FFTs : " << gpu_growth << " MB" << (gpu_growth == 0 ? " OK" : " *** GROWING ***") << "\n" << " Note: first-call watermark from pool fill is expected and benign.\n" << " A leak shows as continuous growth beyond iter ~2-3.\n"; Grid_finalize(); return 0; }