mirror of
https://github.com/paboyle/Grid.git
synced 2026-06-18 18:03:44 +01:00
Compare commits
61 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f11ba18df2 | |||
| cf8587e401 | |||
| 7dd35ef749 | |||
| 41e570ddce | |||
| a452131b50 | |||
| 4e49ca55ab | |||
| c3f4474401 | |||
| 3d3eff86f3 | |||
| fc9f154ac1 | |||
| 4aa0bca4dc | |||
| 905da6f083 | |||
| 86c7f29183 | |||
| b0c99f876e | |||
| bf5fcdc860 | |||
| b58a1508fa | |||
| 4d527e81fa | |||
| 7803580aa6 | |||
| 32654db366 | |||
| cd340cfab3 | |||
| f32866b2ff | |||
| 1cd1dc091e | |||
| 0493656e86 | |||
| 66fd504c4d | |||
| be4dd2b52f | |||
| 707d059766 | |||
| f08c755ae6 | |||
| dbbfdd4e4b | |||
| f967fb40bf | |||
| 74e0f846cb | |||
| 303a4d26e5 | |||
| 119888653c | |||
| a9f42c08f9 | |||
| e79adc9d31 | |||
| 5a9056cd93 | |||
| 012c36ab5a | |||
| 5c4574f9aa | |||
| a424775884 | |||
| d6b1388741 | |||
| 796c6cae4e | |||
| 1a8064d6d9 | |||
| 43648924c3 | |||
| bf2140e74d | |||
| a1119266c1 | |||
| a0f00c0eca | |||
| d358954a84 | |||
| aee00bdfb5 | |||
| cf324b0fa1 | |||
| b314dc224d | |||
| 1bbd62498e | |||
| f3c3b1c04b | |||
| 069f98b253 | |||
| dfd0503eae | |||
| c629b2e87e | |||
| 7c8462abd1 | |||
| 95a6a0bde7 | |||
| bba328fac5 | |||
| 41362349f3 | |||
| a5a04929fb | |||
| 77b8657fcc | |||
| f8b2eacf99 | |||
| 6140ac6864 |
@@ -63,12 +63,10 @@ void MemoryManager::PrintBytes(void)
|
||||
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
|
||||
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
|
||||
uint64_t cacheBytes;
|
||||
cacheBytes = CacheBytes[Cpu];
|
||||
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
|
||||
cacheBytes = CacheBytes[Acc];
|
||||
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
|
||||
cacheBytes = CacheBytes[Shared];
|
||||
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
|
||||
cacheBytes = HostCacheBytes();
|
||||
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu alloc cache Mbytes "<<std::endl;
|
||||
cacheBytes = DeviceCacheBytes();
|
||||
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc alloc cache Mbytes "<<std::endl;
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
cuda_mem();
|
||||
|
||||
@@ -289,7 +289,7 @@ public:
|
||||
///////////////////////////////////////////
|
||||
// move constructor
|
||||
///////////////////////////////////////////
|
||||
Lattice(Lattice && r){
|
||||
Lattice(Lattice && r) noexcept {
|
||||
this->_grid = r.Grid();
|
||||
this->_odata = r._odata;
|
||||
this->_odata_size = r._odata_size;
|
||||
@@ -330,7 +330,7 @@ public:
|
||||
///////////////////////////////////////////
|
||||
// Move assignment possible if same type
|
||||
///////////////////////////////////////////
|
||||
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
|
||||
inline Lattice<vobj> & operator = (Lattice<vobj> && r) noexcept {
|
||||
|
||||
resize(0); // deletes if appropriate
|
||||
this->_grid = r.Grid();
|
||||
|
||||
@@ -438,5 +438,11 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
||||
result = sumD_gpu_large(lat,osites);
|
||||
return result;
|
||||
}
|
||||
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
|
||||
{
|
||||
Word w;
|
||||
bzero(&w,sizeof(w));
|
||||
return w;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#if defined(GRID_CUDA)
|
||||
|
||||
#include <cub/cub.cuh>
|
||||
#define gpucub cub
|
||||
#define gpuError_t cudaError_t
|
||||
@@ -57,8 +56,13 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
//copy offsets to device
|
||||
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
|
||||
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
|
||||
#define GRID_CUB_SUM_OP ::cuda::std::plus<>{}
|
||||
#else
|
||||
#define GRID_CUB_SUM_OP ::gpucub::Sum()
|
||||
#endif
|
||||
|
||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
|
||||
if (gpuErr!=gpuSuccess) {
|
||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
@@ -82,12 +86,14 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
});
|
||||
|
||||
//issue segmented reductions in computeStream
|
||||
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
|
||||
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1, GRID_CUB_SUM_OP, zero_init, computeStream);
|
||||
if (gpuErr!=gpuSuccess) {
|
||||
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#undef GRID_CUB_SUM_OP
|
||||
|
||||
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
|
||||
//sync after copy
|
||||
|
||||
@@ -51,8 +51,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
#ifdef GRID_CUDA
|
||||
//accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
||||
//accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
||||
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
||||
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
@@ -93,8 +93,7 @@ inline uint64_t cyclecount(void){
|
||||
}
|
||||
#elif defined __x86_64__
|
||||
inline uint64_t cyclecount(void){
|
||||
uint64_t ret = __rdtsc();
|
||||
return (uint64_t)ret;
|
||||
return (uint64_t)0;
|
||||
}
|
||||
#else
|
||||
|
||||
|
||||
@@ -113,6 +113,14 @@ accelerator_inline RealD adj(const RealD & r){ return r; }
|
||||
accelerator_inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); }
|
||||
accelerator_inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); }
|
||||
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
//Provide for convenience
|
||||
inline std::complex<double> conjugate(const std::complex<double>& r){ return(conj(r)); }
|
||||
inline std::complex<float> conjugate(const std::complex<float>& r) { return(conj(r)); }
|
||||
inline std::complex<double> adj(const std::complex<double>& r) { return(conj(r)); }
|
||||
inline std::complex<float> adj(const std::complex<float>& r) { return(conj(r)); }
|
||||
#endif
|
||||
|
||||
accelerator_inline RealF real(const RealF & r){ return r; }
|
||||
accelerator_inline RealD real(const RealD & r){ return r; }
|
||||
accelerator_inline RealF real(const ComplexF & r){ return r.real(); }
|
||||
|
||||
@@ -96,7 +96,9 @@ void acceleratorInit(void);
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
#include <cuda.h>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define GRID_SIMT
|
||||
|
||||
@@ -8,7 +8,6 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--with-lime=$LIME \
|
||||
--enable-accelerator-cshift \
|
||||
--disable-unified \
|
||||
CXX=nvcc \
|
||||
LDFLAGS="-cudart shared " \
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
DIR=`pwd`
|
||||
PREFIX=$HOME/DDHMC/Grid/systems/Prerequisites/install/
|
||||
|
||||
../../configure \
|
||||
--enable-comms=mpi \
|
||||
--enable-simd=GPU \
|
||||
--enable-shm=nvlink \
|
||||
--enable-gen-simd-width=64 \
|
||||
--with-gmp=$PREFIX \
|
||||
--with-mpfr=$PREFIX \
|
||||
--with-gmp=$GMP \
|
||||
--with-mpfr=$MPFR \
|
||||
--enable-accelerator=cuda \
|
||||
--disable-fermion-reps \
|
||||
--disable-unified \
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
|
||||
export CRAY_ACCEL_TARGET=nvidia80
|
||||
source /global/homes/p/pboyle/spack/share/spack/setup-env.sh
|
||||
export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
|
||||
export GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
|
||||
|
||||
module load PrgEnv-gnu cpe-cuda cudatoolkit/11.4
|
||||
module load PrgEnv-gnu cpe-cuda cudatoolkit/12.0
|
||||
|
||||
@@ -3,7 +3,10 @@
|
||||
CXX=mpicxx ../../configure \
|
||||
--enable-simd=GEN \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-Sp=yes \
|
||||
--enable-Sp=no \
|
||||
--disable-fermion-reps \
|
||||
--disable-gparity \
|
||||
--with-fftw=$FFTW \
|
||||
--enable-unified=yes \
|
||||
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
|
||||
--with-lime=$CLIME \
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
source /Users/peterboyle/QCD//Spack/spack//share/spack/setup-env.sh
|
||||
|
||||
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
|
||||
#export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
|
||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' `
|
||||
export OPENSSL=`spack find --paths openssl | grep openssl | awk '{print $2}' `
|
||||
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
|
||||
|
||||
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
|
||||
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
|
||||
@@ -0,0 +1,261 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Test_fft_memory.cc
|
||||
|
||||
Memory growth test for PlannedFFT on a spin-colour matrix (propagator) field.
|
||||
|
||||
The test creates a single PlannedFFT object (which allocates FFTW plans once),
|
||||
then repeatedly applies FFT_all_dim to the same propagator 400 times.
|
||||
|
||||
If PlannedFFT is working correctly the RSS should remain flat after the first
|
||||
iteration — no new plans, no new deviceVector allocations beyond the per-call
|
||||
pencil buffer which is freed at the end of each FFT_dim_execute call.
|
||||
|
||||
Build exactly like any other Grid test, e.g.:
|
||||
make Test_fft_memory
|
||||
or compile manually:
|
||||
$(CXX) $(CXXFLAGS) Test_fft_memory.cc -o Test_fft_memory $(LDFLAGS)
|
||||
|
||||
*************************************************************************************/
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
using namespace Grid;
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Helper: read RSS (resident set size) in kB from /proc/self/status.
|
||||
// Returns 0 on platforms where /proc is unavailable.
|
||||
// --------------------------------------------------------------------------
|
||||
static long getCPURSSKb()
|
||||
{
|
||||
long rss = 0;
|
||||
FILE *fp = fopen("/proc/self/status", "r");
|
||||
if (!fp) return -1;
|
||||
char line[256];
|
||||
while (fgets(line, sizeof(line), fp)) {
|
||||
if (strncmp(line, "VmRSS:", 6) == 0) {
|
||||
sscanf(line + 6, "%ld", &rss);
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
return rss;
|
||||
}
|
||||
|
||||
static long getGPUUsedMb()
|
||||
{
|
||||
#if defined(GRID_CUDA)
|
||||
size_t free_bytes = 0;
|
||||
size_t total_bytes = 0;
|
||||
cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes);
|
||||
if (err != cudaSuccess) return -1;
|
||||
return (long)((total_bytes - free_bytes) / (1024 * 1024));
|
||||
|
||||
#elif defined(GRID_HIP)
|
||||
size_t free_bytes = 0;
|
||||
size_t total_bytes = 0;
|
||||
hipError_t err = hipMemGetInfo(&free_bytes, &total_bytes);
|
||||
if (err != hipSuccess) return -1;
|
||||
return (long)((total_bytes - free_bytes) / (1024 * 1024));
|
||||
|
||||
#else
|
||||
return -1; // CPU-only build: no GPU to query
|
||||
#endif
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Convenience struct — one snapshot of both sides
|
||||
// ============================================================
|
||||
struct MemSnapshot {
|
||||
long cpu_rss_kb; // host RSS in kB (-1 if unavailable)
|
||||
long gpu_used_mb; // device used in MB (-1 if no GPU)
|
||||
};
|
||||
|
||||
static MemSnapshot takeSnapshot()
|
||||
{
|
||||
MemSnapshot s;
|
||||
s.cpu_rss_kb = getCPURSSKb();
|
||||
s.gpu_used_mb = getGPUUsedMb();
|
||||
return s;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Pretty-print one row of the monitoring table
|
||||
// ============================================================
|
||||
static void printRow(int iter,
|
||||
const MemSnapshot &now,
|
||||
const MemSnapshot &prev)
|
||||
{
|
||||
long cpu_delta = (now.cpu_rss_kb >= 0 && prev.cpu_rss_kb >= 0)
|
||||
? now.cpu_rss_kb - prev.cpu_rss_kb : 0;
|
||||
long gpu_delta = (now.gpu_used_mb >= 0 && prev.gpu_used_mb >= 0)
|
||||
? now.gpu_used_mb - prev.gpu_used_mb : 0;
|
||||
|
||||
// Sign prefix so deltas are unambiguous
|
||||
auto sign = [](long v) -> const char* { return v >= 0 ? "+" : ""; };
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< std::setw(6) << iter
|
||||
<< " CPU: " << std::setw(10) << now.cpu_rss_kb << " kB"
|
||||
<< " (" << sign(cpu_delta) << std::setw(7) << cpu_delta << " kB)"
|
||||
<< " GPU: " << std::setw(7) << now.gpu_used_mb << " MB"
|
||||
<< " (" << sign(gpu_delta) << std::setw(5) << gpu_delta << " MB)"
|
||||
<< "\n";
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
Grid_init(&argc, &argv);
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
std::cout << GridLogMessage
|
||||
<< "Grid is setup to use " << threads << " threads" << std::endl;
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Grid setup — use whatever lattice/mpi/simd was passed on the CLI,
|
||||
// e.g. --grid 8.8.8.8 --mpi 1.1.1.1
|
||||
// ------------------------------------------------------------------
|
||||
Coordinate latt_size = GridDefaultLatt();
|
||||
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
|
||||
GridCartesian GRID(latt_size, simd_layout, mpi_layout);
|
||||
|
||||
int vol = 1;
|
||||
for (int d = 0; d < (int)latt_size.size(); d++) vol *= latt_size[d];
|
||||
|
||||
std::cout << GridLogMessage << "Lattice : ";
|
||||
for (int d = 0; d < Nd; d++) std::cout << latt_size[d] << " ";
|
||||
std::cout << std::endl;
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Propagator field: SpinColourMatrix = 12x12 complex, i.e.
|
||||
// LatticePropagatorD (= Lattice<iSpinColourMatrix<vComplexD>>).
|
||||
// This is the standard QCD quark propagator type.
|
||||
// ------------------------------------------------------------------
|
||||
LatticePropagatorD prop(&GRID);
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Fill the propagator with a momentum-space plane wave,
|
||||
// following the pattern from Test_fft.cc.
|
||||
// We set each spin-colour component (a,b) to exp(i * sum_mu p_mu x_mu)
|
||||
// with a fixed momentum p = (1,2,1,2).
|
||||
// ------------------------------------------------------------------
|
||||
Coordinate pvec({1, 2, 1, 2});
|
||||
|
||||
LatticeComplexD phase(&GRID);
|
||||
LatticeComplexD coor(&GRID);
|
||||
ComplexD ci(0.0, 1.0);
|
||||
|
||||
phase = Zero();
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
RealD TwoPiL = M_PI * 2.0 / latt_size[mu];
|
||||
LatticeCoordinate(coor, mu);
|
||||
phase = phase + (TwoPiL * pvec[mu]) * coor;
|
||||
}
|
||||
phase = exp(phase * ci); // e^{i p.x}
|
||||
|
||||
// Broadcast the phase into every spin-colour matrix entry
|
||||
prop = Zero();
|
||||
prop = prop + phase;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "Propagator norm2 = " << norm2(prop) << std::endl;
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Baseline snapshot BEFORE PlannedFFT construction
|
||||
// ------------------------------------------------------------------
|
||||
MemSnapshot snap_before_plan = takeSnapshot();
|
||||
std::cout << GridLogMessage
|
||||
<< "[mem] Before PlannedFFT construction"
|
||||
<< " CPU: " << snap_before_plan.cpu_rss_kb << " kB"
|
||||
<< " GPU: " << snap_before_plan.gpu_used_mb << " MB"
|
||||
<< std::endl;
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Create the PlannedFFT — plans are allocated here ONCE for all
|
||||
// dimensions and stored inside the object.
|
||||
// ------------------------------------------------------------------
|
||||
PlannedFFT<iSpinColourMatrix<vComplexD>> plannedFFT(&GRID);
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Snapshot AFTER plan construction — this is the true baseline
|
||||
// for the loop, because cufftPlanMany itself grabs device memory.
|
||||
// ------------------------------------------------------------------
|
||||
MemSnapshot snap_after_plan = takeSnapshot();
|
||||
std::cout << GridLogMessage
|
||||
<< "[mem] After PlannedFFT construction"
|
||||
<< " CPU: " << snap_after_plan.cpu_rss_kb << " kB"
|
||||
<< " GPU: " << snap_after_plan.gpu_used_mb << " MB"
|
||||
<< " (plan overhead:"
|
||||
<< " CPU +" << snap_after_plan.cpu_rss_kb - snap_before_plan.cpu_rss_kb << " kB"
|
||||
<< " GPU +" << snap_after_plan.gpu_used_mb - snap_before_plan.gpu_used_mb << " MB)"
|
||||
<< std::endl;
|
||||
|
||||
MemoryManager::Print();
|
||||
// ------------------------------------------------------------------
|
||||
// 400-iteration loop.
|
||||
// Each iteration computes the full 4d forward FFT of `prop`.
|
||||
// We deliberately do NOT cache the result — we always start from
|
||||
// the same `prop` so the FFT is recomputed identically each time.
|
||||
// The point is to watch memory, not correctness.
|
||||
// ------------------------------------------------------------------
|
||||
const int Niter = 40;
|
||||
const int Niter2 = 32;
|
||||
|
||||
// Print header for the memory table
|
||||
std::cout << GridLogMessage
|
||||
<< "\n"
|
||||
<< std::setw(6) << "iter"
|
||||
<< " CPU: " << std::setw(10) << "RSS[kB]"
|
||||
<< " ( delta )"
|
||||
<< " GPU: " << std::setw(7) << "used[MB]"
|
||||
<< " (delta)"
|
||||
<< "\n";
|
||||
|
||||
MemSnapshot snap_prev = snap_after_plan;
|
||||
|
||||
for (int i = 0; i < Niter; i++) {
|
||||
std::vector<LatticePropagatorD> G;
|
||||
|
||||
for (int j = 0; j < Niter2; j++) {
|
||||
LatticePropagatorD prop_fft(&GRID);
|
||||
|
||||
// Full 4d forward FFT using the pre-built plans
|
||||
plannedFFT.FFT_all_dim(prop_fft, prop, FFT::forward);
|
||||
|
||||
G.push_back(prop_fft);
|
||||
}
|
||||
|
||||
// cudaMemGetInfo reflects the state *after* any pooled frees have
|
||||
// been committed, so this is accurate without an explicit sync —
|
||||
// FFT_dim_execute already calls accelerator_barrier() internally.
|
||||
MemSnapshot snap_now = takeSnapshot();
|
||||
printRow(i, snap_now, snap_prev);
|
||||
MemoryManager::Print();
|
||||
snap_prev = snap_now;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// Summary
|
||||
// ------------------------------------------------------------------
|
||||
MemSnapshot snap_final = takeSnapshot();
|
||||
|
||||
long cpu_growth = snap_final.cpu_rss_kb - snap_after_plan.cpu_rss_kb;
|
||||
long gpu_growth = snap_final.gpu_used_mb - snap_after_plan.gpu_used_mb;
|
||||
|
||||
std::cout << GridLogMessage
|
||||
<< "\n==== Memory summary (baseline = after plan construction) ====\n"
|
||||
<< " CPU RSS growth over " << Niter << " FFTs : "
|
||||
<< cpu_growth << " kB"
|
||||
<< (cpu_growth == 0 ? " OK" : " *** GROWING ***") << "\n"
|
||||
<< " GPU used growth over " << Niter << " FFTs : "
|
||||
<< gpu_growth << " MB"
|
||||
<< (gpu_growth == 0 ? " OK" : " *** GROWING ***") << "\n"
|
||||
<< " Note: first-call watermark from pool fill is expected and benign.\n"
|
||||
<< " A leak shows as continuous growth beyond iter ~2-3.\n";
|
||||
|
||||
Grid_finalize();
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user