diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command index 7561fb15..7f71b4d9 100644 --- a/systems/Frontier/config-command +++ b/systems/Frontier/config-command @@ -13,8 +13,8 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ --disable-fermion-reps \ CXX=hipcc MPICXX=mpicxx \ -CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \ - LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft" +CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include " \ + LDFLAGS="-L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -lmpi_gtl_hsa -lhipblas -lrocblas -lhipfft -lamdhip64" diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh index 1c090eb9..0788fdf6 100644 --- a/systems/Frontier/sourceme.sh +++ b/systems/Frontier/sourceme.sh @@ -1,16 +1,10 @@ echo spack -. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh +. /autofs/nccs-svm1_home1/paboyle/spack/share/spack/setup-env.sh -module load amd/7.0.2 -module load cray-fftw -module load craype-accel-amd-gfx90a -mkdir $HOME/LD_PATH -ln -s /opt/rocm-6.4.2/lib/libamdhip* $HOME/LD_PATH -#Ugly hacks to get down level software working on current system -export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH -#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/LD_PATH/ -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-7.0.2/lib +module load cce/21.0.0 +module load cpe/26.03 +module load rocm/7.0.2 +export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/opt/rocm-7.0.2/lib/llvm/lib/:$LD_LIBRARY_PATH diff --git a/tests/debug/Test_hipfft_bug_fail.cc b/tests/debug/Test_hipfft_bug_fail.cc new file mode 100644 index 00000000..990250d1 --- /dev/null +++ b/tests/debug/Test_hipfft_bug_fail.cc @@ -0,0 +1,76 @@ +/* + * Isolating the hipfft HIPFFT_PARSE_ERROR on ROCm 7 / hipFFT 1.0.20. + * + * Tests three orderings with an empty rocFFT cache to find which GPU + * operation before plan creation triggers the failure: + * A) hipMalloc only — hypothesis: passes (no async GPU work) + * B) hipMalloc + hipMemset — hypothesis: fails (async GPU work in flight) + * C) hipMalloc + hipMemset — hypothesis: passes (work completed before plan) + * + hipDeviceSynchronize + * + * Compile: + * hipcc -o Test_hipfft_bug_fail Test_hipfft_bug_fail.cc -lhipfft + * + * Run with empty cache: + * rm -rf ~/.cache/ + * ./Test_hipfft_bug_fail + */ + +#include +#include +#include + +static const char *res(hipfftResult rv) { + return rv == HIPFFT_SUCCESS ? "SUCCESS" : "PARSE_ERROR"; +} + +static hipfftResult makePlan(int G, int howmany) { + int n[] = {G}; + hipfftHandle p; + size_t workSize = 0; + hipfftCreate(&p); + hipfftResult rv = hipfftMakePlanMany(p, 1, n, + nullptr, 1, G, nullptr, 1, G, + HIPFFT_Z2Z, howmany, &workSize); + hipfftDestroy(p); + return rv; +} + +int main(void) { + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, 0); + printf("Device: %s\n", prop.name); +#ifdef hipfftVersionMinor + printf("hipFFT version: %d.%d.%d\n\n", + hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch); +#endif + + for (int G : {4, 8, 16, 32}) { + int howmany = 512; + long nelems = (long)G * howmany; + hipfftDoubleComplex *buf = nullptr; + hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex)); + + // Tests ordered so each runs before a prior success can populate the cache. + + // B first: hipMalloc + hipMemset (async GPU work in flight) + // If this fails, A (no hipMemset) will pass, confirming hipMemset is the trigger. + hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex)); + hipfftResult rvB = makePlan(G, howmany); + printf("G=%-4d B) hipMalloc + hipMemset : %s\n", G, res(rvB)); + + // C: hipMalloc + hipMemset + sync — does syncing before plan creation fix it? + hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex)); + hipDeviceSynchronize(); + hipfftResult rvC = makePlan(G, howmany); + printf("G=%-4d C) hipMalloc + hipMemset + sync: %s\n", G, res(rvC)); + + // A last: hipMalloc only, no async GPU work — should always pass + hipfftResult rvA = makePlan(G, howmany); + printf("G=%-4d A) hipMalloc only : %s\n\n", G, res(rvA)); + + hipFree(buf); + } + + return 0; +} diff --git a/tests/debug/Test_hipfft_bug_pass.cc b/tests/debug/Test_hipfft_bug_pass.cc new file mode 100644 index 00000000..3cd7c210 --- /dev/null +++ b/tests/debug/Test_hipfft_bug_pass.cc @@ -0,0 +1,61 @@ +/* + * Minimal program demonstrating the workaround for the hipfft ROCm 7 bug. + * + * Workaround: create the hipfft plan BEFORE any hipMalloc. Plan creation + * for G < 32 then succeeds even with an empty rocFFT cache. + * + * Compile: + * hipcc -o Test_hipfft_bug_pass Test_hipfft_bug_pass.cc -lhipfft + * + * Run: + * rm -rf ~/.cache/rocfft + * ./Test_hipfft_bug_pass + * + * Expected: all G values succeed. + * Compare with Test_hipfft_bug_fail.cc which uses the opposite ordering. + */ + +#include +#include +#include + +int main(void) { + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, 0); + printf("Device: %s\n", prop.name); +#ifdef hipfftVersionMinor + printf("hipFFT version: %d.%d.%d\n\n", + hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch); +#endif + + for (int G : {8, 16, 32}) { + int howmany = 512; + int n[] = {G}; + long nelems = (long)G * howmany; + + // Plan created BEFORE hipMalloc — succeeds for all G + hipfftHandle p; + size_t workSize = 0; + hipfftCreate(&p); + hipfftResult rv = hipfftMakePlanMany(p, 1, n, + nullptr, 1, G, nullptr, 1, G, + HIPFFT_Z2Z, howmany, &workSize); + printf("G=%-4d plan-then-hipMalloc: %d (%s)\n", + G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "HIPFFT_PARSE_ERROR"); + + if (rv == HIPFFT_SUCCESS) { + hipfftDoubleComplex *buf = nullptr; + hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex)); + hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex)); + rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD); + hipDeviceSynchronize(); + printf("G=%-4d execFwd: %d (%s)\n", + G, (int)rv, rv == HIPFFT_SUCCESS ? "HIPFFT_SUCCESS" : "FAILED"); + hipFree(buf); + } + + hipfftDestroy(p); + } + + return 0; +} diff --git a/tests/debug/Test_hipfft_minimal.cc b/tests/debug/Test_hipfft_minimal.cc new file mode 100644 index 00000000..bbccff3c --- /dev/null +++ b/tests/debug/Test_hipfft_minimal.cc @@ -0,0 +1,142 @@ +/* + * Minimal reproducer for hipfftMakePlanMany / hipfftPlanMany failures. + * + * Compile on Frontier (no Grid headers needed): + * hipcc -o Test_hipfft_minimal Test_hipfft_minimal.cc -lhipfft + * + * Run: + * ./Test_hipfft_minimal + */ + +#include +#include +#include +#include + +static const char *hipfftResultString(hipfftResult r) { + switch (r) { + case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS"; + case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN"; + case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED"; + case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE"; + case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE"; + case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR"; + case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED"; + case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED"; + case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE"; + case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA"; + case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST"; + case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE"; + case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR"; + case HIPFFT_NO_WORKSPACE: return "HIPFFT_NO_WORKSPACE"; + case HIPFFT_NOT_IMPLEMENTED: return "HIPFFT_NOT_IMPLEMENTED"; + case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED"; + default: return "UNKNOWN"; + } +} + +// Plan creation + execution for (G, howmany). +// Tests two orderings to isolate whether a prior hipMalloc poisons hipfft +// plan creation for small G on ROCm 7: +// A) plan BEFORE hipMalloc — hypothesis: succeeds +// B) hipMalloc BEFORE plan — hypothesis: fails for G < 32 +static void tryPlanAndExec(int G, long howmany) { + int n[] = {G}; + long nelems = (long)G * howmany; + + printf("--- G=%-4d howmany=%-10ld total_elems=%-12ld ---\n", + G, howmany, nelems); + + // --- A: create plan first, allocate buffer afterwards --- + { + hipfftHandle p; + size_t workSize = 0; + hipfftCreate(&p); + hipfftResult rv = hipfftMakePlanMany(p, 1, n, + nullptr, 1, G, nullptr, 1, G, + HIPFFT_Z2Z, (int)howmany, &workSize); + printf(" plan-first create : %d (%s)\n", (int)rv, hipfftResultString(rv)); + if (rv == HIPFFT_SUCCESS) { + hipfftDoubleComplex *buf = nullptr; + hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex)); + hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex)); + rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD); + hipDeviceSynchronize(); + printf(" plan-first execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv)); + hipFree(buf); + } + hipfftDestroy(p); + } + + // --- B: hipMalloc first, create plan afterwards --- + { + hipfftDoubleComplex *buf = nullptr; + hipMalloc(&buf, nelems * sizeof(hipfftDoubleComplex)); + hipMemset(buf, 0, nelems * sizeof(hipfftDoubleComplex)); + + hipfftHandle p; + size_t workSize = 0; + hipfftCreate(&p); + hipfftResult rv = hipfftMakePlanMany(p, 1, n, + nullptr, 1, G, nullptr, 1, G, + HIPFFT_Z2Z, (int)howmany, &workSize); + printf(" malloc-first create : %d (%s)\n", (int)rv, hipfftResultString(rv)); + if (rv == HIPFFT_SUCCESS) { + rv = hipfftExecZ2Z(p, buf, buf, HIPFFT_FORWARD); + hipDeviceSynchronize(); + printf(" malloc-first execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv)); + } + hipfftDestroy(p); + hipFree(buf); + } + + printf("\n"); +} + +int main(void) { + // Print HIP device info + int device = 0; + hipGetDevice(&device); + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + printf("Device %d: %s warpSize=%d\n\n", device, prop.name, prop.warpSize); + +#ifdef hipfftVersionMinor + printf("hipFFT version: %d.%d.%d\n\n", + hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch); +#endif + + // Original sweep with small howmany (these passed first time) + printf("=== Small howmany (original sweep) ===\n\n"); + for (int G : {4, 8, 12, 16, 24, 32, 48, 64}) + tryPlanAndExec(G, 512); + + // Grid-realistic howmany values derived from actual lattice geometries. + // howmany = Ncomp * product(ldimensions[d] for d != dim) + // For LatticeComplexD: Ncomp=1. + printf("=== Grid-realistic parameters ===\n\n"); + + // --grid 16.16.16.16 4D FFT (KNOWN TO FAIL in Grid) + // Each dim: G=16, Nperp=16^3=4096 + tryPlanAndExec(16, 4096); + + // --grid 32.32.32.32 4D FFT (KNOWN TO SUCCEED in Grid) + // Each dim: G=32, Nperp=32^3=32768 + tryPlanAndExec(32, 32768); + + // --grid 32.32.32.32 Ls=8 5D DWF FFT (KNOWN TO FAIL on dim 0 in Grid) + // dim 0: G=8, Nperp=32^4=1048576 + tryPlanAndExec(8, 1048576); + // dim 1-4: G=32, Nperp=8*32^3=262144 + tryPlanAndExec(32, 262144); + + // Extra intermediate cases to bracket the failure + tryPlanAndExec(16, 1024); + tryPlanAndExec(16, 2048); + tryPlanAndExec(16, 8192); + tryPlanAndExec(8, 4096); + tryPlanAndExec(8, 65536); + tryPlanAndExec(8, 262144); + + return 0; +} diff --git a/tests/debug/Test_hipfft_repro.cc b/tests/debug/Test_hipfft_repro.cc new file mode 100644 index 00000000..12f0d3d6 --- /dev/null +++ b/tests/debug/Test_hipfft_repro.cc @@ -0,0 +1,168 @@ +/* + * Reproducer for HIPFFT_PARSE_ERROR (error 12) from hipfftMakePlanMany on + * ROCm 7 / hipFFT 1.0.20 (Frontier, MI210 login and MI250X compute nodes). + * + * Observed failure: G < 32 returns HIPFFT_PARSE_ERROR from all three plan + * creation APIs (hipfftPlanMany, hipfftMakePlanMany, hipfftPlan1d) when a + * device buffer is allocated and zeroed with hipMalloc+hipMemset before the + * plan creation call. G >= 32 succeeds. + * + * Contrast with Test_hipfft_minimal.cc (plan-first ordering) which passes + * for all G even with an empty rocFFT cache. + * + * Compile on Frontier (no Grid headers needed): + * hipcc -o Test_hipfft_repro Test_hipfft_repro.cc -lhipfft + * + * Run with empty cache to reproduce the failure: + * rm -rf ~/.cache/rocfft + * ./Test_hipfft_repro + */ + +#include +#include +#include +#include + +static const char *hipfftResultString(hipfftResult r) { + switch (r) { + case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS"; + case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN"; + case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED"; + case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE"; + case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE"; + case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR"; + case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED"; + case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED"; + case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE"; + case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA"; + case HIPFFT_INCOMPLETE_PARAMETER_LIST:return "HIPFFT_INCOMPLETE_PARAMETER_LIST"; + case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE"; + case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR"; + case HIPFFT_NO_WORKSPACE: return "HIPFFT_NO_WORKSPACE"; + case HIPFFT_NOT_IMPLEMENTED: return "HIPFFT_NOT_IMPLEMENTED"; + case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED"; + default: return "UNKNOWN"; + } +} + +// Plan creation + execution for (G, howmany) using hipfftCreate+hipfftMakePlanMany. +// This is the path Grid's FFT.h now uses. +static void tryPlanAndExec(int G, long howmany) { + int n[] = {G}; + long nelems = (long)G * howmany; + + printf("--- G=%-4d howmany=%-10ld total_elems=%-12ld ---\n", + G, howmany, nelems); + + // Allocate device buffer (hipfftDoubleComplex = 16 bytes each) + hipfftDoubleComplex *dbuf = nullptr; + hipError_t herr = hipMalloc(&dbuf, nelems * sizeof(hipfftDoubleComplex)); + if (herr != hipSuccess) { + printf(" hipMalloc failed (%d) for %ld elems — skipping\n\n", (int)herr, nelems); + return; + } + hipMemset(dbuf, 0, nelems * sizeof(hipfftDoubleComplex)); + + // 1. hipfftPlanMany (one-step, nullptr embed) — current Grid path + { + hipfftHandle p; + hipfftResult rv = hipfftPlanMany(&p, 1, n, + nullptr, 1, G, + nullptr, 1, G, + HIPFFT_Z2Z, (int)howmany); + printf(" hipfftPlanMany create : %d (%s)\n", (int)rv, hipfftResultString(rv)); + if (rv == HIPFFT_SUCCESS) { + rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD); + hipDeviceSynchronize(); + printf(" hipfftPlanMany execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv)); + hipfftDestroy(p); + } + } + + // 2. hipfftCreate + hipfftMakePlanMany (two-step) — also current Grid path + { + hipfftHandle p; + size_t workSize = 0; + hipfftResult rc = hipfftCreate(&p); + if (rc == HIPFFT_SUCCESS) { + hipfftResult rv = hipfftMakePlanMany(p, 1, n, + nullptr, 1, G, + nullptr, 1, G, + HIPFFT_Z2Z, (int)howmany, &workSize); + printf(" hipfftMakePlanMany : %d (%s) workSize=%zu\n", + (int)rv, hipfftResultString(rv), workSize); + if (rv == HIPFFT_SUCCESS) { + rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD); + hipDeviceSynchronize(); + printf(" hipfftMakePlanMany exec : %d (%s)\n", (int)rv, hipfftResultString(rv)); + } + hipfftDestroy(p); + } else { + printf(" hipfftCreate : %d (%s)\n", (int)rc, hipfftResultString(rc)); + } + } + + // 3. hipfftPlan1d (simplest API, batch = howmany) + { + hipfftHandle p; + hipfftResult rv = hipfftPlan1d(&p, G, HIPFFT_Z2Z, (int)howmany); + printf(" hipfftPlan1d create : %d (%s)\n", (int)rv, hipfftResultString(rv)); + if (rv == HIPFFT_SUCCESS) { + rv = hipfftExecZ2Z(p, dbuf, dbuf, HIPFFT_FORWARD); + hipDeviceSynchronize(); + printf(" hipfftPlan1d execFwd: %d (%s)\n", (int)rv, hipfftResultString(rv)); + hipfftDestroy(p); + } + } + + hipFree(dbuf); + printf("\n"); +} + +int main(void) { + // Print HIP device info + int device = 0; + hipGetDevice(&device); + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, device); + printf("Device %d: %s warpSize=%d\n\n", device, prop.name, prop.warpSize); + +#ifdef hipfftVersionMinor + printf("hipFFT version: %d.%d.%d\n\n", + hipfftVersionMajor, hipfftVersionMinor, hipfftVersionPatch); +#endif + + // Original sweep with small howmany (these passed first time) + printf("=== Small howmany (original sweep) ===\n\n"); + for (int G : {4, 8, 12, 16, 24, 32, 48, 64}) + tryPlanAndExec(G, 512); + + // Grid-realistic howmany values derived from actual lattice geometries. + // howmany = Ncomp * product(ldimensions[d] for d != dim) + // For LatticeComplexD: Ncomp=1. + printf("=== Grid-realistic parameters ===\n\n"); + + // --grid 16.16.16.16 4D FFT (KNOWN TO FAIL in Grid) + // Each dim: G=16, Nperp=16^3=4096 + tryPlanAndExec(16, 4096); + + // --grid 32.32.32.32 4D FFT (KNOWN TO SUCCEED in Grid) + // Each dim: G=32, Nperp=32^3=32768 + tryPlanAndExec(32, 32768); + + // --grid 32.32.32.32 Ls=8 5D DWF FFT (KNOWN TO FAIL on dim 0 in Grid) + // dim 0: G=8, Nperp=32^4=1048576 + tryPlanAndExec(8, 1048576); + // dim 1-4: G=32, Nperp=8*32^3=262144 + tryPlanAndExec(32, 262144); + + // Extra intermediate cases to bracket the failure + tryPlanAndExec(16, 1024); + tryPlanAndExec(16, 2048); + tryPlanAndExec(16, 8192); + tryPlanAndExec(8, 4096); + tryPlanAndExec(8, 65536); + tryPlanAndExec(8, 262144); + + return 0; +}