mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Compare commits


2 Commits

5 changed files with 57 additions and 150 deletions

View File

@@ -152,6 +152,7 @@ public:
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
#if 0
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// That is a stride of roughly 10^12 draws per site.
@@ -162,9 +163,9 @@ public:
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
@@ -179,6 +180,9 @@ public:
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
#else
eng.discardhi(site);
#endif
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
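The overflow this assert guards against is concrete: with the old shift of 40, site << shift wraps a uint64_t once the site index reaches 2^24 (exactly a 64^4 volume), while shift = 30 is safe up to 2^34 sites. A minimal standalone sketch of the same check (illustrative names, not Grid's):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Per-site stride for the RNG skip, asserting the shift did not overflow.
uint64_t site_stride(uint64_t site, int shift) {
  uint64_t skip = site << shift;
  assert((skip >> shift) == site); // fails once site >= 2^(64-shift)
  return skip;
}

int main() {
  // shift = 30: safe for any site index below 2^34.
  std::printf("%llu\n", (unsigned long long)site_stride(1ULL << 24, 30));
  // With the old shift = 40, site = 2^24 would wrap to zero and the
  // assert above would fire: (0 >> 40) != 2^24.
  return 0;
}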

View File

@@ -218,6 +218,10 @@ public:
// -------------------------------------------------
// misc
// -------------------------------------------------
// Jump the most significant word of the 256-bit counter: each unit of z
// advances the stream by 2^192 counter values, used for per-site skipping.
void discardhi(uint64_t z) {
_s[3] += z;
encrypt_counter();
}
// req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9
// Advances e's state ei to ei+z by any means equivalent to z consecutive calls of e()
@@ -387,4 +391,4 @@ private:
#undef MIXK
#undef MIX2
#endif
#endif
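This is the "skip in the higher order words of state" hack anticipated in the RNG comment above: _s[3] is the most significant word of Sitmo's 256-bit counter, so discardhi(z) jumps the stream by z * 2^192 counter values in O(1), giving each lattice site a disjoint substream without computing site << shift at all. A toy model of the idea (not Sitmo's real code), assuming word 3 carries the highest bits:

#include <cstdint>
#include <cstdio>

struct ToyCounterEngine {
  uint64_t s[4] = {0, 0, 0, 0}; // 256-bit counter, s[3] most significant

  // Advance by z counter steps, carrying into higher words as needed.
  void discard(uint64_t z) {
    uint64_t lo = s[0] + z;
    if (lo < s[0]) { // wrapped: propagate the carry upward
      if (++s[1] == 0 && ++s[2] == 0) ++s[3];
    }
    s[0] = lo;
  }

  // Jump z * 2^192 steps in O(1) by bumping the top word directly.
  void discardhi(uint64_t z) { s[3] += z; }
};

int main() {
  ToyCounterEngine e;
  e.discardhi(5); // five giant, guaranteed-disjoint strides
  std::printf("%llu\n", (unsigned long long)e.s[3]); // prints 5
  return 0;
}

Each such stride dwarfs any realistic number of draws per site, so the per-site substreams can never overlap.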

View File

@@ -1,51 +0,0 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=1
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
HDIR=/home/paboyle/
#module use /soft/testing/modulefiles/
#module load intel-UMD23.05.25593.11/23.05.25593.11
#module load tools/pti-gpu
#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
#export PATH=$HDIR/tools/bin:$PATH
export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
echo Jobid: $PBS_JOBID
echo Running on host `hostname`
echo Running on nodes `cat $PBS_NODEFILE`
echo NODES
cat $PBS_NODEFILE
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
# ./gpu_tile_compact.sh \
# ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
# --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_memory_bandwidth --mpi 1.1.2.6 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD

View File

@@ -0,0 +1,45 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
export OMP_NUM_THREADS=3
# Pipeline inter-node GPU transfers through host bounce buffers (MPICH CH4/OFI)
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD

View File

@@ -1,95 +0,0 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#export OMP_NUM_THREADS=3
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
#echo Jobid: $PBS_JOBID
#echo Running on host `hostname`
#echo Running on nodes `cat $PBS_NODEFILE`
#echo NODES
#cat $PBS_NODEFILE
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=3 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 1-to-1.comms.hmem0
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 1-to-1.comms.hmem1
CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 2-to-2.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
$CMD | tee 2-to-2.comms.hmem0
CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 3-to-3.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
#$CMD | tee 3-to-3.comms.hmem0
CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact4a.sh \
./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 4-to-4.comms.hmem1.nic-affinity
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
$CMD | tee 4-to-4.comms.hmem0
#mpiexec -np 1 --ppn 1 -d 1 numactl -H | tee numa.log
CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 6-to-6.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
$CMD | tee 6-to-6.comms.hmem0
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
#$CMD | tee 12-to-12.comms.hmem1
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0
$CMD | tee 12-to-12.comms.hmem0