diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 2ad48926..32e8108e 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -325,12 +325,12 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, // Start comms // Gather intranode and extra node differentiated?? ///////////////////////////// { - std::cout << " WilsonFermion5D gather " < > requests; auto id=traceStart("Communicate overlapped"); st.CommunicateBegin(requests); @@ -339,7 +339,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, // Overlap with comms ///////////////////////////// { - std::cout << " WilsonFermion5D Comms merge " <::DhopInternalOverlappedComms(StencilImpl & st, ///////////////////////////// // do the compute interior ///////////////////////////// - std::cout << " WilsonFermion5D Interior " <::DhopInternalOverlappedComms(StencilImpl & st, ///////////////////////////// // Complete comms ///////////////////////////// - std::cout << " WilsonFermion5D Comms Complete " <::DhopInternalOverlappedComms(StencilImpl & st, // do the compute exterior ///////////////////////////// { - std::cout << " WilsonFermion5D Comms Merge " <::DhopInternalOverlappedComms(StencilImpl & st, GRID_TRACE("DhopExterior"); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); } - std::cout << " WilsonFermion5D Done " <::DhopInternalSerialComms(StencilImpl & st, int LLs = in.Grid()->_rdimensions[0]; - std::cout << " WilsonFermion5D Halo exch " <::DhopInternalSerialComms(StencilImpl & st, GRID_TRACE("Dhop"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); } - std::cout << " WilsonFermion5D Done " < > &reqs) { // All GPU kernel tasks must complete - // accelerator_barrier(); // All kernels should ALREADY be complete - // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + accelerator_barrier(); // All kernels should ALREADY be complete + _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, @@ -390,8 +390,8 @@ public: if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete - // accelerator_barrier(); + acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete + accelerator_barrier(); _grid->StencilBarrier(); // run any checksums for(int i=0;i void HaloGather(const Lattice &source,compressor &compress) { - // accelerator_barrier(); + accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -487,6 +487,7 @@ public: HaloGatherDir(source,compress,point,face_idx); } accelerator_barrier(); // All my local gathers are complete + _grid->StencilBarrier();// Synch shared memory on a single nodes face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); } @@ -653,7 +654,9 @@ public: } } } + std::cout << "BuildSurfaceList size is "< surface_list_host(surface_list_size); int32_t ss=0; for(int site = 0 ;site< vol4;site++){ int local = 1; @@ -665,12 +668,12 @@ public: if(local == 0) { for(int s=0;ssi_signo); + fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); + fprintf(stderr," code %d\n",si->si_code); + // x86 64bit +#ifdef __linux__ +#ifdef __x86_64__ + ucontext_t * uc= (ucontext_t *)ptr; + struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; + fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); +#endif +#endif + fflush(stderr); + BACKTRACEFP(stderr); + fprintf(stderr,"Called backtrace\n"); + fflush(stdout); + fflush(stderr); + return; +} + void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) { + fprintf(stderr,"Signal handler on host %s\n",hostname); fprintf(stderr,"Caught signal %d\n",si->si_signo); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," code %d\n",si->si_code); @@ -561,7 +584,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) ucontext_t * uc= (ucontext_t *)ptr; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); -#define REG(A) printf(" %s %lx\n",#A,sc-> A); +#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A); REG(rdi); REG(rsi); REG(rbp); @@ -594,8 +617,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_exit_handler(void) { - BACKTRACEFP(stdout); - fflush(stdout); + // BACKTRACEFP(stdout); + // fflush(stdout); } void Grid_debug_handler_init(void) { @@ -603,10 +626,10 @@ void Grid_debug_handler_init(void) sigemptyset (&sa.sa_mask); sa.sa_sigaction= Grid_sa_signal_handler; sa.sa_flags = SA_SIGINFO; - sigaction(SIGSEGV,&sa,NULL); + // sigaction(SIGSEGV,&sa,NULL); sigaction(SIGTRAP,&sa,NULL); sigaction(SIGBUS,&sa,NULL); - sigaction(SIGUSR2,&sa,NULL); + // sigaction(SIGUSR2,&sa,NULL); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); @@ -614,7 +637,14 @@ void Grid_debug_handler_init(void) sigaction(SIGKILL,&sa,NULL); sigaction(SIGILL,&sa,NULL); - atexit(Grid_exit_handler); + // Non terminating SIGUSR1/2 handler + struct sigaction sa_ping; + sigemptyset (&sa_ping.sa_mask); + sa_ping.sa_sigaction= Grid_usr_signal_handler; + sa_ping.sa_flags = SA_SIGINFO; + sigaction(SIGHUP,&sa_ping,NULL); + + // atexit(Grid_exit_handler); } NAMESPACE_END(Grid); diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs index a202b587..e85dc09e 100644 --- a/systems/Aurora/benchmarks/bench1.pbs +++ b/systems/Aurora/benchmarks/bench1.pbs @@ -5,63 +5,34 @@ #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA -#export OMP_PROC_BIND=spread -#unset OMP_PLACES - cd $PBS_O_WORKDIR source ../sourceme.sh -module load pti-gpu -#cat $PBS_NODEFILE +cp $PBS_NODEFILE nodefile export OMP_NUM_THREADS=4 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 - -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE -#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE -#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST - +unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU -# 12 ppn, 2 nodes, 24 ranks -# -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_usqcd --mpi 1.1.1.1 --grid 24.32.32.24 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD | tee usqcd.log - - -CMD="mpiexec -np 1 -ppn 1 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -$CMD | tee 1tile.dwf - CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.32.32.32.48.dwf + ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals" - -CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.64.64.32.96.dwf - -CMD="mpiexec -np 12 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ - ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -#$CMD | tee 1node.64.32.32.48.dwf +#for f in 1 2 3 4 5 6 7 8 +for f in 1 +do +echo $CMD +$CMD | tee 1node.32.32.64.48.dwf.hbm.$f +done diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs index ce477319..4b8eb3fc 100644 --- a/systems/Aurora/benchmarks/bench2.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -11,17 +11,16 @@ cd $PBS_O_WORKDIR source ../sourceme.sh -module load pti-gpu +#module load pti-gpu -#cat $PBS_NODEFILE + +cp $PBS_NODEFILE nodefile export OMP_NUM_THREADS=4 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 - #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST - export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 @@ -34,22 +33,26 @@ export MPICH_OFI_NIC_POLICY=GPU # 12 ppn, 2 nodes, 24 ranks # CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ + ./gpu_tile.sh \ ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD | tee 2node.comms +#$CMD | tee 2node.comms.hbm CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -$CMD | tee 2node.32.32.64.48.dwf + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals" +#for f in 1 2 3 4 5 6 7 8 +for f in 1 +do +echo $CMD +$CMD | tee 2node.32.32.64.48.dwf.hbm.$f +done CMD="mpiexec -np 24 -ppn 12 -envall \ - ./gpu_tile_compact.sh \ + ./gpu_tile.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ - --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -$CMD | tee 2node.64.64.64.96.dwf + --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +#$CMD | tee 2node.64.64.64.96.dwf.hbm diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 8ccba356..7952a819 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,4 +1,6 @@ module load oneapi/release/2023.12.15.001 +#module load mpich/icc-all-debug-pmix-gpu/52.2 +#module load mpich-config/mode/deterministic #module load intel_compute_runtime/release/821.35 source ~/spack/share/spack/setup-env.sh spack load c-lime diff --git a/systems/Aurora/tests/reproBigJob.pbs b/systems/Aurora/tests/reproBigJob.pbs index 721b4707..1d880f0d 100644 --- a/systems/Aurora/tests/reproBigJob.pbs +++ b/systems/Aurora/tests/reproBigJob.pbs @@ -15,13 +15,13 @@ # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 -#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 -#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 @@ -30,20 +30,22 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export GRID_PRINT_ENTIRE_LOG=0 -export GRID_CHECKSUM_RECV_BUF=1 -export GRID_CHECKSUM_SEND_BUF=1 +export GRID_CHECKSUM_RECV_BUF=0 +export GRID_CHECKSUM_SEND_BUF=0 export MPICH_OFI_NIC_POLICY=GPU -export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 -export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 -export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling -unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE -unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE -unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE +#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE cd $PBS_O_WORKDIR +cp $PBS_NODEFILE nodefile + DIR=reproBigJob.$PBS_JOBID mkdir -p $DIR @@ -51,10 +53,19 @@ cd $DIR cp $PBS_NODEFILE nodefile +BINARY=../Test_dwf_mixedcg_prec + +echo > pingjob < command-line env > environment