From 3d014864e24f753a84c5bb4966eff12696b0509e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 6 Mar 2025 14:18:43 -0500 Subject: [PATCH] Makinig LLVM happy --- Grid/threads/Accelerator.h | 38 ++++++++--------- Grid/threads/Threads.h | 6 +-- benchmarks/Benchmark_usqcd.cc | 69 +++++++++++++++++-------------- systems/sdcc-genoa/bench.slurm | 32 ++++++++++++++ systems/sdcc-genoa/bench2.slurm | 36 ++++++++++++++++ systems/sdcc-genoa/config-command | 16 +++++++ systems/sdcc-genoa/sourceme.sh | 4 ++ 7 files changed, 147 insertions(+), 54 deletions(-) create mode 100644 systems/sdcc-genoa/bench.slurm create mode 100644 systems/sdcc-genoa/bench2.slurm create mode 100644 systems/sdcc-genoa/config-command create mode 100644 systems/sdcc-genoa/sourceme.sh diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 28c3aa0a..b2a40e7b 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} -inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} -inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} +inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream); } @@ -359,12 +359,12 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev) return (ev.get_info() == sycl::info::event_command_status::complete); } -inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} -inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} +inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) @@ -511,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch { auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream); } -inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream); } -inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream); } inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); }; @@ -583,9 +583,9 @@ inline void acceleratorMem(void) accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} inline void acceleratorCopySynchronise(void) {}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } @@ -668,7 +668,7 @@ accelerator_inline void acceleratorFence(void) return; } -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) +inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes) { acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); acceleratorCopySynchronise(); diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 6887134d..cdb4fa62 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -73,9 +73,9 @@ Author: paboyle #define thread_critical DO_PRAGMA(omp critical) #ifdef GRID_OMP -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { - uint64_t *ufrom = (uint64_t *)from; + const uint64_t *ufrom = (const uint64_t *)from; uint64_t *uto = (uint64_t *)to; assert(bytes%8==0); uint64_t words=bytes/8; @@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes) }); } #else -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { bcopy(from,to,bytes); } diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index e400138b..4b50121e 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -492,17 +492,18 @@ public: } FGrid->Barrier(); double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); double t1=usecond(); - uint64_t ncall = 500; - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); - double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t ni = 100; + uint64_t no = 50; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); @@ -814,20 +818,21 @@ public: double mf_hi, mf_lo, mf_err; timestat.statistics(t_time); - mf_hi = flops/timestat.min; - mf_lo = flops/timestat.max; + mf_hi = flops/timestat.min*ni; + mf_lo = flops/timestat.max*ni; mf_err= flops/timestat.min * timestat.err/timestat.mean; - mflops = flops/timestat.mean; + mflops = flops/timestat.mean*ni; mflops_all.push_back(mflops); if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops L_list({8,12,16,24}); + std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector clover; diff --git a/systems/sdcc-genoa/bench.slurm b/systems/sdcc-genoa/bench.slurm new file mode 100644 index 00000000..2c7f6c32 --- /dev/null +++ b/systems/sdcc-genoa/bench.slurm @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#for vol in 24.24.24.24 32.32.32.32 48.48.48.96 +for vol in 48.48.48.96 +do +srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr +done +#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr +done + diff --git a/systems/sdcc-genoa/bench2.slurm b/systems/sdcc-genoa/bench2.slurm new file mode 100644 index 00000000..be21c816 --- /dev/null +++ b/systems/sdcc-genoa/bench2.slurm @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=2 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=2 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +nodes=2 +mpi=1.1.1.2 + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr + +for vol in 64.64.64.128 +do +srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr +done + +done + diff --git a/systems/sdcc-genoa/config-command b/systems/sdcc-genoa/config-command new file mode 100644 index 00000000..d992e1da --- /dev/null +++ b/systems/sdcc-genoa/config-command @@ -0,0 +1,16 @@ +../../configure \ +--enable-comms=mpi-auto \ +--enable-unified=yes \ +--enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ +--enable-accelerator=none \ +--enable-simd=AVX512 \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=clang++ \ +MPICXX=mpicxx \ +CXXFLAGS="-std=c++17" + + + diff --git a/systems/sdcc-genoa/sourceme.sh b/systems/sdcc-genoa/sourceme.sh new file mode 100644 index 00000000..4f37888c --- /dev/null +++ b/systems/sdcc-genoa/sourceme.sh @@ -0,0 +1,4 @@ +source $HOME/spack/share/spack/setup-env.sh +spack load llvm@17.0.4 +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH +module load openmpi