diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index b667d32e..38b9f9c6 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -746,26 +746,34 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list,int dir) { - // int nreq=list.size(); + acceleratorCopySynchronise(); // Complete all pending copy transfers D2D - // if (nreq==0) return; - // std::vector status(nreq); - // std::vector MpiRequests(nreq); + std::vector status; + std::vector MpiRequests; + + for(int r=0;r0) { + status.resize(MpiRequests.size()); + int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. + assert(ierr==0); + } - // int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. - // assert(ierr==0); - // for(int r=0;rHostBufferFreeAll(); // Clean up the buffer allocs diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index f162a903..3dee8f4d 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, { assert(0); } -void CartesianCommunicator::CommsComplete(std::vector &list){ assert(0);} +void CartesianCommunicator::CommsComplete(std::vector &list){ assert(list.size()==0);} void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, int dest, diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f3371f98..b2a40e7b 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -245,12 +245,12 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} -inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} -inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} +inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream); } @@ -359,12 +359,12 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev) return (ev.get_info() == sycl::info::event_command_status::complete); } -inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} -inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);} +inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } +inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); } -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) @@ -511,19 +511,19 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch { auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream); } -inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream); } -inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { +inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream); } inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); }; @@ -583,9 +583,9 @@ inline void acceleratorMem(void) accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} -inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} +inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } +inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} inline void acceleratorCopySynchronise(void) {}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } @@ -668,15 +668,15 @@ accelerator_inline void acceleratorFence(void) return; } -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) +inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes) { acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); acceleratorCopySynchronise(); } -template void acceleratorPut(T& dev,T&host) +template void acceleratorPut(T& dev,const T&host) { - acceleratorCopyToDevice(&host,&dev,sizeof(T)); + acceleratorCopyToDevice((void *)&host,&dev,sizeof(T)); } template T acceleratorGet(T& dev) { diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h index 6887134d..cdb4fa62 100644 --- a/Grid/threads/Threads.h +++ b/Grid/threads/Threads.h @@ -73,9 +73,9 @@ Author: paboyle #define thread_critical DO_PRAGMA(omp critical) #ifdef GRID_OMP -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { - uint64_t *ufrom = (uint64_t *)from; + const uint64_t *ufrom = (const uint64_t *)from; uint64_t *uto = (uint64_t *)to; assert(bytes%8==0); uint64_t words=bytes/8; @@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes) }); } #else -inline void thread_bcopy(void *from, void *to,size_t bytes) +inline void thread_bcopy(const void *from, void *to,size_t bytes) { bcopy(from,to,bytes); } diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 1424667e..feb44645 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv) Grid_default_latt, Grid_default_mpi); - + if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){ + std::cout << GridLogMessage <<" Enabling flight recorder " <=2*1024*1024*1024LL ){ - std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<Barrier(); double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); double t1=usecond(); - uint64_t ncall = 500; - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t no = 50; + uint64_t ni = 100; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); - double t1=usecond(); - uint64_t ncall = 500; - - FGrid->Broadcast(0,&ncall,sizeof(ncall)); + uint64_t ni = 100; + uint64_t no = 50; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); - for(uint64_t i=0;i t_time(no); + for(uint64_t i=0;iBarrier(); @@ -814,20 +818,21 @@ public: double mf_hi, mf_lo, mf_err; timestat.statistics(t_time); - mf_hi = flops/timestat.min; - mf_lo = flops/timestat.max; + mf_hi = flops/timestat.min*ni; + mf_lo = flops/timestat.max*ni; mf_err= flops/timestat.min * timestat.err/timestat.mean; - mflops = flops/timestat.mean; + mflops = flops/timestat.mean*ni; mflops_all.push_back(mflops); if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops L_list({8,12,16,24}); + std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector clover; diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index 6e5512ff..08b77f4f 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -1,18 +1,19 @@ #Ahead of time compile for PVC -export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib" -export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/" +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC" #JIT compile #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions " -../../configure \ +../configure \ --enable-simd=GPU \ --enable-reduction=grid \ --enable-gen-simd-width=64 \ --enable-comms=mpi-auto \ --enable-debug \ + --prefix $HOME/gpt-install \ --disable-gparity \ --disable-fermion-reps \ --with-lime=$CLIME \ diff --git a/systems/WorkArounds.txt b/systems/WorkArounds.txt new file mode 100644 index 00000000..7191b4ff --- /dev/null +++ b/systems/WorkArounds.txt @@ -0,0 +1,206 @@ +The purpose of this file is to collate all non-obvious known magic shell variables +and compiler flags required for either correctness or performance on various systems. + +A repository of work-arounds. + +Contents: +1. Interconnect + MPI +2. Compilation +3. Profiling + +************************ +* 1. INTERCONNECT + MPI +************************ + +-------------------------------------------------------------------- +MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O +-------------------------------------------------------------------- +export OMPI_MCA_io=romio321 + +-------------------------------------- +ROMIO fail with > 2GB per node read (32 bit issue) +-------------------------------------- + +Use later MPICH + +https://github.com/paboyle/Grid/issues/381 + +https://github.com/pmodels/mpich/commit/3a479ab0 + +-------------------------------------------------------------------- +Slingshot: Frontier and Perlmutter libfabric slow down +and physical memory fragmentation +-------------------------------------------------------------------- +export FI_MR_CACHE_MONITOR=disabled +or +export FI_MR_CACHE_MONITOR=kdreg2 + +-------------------------------------------------------------------- +Perlmutter +-------------------------------------------------------------------- + +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_IPC_ENABLED=1 +export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0 +export MPICH_GPU_NO_ASYNC_MEMCPY=0 + +-------------------------------------------------------------------- +Frontier/LumiG +-------------------------------------------------------------------- + +Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU + +cat << EOF > select_gpu +#!/bin/bash +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_SMP_SINGLE_COPY_MODE=XPMEM +export GPU_MAP=(0 1 2 3 7 6 5 4) +export NUMA_MAP=(3 3 1 1 2 2 0 0) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +export HIP_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec numactl -m \$NUMA -N \$NUMA \$* +EOF +chmod +x ./select_gpu + +srun ./select_gpu BINARY + + +-------------------------------------------------------------------- +Mellanox performance with A100 GPU (Tursa, Booster, Leonardo) +-------------------------------------------------------------------- +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=yes + +-------------------------------------------------------------------- +Mellanox + A100 correctness (Tursa, Booster, Leonardo) +-------------------------------------------------------------------- +export UCX_MEMTYPE_CACHE=n + +-------------------------------------------------------------------- +MPICH/Aurora/PVC correctness and performance +-------------------------------------------------------------------- + +https://github.com/pmodels/mpich/issues/7302 + +--enable-cuda-aware-mpi=no +--enable-unified=no + +Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI +Do not use SVM + +Ideally use MPICH with fix to issue 7302: + +https://github.com/pmodels/mpich/pull/7312 + +Ideally: +MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic + +Alternatives: +export MPIR_CVAR_NOLOCAL=1 +export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000 + +-------------------------------------------------------------------- +MPICH/Aurora/PVC correctness and performance +-------------------------------------------------------------------- + +Broken: +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +This gives good peformance without requiring +--enable-cuda-aware-mpi=no + +But is an open issue reported by James Osborn +https://github.com/pmodels/mpich/issues/7139 + +Possibly resolved but unclear if in the installed software yet. + +************************ +* 2. COMPILATION +************************ + +-------------------------------------------------------------------- +G++ compiler breakage / graveyard +-------------------------------------------------------------------- + +9.3.0, 10.3.1, +https://github.com/paboyle/Grid/issues/290 +https://github.com/paboyle/Grid/issues/264 + +Working (-) Broken (X): + +4.9.0 - +4.9.1 - +5.1.0 X +5.2.0 X +5.3.0 X +5.4.0 X +6.1.0 X +6.2.0 X +6.3.0 - +7.1.0 - +8.0.0 (HEAD) - + +https://github.com/paboyle/Grid/issues/100 + +-------------------------------------------------------------------- +AMD GPU nodes : +-------------------------------------------------------------------- + +multiple ROCM versions broken; use 5.3.0 +manifests itself as wrong results in fp32 + +https://github.com/paboyle/Grid/issues/464 + +-------------------------------------------------------------------- +Aurora/PVC +-------------------------------------------------------------------- + +SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB) +SYCL slow link and relocatable code issues (Christoph Lehner) +Opt large register file required for good performance in fp64 + + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" +export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc" +export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC" + +-------------------------------------------------------------------- +Aurora/PVC useful extra options +-------------------------------------------------------------------- + +Host only sanitizer: +-Xarch_host -fsanitize=leak +-Xarch_host -fsanitize=address + +Deterministic MPI reduction: +export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 +export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling +unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE +unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE + + + +************************ +* 3. Visual profile tools +************************ + +-------------------------------------------------------------------- +Frontier/rocprof +-------------------------------------------------------------------- + +-------------------------------------------------------------------- +Aurora/unitrace +-------------------------------------------------------------------- + + +-------------------------------------------------------------------- +Tursa/nsight-sys +-------------------------------------------------------------------- diff --git a/systems/sdcc-genoa/bench.slurm b/systems/sdcc-genoa/bench.slurm new file mode 100644 index 00000000..2c7f6c32 --- /dev/null +++ b/systems/sdcc-genoa/bench.slurm @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#for vol in 24.24.24.24 32.32.32.32 48.48.48.96 +for vol in 48.48.48.96 +do +srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr +done +#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr +done + diff --git a/systems/sdcc-genoa/bench2.slurm b/systems/sdcc-genoa/bench2.slurm new file mode 100644 index 00000000..be21c816 --- /dev/null +++ b/systems/sdcc-genoa/bench2.slurm @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:50:00 +#SBATCH -A lqcdtest +#SBATCH -q lqcd +#SBATCH --exclusive +#SBATCH --nodes=2 +#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054 +#SBATCH --ntasks=2 +#SBATCH --cpus-per-task=64 +#SBATCH --qos lqcd + +source sourceme.sh + +export PLACES=(1:16:4 1:32:2 0:64:1); +export THR=(16 32 64) + +nodes=2 +mpi=1.1.1.2 + +for t in 2 +do + +export OMP_NUM_THREADS=${THR[$t]} +export OMP_PLACES=${PLACES[$t]} +export thr=${THR[$t]} + +#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr + +for vol in 64.64.64.128 +do +srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr +done + +done + diff --git a/systems/sdcc-genoa/config-command b/systems/sdcc-genoa/config-command new file mode 100644 index 00000000..d992e1da --- /dev/null +++ b/systems/sdcc-genoa/config-command @@ -0,0 +1,16 @@ +../../configure \ +--enable-comms=mpi-auto \ +--enable-unified=yes \ +--enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ +--enable-accelerator=none \ +--enable-simd=AVX512 \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=clang++ \ +MPICXX=mpicxx \ +CXXFLAGS="-std=c++17" + + + diff --git a/systems/sdcc-genoa/sourceme.sh b/systems/sdcc-genoa/sourceme.sh new file mode 100644 index 00000000..4f37888c --- /dev/null +++ b/systems/sdcc-genoa/sourceme.sh @@ -0,0 +1,4 @@ +source $HOME/spack/share/spack/setup-env.sh +spack load llvm@17.0.4 +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH +module load openmpi