diff --git a/Grid/algorithms/GeneralCoarsenedMatrix.h b/Grid/algorithms/GeneralCoarsenedMatrix.h index b87f4cb3..86971fdf 100644 --- a/Grid/algorithms/GeneralCoarsenedMatrix.h +++ b/Grid/algorithms/GeneralCoarsenedMatrix.h @@ -300,10 +300,10 @@ public: const int Nsimd = CComplex::Nsimd(); int osites=pin.Grid()->oSites(); - int gsites=pin.Grid()->gSites(); + // int gsites=pin.Grid()->gSites(); - RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites; - RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint; + RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites; + RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint; // for(int point=0;pointgSites() ;bidx++){ + for(int64_t bidx=0;bidxgSites() ;bidx++){ Coordinate bcoor; CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor); @@ -543,10 +543,13 @@ public: } // Only needed if nonhermitian - if ( ! hermitian ) + if ( ! hermitian ) { + std::cout << GridLogMessage<<"PopulateAdag "< std::ostream& operator<< (std::ostream& stream, const Lattice &o){ typedef typename vobj::scalar_object sobj; - for(int g=0;g_gsites;g++){ + for(int64_t g=0;g_gsites;g++){ Coordinate gcoor; o.Grid()->GlobalIndexToGlobalCoor(g,gcoor); diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index c9f6aa52..a19edf00 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -432,7 +432,7 @@ public: #if 1 thread_for( lidx, _grid->lSites(), { - int gidx; + int64_t gidx; int o_idx; int i_idx; int rank; diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index f22b7001..a2e4982e 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -1054,7 +1054,7 @@ void Replicate(const Lattice &coarse,Lattice & fine) Coordinate fcoor(nd); Coordinate ccoor(nd); - for(int g=0;ggSites();g++){ + for(int64_t g=0;ggSites();g++){ fg->GlobalIndexToGlobalCoor(g,fcoor); for(int d=0;d - static accelerator_inline void 
CoorFromIndex (coor_t& coor,int index,const coor_t &dims){ + static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){ int nd= dims.size(); coor.resize(nd); for(int d=0;d - static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){ + static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){ int nd=dims.size(); int stride=1; index=0; for(int d=0;d + static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){ + int64_t index64; + IndexFromCoor(coor,index64,dims); + assert(index64<2*1024*1024*1024LL); + index = (int) index64; + } template - static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){ + static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){ int nd=dims.size(); int stride=1; index=0; for(int d=nd-1;d>=0;d--){ - index = index+stride*coor[d]; + index = index+(int64_t)stride*coor[d]; stride=stride*dims[d]; } } template - static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){ + static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){ + int64_t index64; + IndexFromCoorReversed(coor,index64,dims); + if ( index64>=2*1024*1024*1024LL ){ + std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "< + static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){ int nd= dims.size(); coor.resize(nd); for(int d=nd-1;d>=0;d--){ diff --git a/systems/Frontier/benchmarks/bench2.slurm b/systems/Frontier/benchmarks/bench2.slurm new file mode 100755 index 00000000..cc82de79 --- /dev/null +++ b/systems/Frontier/benchmarks/bench2.slurm @@ -0,0 +1,43 @@ +#!/bin/bash -l +#SBATCH --job-name=bench +##SBATCH --partition=small-g +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=7 +#SBATCH --gpus-per-node=8 +#SBATCH 
--time=00:10:00 +#SBATCH --account=phy157_dwf +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH --mem=0 + +cat << EOF > select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3 7 6 5 4) +export NUMA_MAP=(3 3 1 1 2 2 0 0) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +export HIP_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec numactl -m \$NUMA -N \$NUMA \$* +EOF + +chmod +x ./select_gpu + +root=$HOME/Frontier/Grid/systems/Frontier/ +source ${root}/sourceme.sh + +export OMP_NUM_THREADS=7 +export MPICH_GPU_SUPPORT_ENABLED=1 +export MPICH_SMP_SINGLE_COPY_MODE=XPMEM + +for vol in 32.32.32.64 +do +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol + +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol +srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol +done + diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command new file mode 100644 index 00000000..b932ba7f --- /dev/null +++ b/systems/Frontier/config-command @@ -0,0 +1,23 @@ +CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` +../../configure --enable-comms=mpi-auto \ +--with-lime=$CLIME \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-tracing=timer \ +--enable-accelerator=hip \ +--enable-gen-simd-width=64 \ +--disable-gparity \ +--disable-fermion-reps \ +--enable-simd=GPU \ +--enable-accelerator-cshift \ +--with-gmp=$OLCF_GMP_ROOT \ +--with-fftw=$FFTW_DIR/.. 
\ +--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ +--disable-fermion-reps \ +CXX=hipcc MPICXX=mpicxx \ +CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \ + LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " + + + + diff --git a/systems/Frontier/mpiwrapper.sh b/systems/Frontier/mpiwrapper.sh new file mode 100755 index 00000000..f6a56698 --- /dev/null +++ b/systems/Frontier/mpiwrapper.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +lrank=$SLURM_LOCALID +lgpu=(0 1 2 3 7 6 5 4) + +export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]} + +echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES " + +"$@" + + + diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh new file mode 100644 index 00000000..987241b4 --- /dev/null +++ b/systems/Frontier/sourceme.sh @@ -0,0 +1,13 @@ +. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh +spack load c-lime +#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib +module load emacs +module load PrgEnv-gnu +module load rocm +module load cray-mpich/8.1.23 +module load gmp +module load cray-fftw +module load craype-accel-amd-gfx90a +export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH +#Hack for lib +#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH diff --git a/systems/Frontier/wrap.sh b/systems/Frontier/wrap.sh new file mode 100755 index 00000000..eb58353c --- /dev/null +++ b/systems/Frontier/wrap.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES +unset ROCR_VISIBLE_DEVICES + +#rank=$SLURM_PROCID +#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@ + +"$@" diff --git a/tests/debug/Test_general_coarse_hdcg.cc b/tests/debug/Test_general_coarse_hdcg.cc index becb7e51..2fe0b90a 100644 --- a/tests/debug/Test_general_coarse_hdcg.cc +++ 
b/tests/debug/Test_general_coarse_hdcg.cc @@ -305,10 +305,6 @@ int main (int argc, char ** argv) // std::vector ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults) std::vector ords({7}); // Nbasis 40 == 40 iters (320 mults) - // Standard CG - // result=Zero(); - // CGfine(HermOpEO, src, result); - for(int l=0;l