From 40e119c61cac619b7fa1874e5fa7ccdc1dcb77cb Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Sat, 8 Jul 2017 22:27:11 -0400
Subject: [PATCH 1/3] NUMA improvements worth preserving from AMD EPYC tests

---
 benchmarks/Benchmark_memory_bandwidth.cc | 48 ++++++++++++------------
 lib/allocator/AlignedAllocator.h         |  3 +-
 lib/communicator/Communicator_mpi3.cc    | 20 +++++++++-
 3 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 1136dfe0..848f271d 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -60,16 +60,16 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){

     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

     uint64_t Nloop=NLOOP;

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;

@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;

       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
       std::cout<

@@ ... @@
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;

     uint64_t Nloop=NLOOP;

@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;

       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
       std::cout<

@@ ... @@
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;

@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;

-      double bytes=2*vol*Nvec*sizeof(Real);
+      double bytes=2.0*vol*Nvec*sizeof(Real);
       double flops=vol*Nvec*1;// mul
       std::cout<

@@ ... @@
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;
     Real nn;
     double start=usecond();

@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;

-      double bytes=vol*Nvec*sizeof(Real);
+      double bytes=1.0*vol*Nvec*sizeof(Real);
       double flops=vol*Nvec*2;// mul,add
       std::cout<

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ ... @@
 #include 
 #include 
 #include 
-//#include 
+#include 
+#include <numaif.h>
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -214,6 +215,23 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
       assert(((uint64_t)ptr&0x3F)==0);
+
+      int status;
+      int flags=MPOL_MF_MOVE;
+#ifdef KNL
+      int nodes=1; // numa domain == MCDRAM
+      // Find out if in SNC2,SNC4 mode ?
+#else
+      int nodes=r; // numa domain == MPI ID
+#endif
+      unsigned long count=1;
+      for(uint64_t page=0;page
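The tail of the Communicator_mpi3.cc hunk above (and the small AlignedAllocator.h hunk listed in the diffstat) did not survive extraction, but the idiom the hunk introduces is self-contained: after mmap()ing the shared window, touch each 4 KiB page and ask the kernel to migrate it to the chosen NUMA domain with move_pages(2) from libnuma. The sketch below is illustrative only, not the patch's code: an anonymous mapping stands in for the shm segment, and the segment size and target node are invented for the example. Build with `g++ numa_sketch.cc -lnuma`.

```c++
// Standalone sketch of the first-touch-then-migrate idiom (assumed names).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <sys/mman.h>
#include <numaif.h>     // move_pages, MPOL_MF_MOVE; link with -lnuma

int main(void)
{
  const uint64_t size = 1ULL << 24;   // 16 MiB stand-in for the shm window
  const int      node = 0;            // target NUMA domain; rank-derived in the patch

  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert(ptr != MAP_FAILED);

  for (uint64_t page = 0; page < size; page += 4096) {
    void *pages = (void *)((uint64_t)ptr + page);
    memset(pages, 0, 1);              // first touch: a page must exist before it can move
    int  status = 0;
    long ierr = move_pages(0 /*this process*/, 1, &pages, &node, &status, MPOL_MF_MOVE);
    if (ierr && page == 0) perror("move_pages");
  }
  munmap(ptr, size);
  return 0;
}
```

Moving one page per call keeps the sketch close to the patch's `count=1` loop; batching all page addresses into a single move_pages() call would amortise the syscall cost.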
From ... Mon Sep 17 00:00:00 2001
Date: Sun, 9 Jul 2017 00:11:54 +0100
Subject: [PATCH 2/3] Update README.md

---
 README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/README.md b/README.md
index 3572be26..e0a9bb14 100644
--- a/README.md
+++ b/README.md
@@ -324,6 +324,60 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default.

+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising four distinct chips of 8 cores each, 32 cores in total.
+Even a single-socket node therefore contains a quad-chip module; dual-socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain, giving four NUMA domains
+per socket, and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended, with four ranks per socket and 8 threads per rank.
+
+The following configuration is recommended for the AMD EPYC platform:
+
+``` bash
+../configure --enable-precision=double \
+             --enable-simd=AVX2        \
+             --enable-comms=mpi3       \
+             CXX=mpicxx
+```
+
+If GMP and MPFR are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+             --with-gmp=<path>  \
+             --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Using MPICH and g++ v4.9.2, best performance can be obtained by setting explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking MPI through a wrapper script, omp_bind.sh.
+
+It is recommended to run 8 MPI ranks on a single dual-socket AMD EPYC, with 8 threads per rank, using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
 ### Build setup for BlueGene/Q

 To be written...
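The wrapper keys off `$PMI_RANK`, which the MPICH and Intel MPI launchers export. OpenMPI exports `OMPI_COMM_WORLD_LOCAL_RANK` instead; a hypothetical equivalent is sketched below. The core numbering (8 NUMA domains, 16 hardware threads per domain, even thread IDs only, i.e. SMT-2) mirrors the script above and should be checked against `numactl --hardware` on the target node.

```
#!/bin/bash
# omp_bind_ompi.sh (illustrative): bind each OpenMPI local rank to the even
# hardware threads of its NUMA domain before exec'ing the real binary.
numanode=$(( OMPI_COMM_WORLD_LOCAL_RANK % 8 ))
basecore=$(( numanode * 16 ))
cores=""
for i in 0 2 4 6 8 10 12 14 ; do
  cores="$cores $(( basecore + i ))"
done
export GOMP_CPU_AFFINITY="$cores"
echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
exec "$@"
```

Using exec means the benchmark replaces the wrapper shell, so signals from mpirun reach the application directly.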
From dc6f078246b006ad1b3e61c513273b73f8f0da81 Mon Sep 17 00:00:00 2001
From: azusayamaguchi
Date: Tue, 11 Jul 2017 14:15:08 +0100
Subject: [PATCH 3/3] fixed the header file for mpi3

---
 configure.ac                          |  8 +++++++-
 lib/communicator/Communicator_mpi3.cc | 18 +++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8c43d67a..dc6754da 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,6 +51,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include ]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include ]])

@@ -186,9 +187,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 AC_SEARCH_LIBS([crc32], [z],
                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-               [have_zlib=true],
+               [have_zlib=true] [LIBS="${LIBS} -lz"],
                [AC_MSG_ERROR(zlib library was not found in your system.)])

+AC_SEARCH_LIBS([move_pages], [numa],
+               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+               [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
+
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
                [have_hdf5=true]

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index f5646d44..4192300b 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -38,7 +38,9 @@ Author: Peter Boyle
 #include 
 #include 
 #include 
+#ifdef HAVE_NUMAIF_H
 #include <numaif.h>
+#endif
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -216,6 +218,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
       assert(((uint64_t)ptr&0x3F)==0);

+      // Try to force numa domain on the shm segment if we have numaif.h
+#ifdef HAVE_NUMAIF_H
       int status;
       int flags=MPOL_MF_MOVE;
 #ifdef KNL
@@ -225,13 +229,13 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       int nodes=r; // numa domain == MPI ID
 #endif
       unsigned long count=1;
-      for(uint64_t page=0;page
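Once configure defines HAVE_NUMAIF_H and HAVE_LIBNUMA, the same guard can wrap a placement check: move_pages(2) called with a NULL node array queries where each page currently resides instead of migrating it. The helper below is an illustrative sketch, not Grid code (the function name and MiB sampling stride are invented), but it is a handy way to confirm that the shared segment really landed in the intended domain while still compiling to a stub on systems without libnuma.

```c++
// Sketch: report the NUMA node backing a buffer, sampled once per MiB.
// Mirrors the patch's guard so builds without numaif.h still compile.
#include <cstdint>
#include <cstdio>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>   // move_pages; link with -lnuma
#endif

void report_numa_placement(void *ptr, uint64_t size)
{
#ifdef HAVE_NUMAIF_H
  for (uint64_t off = 0; off < size; off += (1ULL << 20)) {
    void *page  = (void *)((uint64_t)ptr + off);
    int   where = -1;   // receives the node ID, or a negative errno if unmapped
    if (move_pages(0, 1, &page, NULL, &where, 0) == 0)
      printf("offset %llu KiB -> numa node %d\n",
             (unsigned long long)(off >> 10), where);
  }
#else
  (void)ptr; (void)size;  // libnuma absent: nothing to report
#endif
}
```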