diff --git a/README.md b/README.md
index 3572be26..e0a9bb14 100644
--- a/README.md
+++ b/README.md
@@ -324,6 +324,60 @@ one rank per socket.
 If using the Intel MPI library, threads should be pinned to
 ```
 This is the default.
+
+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips, each with 8 cores.
+So even a single socket node is a quad-chip module. Dual socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain, giving four NUMA domains
+per socket, and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended, with four ranks per socket and 8 threads per rank.
+
+The following configuration is recommended for the AMD EPYC platform:
+
+``` bash
+../configure --enable-precision=double \
+             --enable-simd=AVX2        \
+             --enable-comms=mpi3       \
+             CXX=mpicxx
+```
+
+If GMP and MPFR are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+             --with-gmp=<path>  \
+             --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Using MPICH and g++ v4.9.2, the best performance is obtained by setting an explicit GOMP_CPU_AFFINITY list for each MPI rank.
+This can be done by invoking MPI on a wrapper script omp_bind.sh that handles the binding.
+
+It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank, using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=`expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
 ### Build setup for BlueGene/Q
 
 To be written...
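A quick way to check that the binding recipe in the README hunk above really lands each rank's eight threads on the even cores of its own NUMA domain is to run a small MPI + OpenMP probe under the same omp_bind.sh wrapper. The helper below is only a sketch and not part of this patch: the file name check_pinning.cc is made up, and it assumes Linux/glibc (for sched_getcpu) with GCC's libgomp, which is what reads GOMP_CPU_AFFINITY.

```cpp
// check_pinning.cc (hypothetical helper, not part of Grid): print which core
// every OpenMP thread of every MPI rank ends up on, so the GOMP_CPU_AFFINITY
// mapping set by omp_bind.sh can be verified.
#include <mpi.h>
#include <omp.h>
#include <sched.h>    // sched_getcpu(), Linux/glibc
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

#pragma omp parallel
  {
    // Each thread reports the core it is currently executing on.
    std::printf("rank %d thread %d -> core %d\n",
                rank, omp_get_thread_num(), sched_getcpu());
  }

  MPI_Finalize();
  return 0;
}
```

Built with `mpicxx -fopenmp check_pinning.cc -o check_pinning` and run as `OMP_NUM_THREADS=8 mpirun -np 8 ./omp_bind.sh ./check_pinning`, rank r should report cores 16*r, 16*r+2, ..., 16*r+14 if the binding took effect.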
diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 1136dfe0..848f271d 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -60,16 +60,16 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
     uint64_t Nloop=NLOOP;
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;
 
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;
       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;
     uint64_t Nloop=NLOOP;
 
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;
       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;
 
@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;
-      double bytes=2*vol*Nvec*sizeof(Real);
+      double bytes=2.0*vol*Nvec*sizeof(Real);
       double flops=vol*Nvec*1;// mul
      std::cout<
 
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;
     Real nn;
     double start=usecond();
@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;
-     double bytes=vol*Nvec*sizeof(Real);
+     double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
     std::cout<
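Every hunk of Benchmark_memory_bandwidth.cc above makes the same two changes: the site volume becomes int64_t and the byte counts gain a floating-point leading factor (3.0, 2.0, 1.0). The point is to keep the bandwidth arithmetic out of 32-bit integers once the lattice gets large. A minimal sketch with assumed illustrative sizes (none of the numbers below come from the benchmark source):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Illustrative sizes only: a 128^3 x 64 site volume, Nvec = 8 vectors,
  // double precision (8 bytes per word).
  int     vol32 = 128*128*128*64;   // 134,217,728 -- still fits in a 32-bit int
  int64_t vol64 = vol32;
  int     Nvec  = 8;

  // Pre-patch pattern: the product 3*vol*Nvec is evaluated in 32-bit int and
  // exceeds 2^31 (~3.2e9) -- signed overflow, which in practice wraps negative.
  int bytes32 = 3 * vol32 * Nvec;

  // Patched pattern: the leading 3.0 (and the int64_t volume) keep the whole
  // product in double / 64-bit arithmetic, so the reported GB/s stay sane.
  double bytes = 3.0 * vol64 * Nvec * sizeof(double);

  std::cout << bytes32 << "  vs  " << bytes << std::endl;
  return 0;
}
```

The same reasoning applies to the 2.0* and 1.0* forms in the later hunks. Commenting out the GridParallelRNG seeding and the random() fills presumably leaves the allocator's page-touch loop (next hunk) as the only first touch of x, y and z, which keeps the NUMA placement of the benchmark buffers deterministic.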
diff --git a/configure.ac b/configure.ac
 ]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include ]])
@@ -186,9 +187,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 AC_SEARCH_LIBS([crc32], [z],
                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-               [have_zlib=true],
+               [have_zlib=true] [LIBS="${LIBS} -lz"],
                [AC_MSG_ERROR(zlib library was not found in your system.)])
 
+AC_SEARCH_LIBS([move_pages], [numa],
+               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+               [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
+
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
                [have_hdf5=true]
diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h
index 4513ce26..db86c435 100644
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -195,7 +195,8 @@ public:
 #endif
       size_type bytes = __n*sizeof(_Tp);
       uint8_t *cp = (uint8_t *)ptr;
-#pragma omp parallel for
+      // One touch per 4k page, static OMP loop to catch same loop order
+#pragma omp parallel for schedule(static)
       for(size_type n=0;n
 
 #include 
 #include 
 #include 
-//#include 
+#include 
+#ifdef HAVE_NUMAIF_H
+#include <numaif.h>
+#endif
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -214,6 +217,25 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
   void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
   if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
   assert(((uint64_t)ptr&0x3F)==0);
+
+  // Try to force numa domain on the shm segment if we have numaif.h
+#ifdef HAVE_NUMAIF_H
+  int status;
+  int flags=MPOL_MF_MOVE;
+#ifdef KNL
+  int nodes=1; // numa domain == MCDRAM
+  // Find out if in SNC2,SNC4 mode ?
+#else
+  int nodes=r; // numa domain == MPI ID
+#endif
+  unsigned long count=1;
+  for(uint64_t page=0;page
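The configure.ac hunk adds a libnuma probe (defining HAVE_LIBNUMA and appending -lnuma), and the allocator hunk turns the page-touch loop into a schedule(static) OpenMP loop so each 4k page is first touched by the thread that will later iterate over it. The final hunk, apparently in the MPI-3 shared-memory communicator where CartesianCommunicator::Init mmaps the SHM window, is cut off mid-loop, but the variables it prepares (an int nodes, an int status, unsigned long count=1, flags=MPOL_MF_MOVE) match a page-by-page move_pages(2) call that moves each 4 KiB page of the segment onto the rank's own NUMA domain (or onto the MCDRAM node on KNL); HAVE_NUMAIF_H is presumably the usual autoconf macro from an AC_CHECK_HEADERS(numaif.h) test. Below is a standalone sketch of that pattern, with made-up names and a throwaway anonymous mapping instead of the communicator's SHM segment.

```cpp
// Hypothetical sketch of per-page NUMA placement with move_pages(2); link
// with -lnuma.  Not the Grid code itself -- just the pattern the truncated
// hunk appears to follow.
#include <numaif.h>     // move_pages, MPOL_MF_MOVE
#include <sys/mman.h>
#include <cstdint>
#include <cstring>
#include <cstdio>

static void bind_region_to_node(void *ptr, uint64_t size, int node) {
  for (uint64_t page = 0; page < size; page += 4096) {
    void *pages  = (void *)((uint64_t)ptr + page);   // page-aligned address
    int   nodes  = node;     // target NUMA domain for this page
    int   status = 0;        // filled with resulting node or a negative errno
    unsigned long count = 1; // one page per call, as in the hunk above
    long rc = move_pages(0 /* self */, count, &pages, &nodes, &status, MPOL_MF_MOVE);
    if (rc != 0 || status < 0)
      std::printf("move_pages failed at offset %llu (status %d)\n",
                  (unsigned long long)page, status);
  }
}

int main() {
  const uint64_t size = 1 << 20;   // 1 MiB demo region
  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ptr == MAP_FAILED) return 1;
  std::memset(ptr, 0, size);       // fault the pages in before moving them
  bind_region_to_node(ptr, size, 0);  // e.g. NUMA node 0
  munmap(ptr, size);
  return 0;
}
```

Binding the shared-memory window explicitly, rather than relying on whichever rank faults a page first, keeps each rank's slice of the window local to its own NUMA domain, which is the same placement goal as the allocator's first-touch loop.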