From 40e119c61cac619b7fa1874e5fa7ccdc1dcb77cb Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Sat, 8 Jul 2017 22:27:11 -0400
Subject: [PATCH 1/3] NUMA improvements worth preserving from AMD EPYC tests

---
 benchmarks/Benchmark_memory_bandwidth.cc | 48 ++++++++++++------------
 lib/allocator/AlignedAllocator.h         |  3 +-
 lib/communicator/Communicator_mpi3.cc    | 20 +++++++++-
 3 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc
index 1136dfe0..848f271d 100644
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -60,16 +60,16 @@ int main (int argc, char ** argv)
   for(int lat=8;lat<=lmax;lat+=8){

     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

     uint64_t Nloop=NLOOP;

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;

@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;

       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
       std::cout<

@@ ... @@
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     double a=2.0;

     uint64_t Nloop=NLOOP;

@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
       double time = (stop-start)/Nloop*1000;

       double flops=vol*Nvec*2;// mul,add
-      double bytes=3*vol*Nvec*sizeof(Real);
+      double bytes=3.0*vol*Nvec*sizeof(Real);
       std::cout<

@@ ... @@
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;

@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;

-      double bytes=2*vol*Nvec*sizeof(Real);
+      double bytes=2.0*vol*Nvec*sizeof(Real);
       double flops=vol*Nvec*1;// mul
       std::cout<

@@ ... @@
     std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
-    int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
     uint64_t Nloop=NLOOP;
     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

-    GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    LatticeVec z(&Grid); random(pRNG,z);
-    LatticeVec x(&Grid); random(pRNG,x);
-    LatticeVec y(&Grid); random(pRNG,y);
+    // GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+    LatticeVec z(&Grid);// random(pRNG,z);
+    LatticeVec x(&Grid);// random(pRNG,x);
+    LatticeVec y(&Grid);// random(pRNG,y);
     RealD a=2.0;
     Real nn;
     double start=usecond();

@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
       double stop=usecond();
       double time = (stop-start)/Nloop*1000;

-      double bytes=vol*Nvec*sizeof(Real);
+      double bytes=1.0*vol*Nvec*sizeof(Real);
       double flops=vol*Nvec*2;// mul,add
       std::cout<

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ ... @@
 #include 
 #include 
 #include 
-//#include 
+#include 
+#include <numaif.h>
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -214,6 +215,23 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
       assert(((uint64_t)ptr&0x3F)==0);
+
+      int status;
+      int flags=MPOL_MF_MOVE;
+#ifdef KNL
+      int nodes=1; // numa domain == MCDRAM
+      // Find out if in SNC2,SNC4 mode ?
+#else
+      int nodes=r; // numa domain == MPI ID
+#endif
+      unsigned long count=1;
+      for(uint64_t page=0;page
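The tail of the Communicator_mpi3.cc hunk above (and the small AlignedAllocator.h hunk listed in the diffstat) did not survive extraction, but the idiom the hunk introduces is self-contained: after mmap()ing the shared window, touch each 4 KiB page and ask the kernel to migrate it to the chosen NUMA domain with move_pages(2) from libnuma. The sketch below is illustrative only, not the patch's code: an anonymous mapping stands in for the shm segment, and the segment size and target node are invented for the example. Build with `g++ numa_sketch.cc -lnuma`.

```c++
// Standalone sketch of the first-touch-then-migrate idiom (assumed names).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <sys/mman.h>
#include <numaif.h>     // move_pages, MPOL_MF_MOVE; link with -lnuma

int main(void)
{
  const uint64_t size = 1ULL << 24;   // 16 MiB stand-in for the shm window
  const int      node = 0;            // target NUMA domain; rank-derived in the patch

  void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert(ptr != MAP_FAILED);

  for (uint64_t page = 0; page < size; page += 4096) {
    void *pages = (void *)((uint64_t)ptr + page);
    memset(pages, 0, 1);              // first touch: a page must exist before it can move
    int  status = 0;
    long ierr = move_pages(0 /*this process*/, 1, &pages, &node, &status, MPOL_MF_MOVE);
    if (ierr && page == 0) perror("move_pages");
  }
  munmap(ptr, size);
  return 0;
}
```

Moving one page per call keeps the sketch close to the patch's `count=1` loop; batching all page addresses into a single move_pages() call would amortise the syscall cost.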
From ... Mon Sep 17 00:00:00 2001
Date: Sun, 9 Jul 2017 00:11:54 +0100
Subject: [PATCH 2/3] Update README.md

---
 README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/README.md b/README.md
index 3572be26..e0a9bb14 100644
--- a/README.md
+++ b/README.md
@@ -324,6 +324,60 @@ one rank per socket. If using the Intel MPI library, threads should be pinned to
 ```
 This is the default.

+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising four distinct chips of 8 cores each, 32 cores in total.
+Even a single-socket node therefore contains a quad-chip module; dual-socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain, giving four NUMA domains
+per socket, and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended, with four ranks per socket and 8 threads per rank.
+
+The following configuration is recommended for the AMD EPYC platform:
+
+``` bash
+../configure --enable-precision=double \
+             --enable-simd=AVX2        \
+             --enable-comms=mpi3       \
+             CXX=mpicxx
+```
+
+If GMP and MPFR are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+             --with-gmp=<path>  \
+             --with-mpfr=<path> \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Using MPICH and g++ v4.9.2, best performance can be obtained by setting explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking MPI through a wrapper script, omp_bind.sh.
+
+It is recommended to run 8 MPI ranks on a single dual-socket AMD EPYC, with 8 threads per rank, using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=` expr $PMI_RANK % 8 `
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0 `
+core1=`expr $basecore + 2 `
+core2=`expr $basecore + 4 `
+core3=`expr $basecore + 6 `
+core4=`expr $basecore + 8 `
+core5=`expr $basecore + 10 `
+core6=`expr $basecore + 12 `
+core7=`expr $basecore + 14 `
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+$@
+```
+
 ### Build setup for BlueGene/Q

 To be written...
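The wrapper keys off `$PMI_RANK`, which the MPICH and Intel MPI launchers export. OpenMPI exports `OMPI_COMM_WORLD_LOCAL_RANK` instead; a hypothetical equivalent is sketched below. The core numbering (8 NUMA domains, 16 hardware threads per domain, even thread IDs only, i.e. SMT-2) mirrors the script above and should be checked against `numactl --hardware` on the target node.

```
#!/bin/bash
# omp_bind_ompi.sh (illustrative): bind each OpenMPI local rank to the even
# hardware threads of its NUMA domain before exec'ing the real binary.
numanode=$(( OMPI_COMM_WORLD_LOCAL_RANK % 8 ))
basecore=$(( numanode * 16 ))
cores=""
for i in 0 2 4 6 8 10 12 14 ; do
  cores="$cores $(( basecore + i ))"
done
export GOMP_CPU_AFFINITY="$cores"
echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
exec "$@"
```

Using exec means the benchmark replaces the wrapper shell, so signals from mpirun reach the application directly.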
From dc6f078246b006ad1b3e61c513273b73f8f0da81 Mon Sep 17 00:00:00 2001
From: azusayamaguchi
Date: Tue, 11 Jul 2017 14:15:08 +0100
Subject: [PATCH 3/3] fixed the header file for mpi3

---
 configure.ac                          |  8 +++++++-
 lib/communicator/Communicator_mpi3.cc | 18 +++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8c43d67a..dc6754da 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,6 +51,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include ]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include ]])

@@ -186,9 +187,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 AC_SEARCH_LIBS([crc32], [z],
                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-               [have_zlib=true],
+               [have_zlib=true] [LIBS="${LIBS} -lz"],
                [AC_MSG_ERROR(zlib library was not found in your system.)])

+AC_SEARCH_LIBS([move_pages], [numa],
+               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+               [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
+
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
                [have_hdf5=true]

diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc
index f5646d44..4192300b 100644
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -38,7 +38,9 @@ Author: Peter Boyle
 #include 
 #include 
 #include 
+#ifdef HAVE_NUMAIF_H
 #include <numaif.h>
+#endif
 #ifndef SHM_HUGETLB
 #define SHM_HUGETLB 04000
 #endif
@@ -216,6 +218,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
       assert(((uint64_t)ptr&0x3F)==0);

+      // Try to force numa domain on the shm segment if we have numaif.h
+#ifdef HAVE_NUMAIF_H
       int status;
       int flags=MPOL_MF_MOVE;
 #ifdef KNL
@@ -225,13 +229,13 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
       int nodes=r; // numa domain == MPI ID
 #endif
       unsigned long count=1;
-      for(uint64_t page=0;page
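Once configure defines HAVE_NUMAIF_H and HAVE_LIBNUMA, the same guard can wrap a placement check: move_pages(2) called with a NULL node array queries where each page currently resides instead of migrating it. The helper below is an illustrative sketch, not Grid code (the function name and MiB sampling stride are invented), but it is a handy way to confirm that the shared segment really landed in the intended domain while still compiling to a stub on systems without libnuma.

```c++
// Sketch: report the NUMA node backing a buffer, sampled once per MiB.
// Mirrors the patch's guard so builds without numaif.h still compile.
#include <cstdint>
#include <cstdio>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>   // move_pages; link with -lnuma
#endif

void report_numa_placement(void *ptr, uint64_t size)
{
#ifdef HAVE_NUMAIF_H
  for (uint64_t off = 0; off < size; off += (1ULL << 20)) {
    void *page  = (void *)((uint64_t)ptr + off);
    int   where = -1;   // receives the node ID, or a negative errno if unmapped
    if (move_pages(0, 1, &page, NULL, &where, 0) == 0)
      printf("offset %llu KiB -> numa node %d\n",
             (unsigned long long)(off >> 10), where);
  }
#else
  (void)ptr; (void)size;  // libnuma absent: nothing to report
#endif
}
```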