From cff3bae1557f90077ec67d56c96eb72f817bb273 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Thu, 25 Jan 2018 13:46:31 +0100 Subject: [PATCH 1/8] Adding support for general Nc in the benchmark outputs --- benchmarks/Benchmark_dwf.cc | 15 ++++++---- benchmarks/Benchmark_dwf_sweep.cc | 11 ++++--- benchmarks/Benchmark_gparity.cc | 6 ++-- benchmarks/Benchmark_wilson.cc | 13 ++++++-- benchmarks/Benchmark_wilson_sweep.cc | 45 ++++++++++++++++++++++------ 5 files changed, 66 insertions(+), 24 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 73621bbe..1d9de772 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -48,7 +48,6 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); - std::cout< latt4 = GridDefaultLatt(); int Ls=16; @@ -57,6 +56,10 @@ int main (int argc, char ** argv) std::stringstream ss(argv[i+1]); ss >> Ls; } + GridLogLayout(); + + long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); @@ -187,7 +190,7 @@ int main (int argc, char ** argv) FGrid->Barrier(); double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads,int report ) GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -196,7 +198,7 @@ void benchDw(std::vector & latt4, int Ls, int threads,int report ) if ( ! 
report ) { double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads,int report ) if(!report){ double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads,int report ) #define CHECK_SDW void benchsDw(std::vector & latt4, int Ls, int threads, int report ) { + long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); @@ -321,7 +324,7 @@ void benchsDw(std::vector & latt4, int Ls, int threads, int report ) Counter.Report(); } else { double volume=Ls; for(int mu=0;mu & latt4, int Ls, int threads, int report ) CounterSdw.Report(); } else { double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;muBarrier(); double volume=Ls; for(int mu=0;mu latt_size = GridDefaultLatt(); std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); @@ -61,10 +64,15 @@ int main (int argc, char ** argv) GridRedBlackCartesian RBGrid(&Grid); int threads = GridThread::GetThreads(); - std::cout< seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); @@ -134,9 +142,10 @@ int main (int argc, char ** argv) Dw.Dhop(src,result,0); } double t1=usecond(); - double flops=1344*volume*ncall; + double flops=single_site_flops*volume*ncall; std::cout<()); WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params); - + + // Full operator + bench_wilson(src,result,Dw,volume,DaggerNo); + bench_wilson(src,result,Dw,volume,DaggerYes); + std::cout << "\t"; + // EO bench_wilson(src,result,Dw,volume,DaggerNo); bench_wilson(src,result,Dw,volume,DaggerYes); std::cout << std::endl; @@ -122,9 +132,26 @@ void bench_wilson ( int const dag ) { int ncall = 1000; + long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); double t0 = usecond(); for(int i=0; i Date: Sat, 27 Jan 2018 10:59:55 +0100 Subject: [PATCH 2/8] Correcting a missing semicolon in avx512 --- 
lib/simd/Grid_avx512.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 85d27421..cce77a58 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -556,7 +556,7 @@ namespace Optimization { v3 = _mm256_add_epi32(v1, v2); v1 = _mm256_hadd_epi32(v3, v3); v2 = _mm256_hadd_epi32(v1, v1); - u1 = _mm256_castsi256_si128(v2) // upper half + u1 = _mm256_castsi256_si128(v2); // upper half u2 = _mm256_extracti128_si256(v2, 1); // lower half ret = _mm_add_epi32(u1, u2); return _mm_cvtsi128_si32(ret); From 655a69259a76b844ab06a2e78fbe8a0441dbf774 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sun, 28 Jan 2018 17:02:46 +0100 Subject: [PATCH 3/8] Added support for GCC compilation for Skylake AVX512 --- configure.ac | 3 +++ lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 4 ++-- lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc | 2 +- lib/qcd/action/fermion/MobiusEOFAFermionvec.cc | 2 +- lib/simd/Intel512avx.h | 2 +- 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/configure.ac b/configure.ac index 468d9d5f..3a6a2960 100644 --- a/configure.ac +++ b/configure.ac @@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in AVX512) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; + SKL) + AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon]) + SIMD_FLAGS='-march=skylake-avx512';; KNC) AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner]) SIMD_FLAGS='';; diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 653e6ab3..2b2eace7 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -469,7 +469,7 @@ void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionFie } a0 = a0+incr; a1 = a1+incr; - a2 = a2+sizeof(Simd::scalar_type); + a2 = a2+sizeof(typename Simd::scalar_type); }} { int lexa = s1+LLs*site; @@ -701,7 
+701,7 @@ void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionFi } a0 = a0+incr; a1 = a1+incr; - a2 = a2+sizeof(Simd::scalar_type); + a2 = a2+sizeof(typename Simd::scalar_type); }} { int lexa = s1+LLs*site; diff --git a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc index 81ce448c..c95172a5 100644 --- a/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc +++ b/lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc @@ -475,7 +475,7 @@ namespace QCD { } a0 = a0 + incr; a1 = a1 + incr; - a2 = a2 + sizeof(Simd::scalar_type); + a2 = a2 + sizeof(typename Simd::scalar_type); } } diff --git a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc index c4eaf0f3..290ba158 100644 --- a/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc +++ b/lib/qcd/action/fermion/MobiusEOFAFermionvec.cc @@ -853,7 +853,7 @@ namespace QCD { a0 = a0 + incr; a1 = a1 + incr; - a2 = a2 + sizeof(Simd::scalar_type); + a2 = a2 + sizeof(typename Simd::scalar_type); } } diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index 7b5964ad..def37b9b 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -79,7 +79,7 @@ Author: paboyle #define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\ "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n" -#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\ +#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\ "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n" #define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\ From fb24e3a7d24abb2bcdef4c85711ce0d25319a153 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Mon, 29 Jan 2018 11:11:45 +0100 Subject: [PATCH 4/8] Adding utilities for perf profiling --- benchmarks/Benchmark_wilson.cc | 24 +++++++++++- lib/util/Profiling.h | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 95 
insertions(+), 1 deletion(-) create mode 100644 lib/util/Profiling.h diff --git a/benchmarks/Benchmark_wilson.cc b/benchmarks/Benchmark_wilson.cc index d1499a76..754051f0 100644 --- a/benchmarks/Benchmark_wilson.cc +++ b/benchmarks/Benchmark_wilson.cc @@ -4,7 +4,7 @@ Source file: ./benchmarks/Benchmark_wilson.cc - Copyright (C) 2015 + Copyright (C) 2018 Author: Peter Boyle Author: paboyle @@ -32,6 +32,9 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; + +#include "Grid/util/Profiling.h" + template struct scal { d internal; @@ -45,6 +48,7 @@ struct scal { }; bool overlapComms = false; +bool perfProfiling = false; int main (int argc, char ** argv) { @@ -53,6 +57,9 @@ int main (int argc, char ** argv) if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; } + if( GridCmdOptionExists(argv,argv+argc,"--perf") ){ + perfProfiling = true; + } long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); @@ -144,6 +151,21 @@ int main (int argc, char ** argv) double t1=usecond(); double flops=single_site_flops*volume*ncall; + if (perfProfiling){ + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ + +#ifndef GRID_PERF_PROFILING_H +#define GRID_PERF_PROFILING_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct System +{ + static void profile(const std::string& name,std::function body) { + std::string filename = name.find(".data") == std::string::npos ? (name + ".data") : name; + + // Launch profiler + pid_t pid; + std::stringstream s; + s << getpid(); + pid = fork(); + if (pid == 0) { + auto fd=open("/dev/null",O_RDWR); + dup2(fd,1); + dup2(fd,2); + exit(execl("/usr/bin/perf","perf","record","-o",filename.c_str(),"-p",s.str().c_str(),nullptr)); + } + + // Run body + body(); + + // Kill profiler + kill(pid,SIGINT); + waitpid(pid,nullptr,0); + } + + static void profile(std::function body) { + profile("perf.data",body); + } +}; + +#endif // GRID_PERF_PROFILING_H \ No newline at end of file From cd44e851f1021db5f895a4caf409c885b35d7bd9 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 30 Jan 2018 06:04:30 +0100 Subject: [PATCH 5/8] Fixing compilation error in FundtoHirep --- extras/Hadrons/Modules/MGauge/FundtoHirep.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extras/Hadrons/Modules/MGauge/FundtoHirep.cc b/extras/Hadrons/Modules/MGauge/FundtoHirep.cc index f15a3b7c..31c5a34d 100644 --- a/extras/Hadrons/Modules/MGauge/FundtoHirep.cc +++ b/extras/Hadrons/Modules/MGauge/FundtoHirep.cc @@ -57,7 +57,7 @@ std::vector TFundtoHirep::getOutput(void) template void TFundtoHirep::setup(void) { - env().template registerLattice(getName()); + envCreateLat(typename Rep::LatticeField, getName()); } // execution /////////////////////////////////////////////////////////////////// @@ -70,6 +70,6 @@ void TFundtoHirep::execute(void) Rep TargetRepresentation(U._grid); 
TargetRepresentation.update_representation(U); - typename Rep::LatticeField &URep = *env().template createLattice(getName()); + auto &URep = envGet(typename Rep::LatticeField, getName()); URep = TargetRepresentation.U; } From 53bffb83d453080fe5dd16fb5601d16a94997d87 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 30 Jan 2018 12:42:36 +0100 Subject: [PATCH 6/8] Updating README with new SKL target --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 13dd6996..5a92cdec 100644 --- a/README.md +++ b/README.md @@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used: | `` | Description | | ----------- | -------------------------------------- | | `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) | +| `SKL` | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) | | `BGQ` | Blue Gene/Q | #### Notes: -- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced. +- We currently support AVX512 for the Intel compiler and GCC (SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced. - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform. - BG/Q performances are currently rather poor. This is being investigated for future versions. - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`. 
From f0fcdf75b5b7c6be03224a50b1157170e441b3b5 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Tue, 30 Jan 2018 12:44:20 +0100 Subject: [PATCH 7/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a92cdec..86506f52 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ Alternatively, some CPU codenames can be directly used: | `BGQ` | Blue Gene/Q | #### Notes: -- We currently support AVX512 for the Intel compiler and GCC (SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced. +- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced. - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform. - BG/Q performances are currently rather poor. This is being investigated for future versions. - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`. From 896f3a8002b3116380e2293cf3ecca350c34ce5d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 1 Feb 2018 18:51:51 +0000 Subject: [PATCH 8/8] Fix to MPI for Hokusai system --- lib/communicator/SharedMemoryMPI.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc index d7bd7c65..2a62b7ac 100644 --- a/lib/communicator/SharedMemoryMPI.cc +++ b/lib/communicator/SharedMemoryMPI.cc @@ -182,6 +182,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector &processors, #ifdef GRID_MPI3_SHMMMAP void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { + std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<