From e1042aef77f70f2c4ab44d9066fd4c6d093f6e50 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Fri, 28 Oct 2016 17:20:04 +0100 Subject: [PATCH 01/17] First version of the doube prec for testing purposes It does not compile single and double version at the same time --- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 81 ++++++++++++++++++- lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 4 +- lib/simd/Grid_avx512.h | 6 -- 3 files changed, 80 insertions(+), 11 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 74862400..2fc9b035 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -53,12 +53,13 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo, } #if defined(AVX512) - +#include + +#if defined(GRID_DEFAULT_PRECISION_SINGLE) /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// - -#include + #include static Vector signs; @@ -78,6 +79,7 @@ static Vector signs; #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define FX(A) WILSONASM_ ##A +#define COMPLEX_TYPE vComplexF #undef KERNEL_DAG template<> void @@ -113,8 +115,79 @@ template<> void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +#undef COMPLEX_TYPE + +#endif //Single precision + +#if defined(GRID_DEFAULT_PRECISION_DOUBLE) +//temporary separating the two sections +//for debug in isolation +//can be unified + + /////////////////////////////////////////////////////////// + // If we are AVX512 specialise the double precision routine + /////////////////////////////////////////////////////////// + +#include + +static Vector signs; + + int setupSigns(void ){ + Vector bother(2); + signs = bother; + vrsign(signs[0]); + visign(signs[1]); + return 1; + } + static int signInit = setupSigns(); + +#define label(A) ilabel(A) +#define ilabel(A) ".globl\n" #A ":\n" + +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) +#define FX(A) WILSONASM_ ##A +#define COMPLEX_TYPE vComplexD + +#undef KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#define KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include -#endif +#undef VMOVIDUP +#undef VMOVRDUP +#undef MAYBEPERM +#undef MULT_2SPIN +#undef FX +#define FX(A) DWFASM_ ## A +#define MAYBEPERM(A,B) +#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) +#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) + +#undef KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#define KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef COMPLEX_TYPE +#endif //Double precision + +#endif //AVX512 #define INSTANTIATE_ASM(A)\ template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 12579d8c..72e13754 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -5,7 +5,9 @@ const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; - vComplexF *isigns = &signs[0]; + //COMPLEX_TYPE is vComplexF of vComplexD depending + //on the chosen precision + COMPLEX_TYPE *isigns = &signs[0]; MASK_REGS; int nmax=U._grid->oSites(); diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 62789462..136c940e 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -371,14 +371,8 @@ namespace Optimization { // Some Template specialization // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases -<<<<<<< HEAD -#define GNU_CLANG_COMPILER -#ifdef GNU_CLANG_COMPILER -======= - #ifndef __INTEL_COMPILER #warning "Slow reduction due to incomplete reduce intrinsics" ->>>>>>> develop //Complex float Reduce template<> inline Grid::ComplexF Reduce::operator()(__m512 in){ From 9b066e94d00c7190d9bf0531caa8964a4e79cf1f Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sun, 30 Oct 2016 12:04:06 +0000 Subject: [PATCH 02/17] Compilation with both single and double precision --- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 55 ++++++++++------------ 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 2fc9b035..8bd55d61 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -10,6 +10,7 @@ Author: Peter Boyle Author: paboyle +Author: Guido Cossu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -55,23 +56,24 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo, #if defined(AVX512) #include -#if defined(GRID_DEFAULT_PRECISION_SINGLE) /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// #include -static Vector signs; - - int setupSigns(void ){ - Vector bother(2); +static Vector signsF; + + template + int setupSigns(Vector& signs ){ + Vector bother(2); signs = bother; vrsign(signs[0]); visign(signs[1]); return 1; } - static int signInit = setupSigns(); + + static int signInitF = setupSigns(signsF); #define label(A) ilabel(A) #define ilabel(A) ".globl\n" #A ":\n" @@ -80,6 +82,7 @@ static Vector signs; #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define FX(A) WILSONASM_ ##A #define COMPLEX_TYPE vComplexF +#define signs signsF #undef KERNEL_DAG template<> void @@ -116,34 +119,22 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebe int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include #undef COMPLEX_TYPE +#undef signs +#undef VMOVRDUP +#undef MAYBEPERM +#undef MULT_2SPIN +#undef FX -#endif //Single precision - -#if defined(GRID_DEFAULT_PRECISION_DOUBLE) -//temporary separating the two sections -//for debug in isolation -//can be unified - - /////////////////////////////////////////////////////////// - // If we are AVX512 specialise the double precision routine - /////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////// +// If we are AVX512 specialise the double precision routine +/////////////////////////////////////////////////////////// #include -static Vector signs; +static Vector signsD; +#define signs signsD +static int signInitD = setupSigns(signsD); - int setupSigns(void ){ - Vector bother(2); - signs = bother; - vrsign(signs[0]); - visign(signs[1]); - return 1; - } - static int signInit = setupSigns(); - -#define label(A) ilabel(A) -#define ilabel(A) ".globl\n" #A ":\n" - #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define FX(A) WILSONASM_ ##A @@ -185,7 +176,11 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebe #include #undef COMPLEX_TYPE -#endif //Double precision +#undef signs +#undef VMOVRDUP +#undef MAYBEPERM +#undef MULT_2SPIN +#undef FX #endif //AVX512 From e8c3174ae28746c8f24377ac7a4fbddc7d9a3831 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Sun, 30 Oct 2016 12:23:11 +0000 Subject: [PATCH 03/17] Small change in the defines --- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 8bd55d61..83124d1a 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -159,8 +159,8 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder #undef FX #define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) -#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) -#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) +#define VMOVIDUP(A,B,C) VBCASTIDUPd(A,B,C) +#define VMOVRDUP(A,B,C) VBCASTRDUPd(A,B,C) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG From ae8561892ef8e6763454a552d9aba45fa33074c6 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Wed, 2 Nov 2016 10:21:06 +0000 Subject: [PATCH 04/17] Eliminating useless defines --- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 83124d1a..d7a9edd3 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -103,8 +103,8 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder #undef FX #define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) -#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) -#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) +//#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) +//#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG @@ -159,8 +159,8 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder #undef FX #define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) -#define VMOVIDUP(A,B,C) VBCASTIDUPd(A,B,C) -#define VMOVRDUP(A,B,C) VBCASTRDUPd(A,B,C) +//#define VMOVIDUP(A,B,C) VBCASTIDUPd(A,B,C) +//#define VMOVRDUP(A,B,C) VBCASTRDUPd(A,B,C) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG From afc8d3e524b6ceff65c415ad05b41384fc420e2a Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Mon, 7 Nov 2016 11:13:43 +0000 Subject: [PATCH 05/17] Adding support for parallel recursive compilation for the tests --- Makefile.am | 2 +- tests/Makefile.am | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index 049220e8..2606b88c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -4,7 +4,7 @@ SUBDIRS = lib benchmarks tests .PHONY: tests tests: - make -C tests tests + $(MAKE) -C tests tests AM_CXXFLAGS += -I$(top_builddir)/include ACLOCAL_AMFLAGS = -I m4 diff --git a/tests/Makefile.am b/tests/Makefile.am index c98bc2d0..2e7c1f0a 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -9,4 +9,4 @@ endif include Make.inc subtests: - for d in $(SUBDIRS); do make -C $${d} tests; done + for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done From 0cff8754d1b8e2c46b972dcc02b0e80069c2bcd7 Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Tue, 8 Nov 2016 11:35:41 +0000 Subject: [PATCH 06/17] Usecs --- lib/FFT.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/FFT.h b/lib/FFT.h index fda43eb8..52d08cbe 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -122,7 +122,8 @@ namespace Grid { double Flops(void) {return flops;} double MFlops(void) {return flops/usec;} - + double USec(void) {return (double)usec;} + FFT ( GridCartesian * grid ) : vgrid(grid), Nd(grid->_ndimension), From 3dc2e05d6edb5acd0582293017d4a7fe5753297d Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Tue, 8 Nov 2016 11:36:18 +0000 Subject: [PATCH 07/17] Time as well since MKL returns zero for Mflops --- tests/core/Test_fftf.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/core/Test_fftf.cc b/tests/core/Test_fftf.cc index 54b1ddb3..4eb4398d 100644 --- a/tests/core/Test_fftf.cc +++ b/tests/core/Test_fftf.cc @@ -93,10 +93,10 @@ int main (int argc, char ** argv) C=C-Ctilde; std::cout << "diff scalar "< Date: Tue, 8 Nov 2016 11:49:13 +0000 Subject: [PATCH 08/17] Fix a routine for single node processor coor from rank --- lib/communicator/Communicator_none.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 0f43f1f5..5e91b305 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) { return 0;} -void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor){ assert(0);} +void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor){ coor = _processor_coor ;} void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) { source =0; From 343f3e829f0d8cd4cb5913a150db3039bec7437d Mon Sep 17 00:00:00 2001 From: azusayamaguchi Date: Tue, 8 Nov 2016 13:42:12 +0000 Subject: [PATCH 09/17] Fixes prerelease to make all tests --- Makefile.am | 2 +- tests/Makefile.am | 2 +- tests/core/Test_fft_gfix.cc | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile.am b/Makefile.am index 049220e8..2606b88c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -4,7 +4,7 @@ SUBDIRS = lib benchmarks tests .PHONY: tests tests: - make -C tests tests + $(MAKE) -C tests tests AM_CXXFLAGS += -I$(top_builddir)/include ACLOCAL_AMFLAGS = -I m4 diff --git a/tests/Makefile.am b/tests/Makefile.am index c98bc2d0..2e7c1f0a 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -9,4 +9,4 @@ endif include Make.inc subtests: - for d in $(SUBDIRS); do make -C $${d} tests; done + for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 6a2868b0..d5779726 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -120,7 +120,7 @@ class FourierAcceleratedGaugeFixer : public Gimpl { LatticeComplex Fp(grid); LatticeComplex psq(grid); psq=zero; LatticeComplex pmu(grid); - LatticeComplex one(grid); one = ComplexD(1.0,0.0); + LatticeComplex one(grid); one = Complex(1.0,0.0); GaugeMat g(grid); GaugeMat dmuAmu_p(grid); @@ -261,25 +261,25 @@ int main (int argc, char ** argv) std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge *" <::avgPlaquette(Umu); + RealD plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); + FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); - plaq=WilsonLoops::avgPlaquette(Umu); + plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Final plaquette "< Date: Tue, 8 Nov 2016 14:07:59 +0000 Subject: [PATCH 10/17] README typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 49f08237..f4a376f1 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ If you want to build all the tests at once just use `make tests`. ### Possible communication interfaces -The following options can be use with the `--enable-simd=` option to target different communication interfaces: +The following options can be use with the `--enable-comms=` option to target different communication interfaces: | `` | Description | | -------------- | ------------------------------------------------------------- | From f6e1a5b348a107e58e8bb9a94a2323047630a473 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 8 Nov 2016 14:08:33 +0000 Subject: [PATCH 11/17] building tests depends on building the library at the top level --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 2606b88c..18b3ddc3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,7 +3,7 @@ SUBDIRS = lib benchmarks tests .PHONY: tests -tests: +tests: all $(MAKE) -C tests tests AM_CXXFLAGS += -I$(top_builddir)/include From a26adfb0908ebe4c72c977b78e0b803742c06a2e Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 8 Nov 2016 14:11:18 +0000 Subject: [PATCH 12/17] README: only markdown --- README | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 README diff --git a/README b/README deleted file mode 100644 index 17e92fa0..00000000 --- a/README +++ /dev/null @@ -1,44 +0,0 @@ -This library provides data parallel C++ container classes with internal memory layout -that is transformed to map efficiently to SIMD architectures. CSHIFT facilities -are provided, similar to HPF and cmfortran, and user control is given over the mapping of -array indices to both MPI tasks and SIMD processing elements. - -* Identically shaped arrays then be processed with perfect data parallelisation. -* Such identically shapped arrays are called conformable arrays. - -The transformation is based on the observation that Cartesian array processing involves -identical processing to be performed on different regions of the Cartesian array. - -The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes. - -Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but -optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification -for most programmers. - -The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported. - -These are presented as - - vRealF, vRealD, vComplexF, vComplexD - -internal vector data types. These may be useful in themselves for other programmers. -The corresponding scalar types are named - - RealF, RealD, ComplexF, ComplexD - -MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library. - - You can give `configure' initial values for configuration parameters -by setting variables in the command line or in the environment. Here -is are examples: - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4 - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1 - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2 - - ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none - - From 3d2a22a14d9a74bf3dd7788a41d69adb3759b722 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 8 Nov 2016 15:31:47 +0000 Subject: [PATCH 13/17] include fix for MKL --- lib/FFT.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/FFT.h b/lib/FFT.h index fda43eb8..c1cd1980 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -29,9 +29,13 @@ Author: Peter Boyle #ifndef _GRID_FFT_H_ #define _GRID_FFT_H_ -#ifdef HAVE_FFTW +#ifdef HAVE_FFTW +#ifdef USE_MKL +#include +#else #include #endif +#endif namespace Grid { From cd0be8cb24f29afe2a09010d7d87524af13aa543 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 8 Nov 2016 15:32:05 +0000 Subject: [PATCH 14/17] Test_fft_gfix.c precision fix --- tests/core/Test_fft_gfix.cc | 68 ++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 6a2868b0..c6b77a13 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -42,7 +42,7 @@ class FourierAcceleratedGaugeFixer : public Gimpl { static void GaugeLinkToLieAlgebraField(const std::vector &U,std::vector &A) { for(int mu=0;mu::avgPlaquette(Umu); - RealD org_link_trace=WilsonLoops::linkTrace(Umu); - RealD old_trace = org_link_trace; - RealD trG; + Real org_plaq =WilsonLoops::avgPlaquette(Umu); + Real org_link_trace=WilsonLoops::linkTrace(Umu); + Real old_trace = org_link_trace; + Real trG; std::vector U(Nd,grid); GaugeMat dmuAmu(grid); @@ -71,13 +71,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl { // Monitor progress and convergence test // infrequently to minimise cost overhead if ( i %20 == 0 ) { - RealD plaq =WilsonLoops::avgPlaquette(Umu); - RealD link_trace=WilsonLoops::linkTrace(Umu); + Real plaq =WilsonLoops::avgPlaquette(Umu); + Real link_trace=WilsonLoops::linkTrace(Umu); std::cout << GridLogMessage << " Iteration "< &U,RealD & alpha, GaugeMat & dmuAmu) { + static Real SteepestDescentStep(std::vector &U,Real & alpha, GaugeMat & dmuAmu) { GridBase *grid = U[0]._grid; std::vector A(Nd,grid); @@ -101,26 +101,26 @@ class FourierAcceleratedGaugeFixer : public Gimpl { ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); - RealD vol = grid->gSites(); - RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + Real vol = grid->gSites(); + Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; SU::GaugeTransform(U,g); return trG; } - static RealD FourierAccelSteepestDescentStep(std::vector &U,RealD & alpha, GaugeMat & dmuAmu) { + static Real FourierAccelSteepestDescentStep(std::vector &U,Real & alpha, GaugeMat & dmuAmu) { GridBase *grid = U[0]._grid; - RealD vol = grid->gSites(); + Real vol = grid->gSites(); FFT theFFT((GridCartesian *)grid); LatticeComplex Fp(grid); LatticeComplex psq(grid); psq=zero; LatticeComplex pmu(grid); - LatticeComplex one(grid); one = ComplexD(1.0,0.0); + LatticeComplex one(grid); one = Complex(1.0,0.0); GaugeMat g(grid); GaugeMat dmuAmu_p(grid); @@ -139,13 +139,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl { std::vector coor(grid->_ndimension,0); for(int mu=0;mu::taExp(ciadmam,g); - RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; SU::GaugeTransform(U,g); return trG; } - static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) { + static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) { GridBase *grid = g._grid; - ComplexD cialpha(0.0,-alpha); + Complex cialpha(0.0,-alpha); GaugeMat ciadmam(grid); DmuAmu(A,dmuAmu); ciadmam = dmuAmu*cialpha; @@ -193,11 +193,11 @@ class FourierAcceleratedGaugeFixer : public Gimpl { ComplexField pha(grid); GaugeMat Apha(grid); - ComplexD ci(0.0,1.0); + Complex ci(0.0,1.0); for(int mu=0;mu latt_size = GridDefaultLatt(); - std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); + std::vector simd_layout( { vComplex::Nsimd(),1,1,1}); std::vector mpi_layout = GridDefaultMpi(); int vol = 1; @@ -261,25 +261,25 @@ int main (int argc, char ** argv) std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge *" <::avgPlaquette(Umu); + Real plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); + Real alpha=0.1; + FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); - plaq=WilsonLoops::avgPlaquette(Umu); + plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Final plaquette "< Date: Tue, 8 Nov 2016 17:00:38 +0000 Subject: [PATCH 15/17] README is now a symlink to README.md --- README | 1 + 1 file changed, 1 insertion(+) create mode 120000 README diff --git a/README b/README new file mode 120000 index 00000000..42061c01 --- /dev/null +++ b/README @@ -0,0 +1 @@ +README.md \ No newline at end of file From 9576f0903dd0c9f3c44fbedf6fe5d14bfc22d798 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 8 Nov 2016 19:07:47 +0000 Subject: [PATCH 16/17] namespace fix --- lib/PerfCount.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/PerfCount.h b/lib/PerfCount.h index 9ac58883..5ab07c02 100644 --- a/lib/PerfCount.h +++ b/lib/PerfCount.h @@ -43,6 +43,9 @@ Author: paboyle #else #include #endif +#ifdef __x86_64__ +#include +#endif namespace Grid { @@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){ return tmp; } #elif defined __x86_64__ -#include inline uint64_t cyclecount(void){ return __rdtsc(); // unsigned int dummy; From 33dc1f51b51a08a0b79b65276173d8c0b03fc582 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 9 Nov 2016 04:11:03 -0800 Subject: [PATCH 17/17] Final sign off commits from Cori-1 --- benchmarks/Benchmark_comms.cc | 86 ++++++++++++++++++++ benchmarks/Benchmark_dwf.cc | 37 +++++---- benchmarks/Benchmark_dwf_sweep.cc | 4 +- lib/Init.cc | 2 +- lib/Log.cc | 2 +- lib/algorithms/iterative/ConjugateGradient.h | 2 +- lib/qcd/hmc/HmcRunner.h | 2 +- tests/hmc/Test_hmc_WilsonFermionGauge.cc | 2 +- 8 files changed, 115 insertions(+), 22 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index de73bc81..969a2a42 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -193,6 +193,7 @@ int main (int argc, char ** argv) } } + Nloop=100; std::cout< latt_size ({lat*mpi_layout[0], + lat*mpi_layout[1], + lat*mpi_layout[2], + lat*mpi_layout[3]}); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + std::vector xbuf(8); + std::vector rbuf(8); + Grid.ShmBufferFreeAll(); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + int ncomm; + int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + double start=usecond(); + for(int i=0;i requests; + + ncomm=0; + for(int mu=0;mu<4;mu++){ + + if (mpi_layout[mu]>1 ) { + + ncomm++; + int comm_proc=1; + int xmit_to_rank; + int recv_from_rank; + + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + // Grid.StencilSendToRecvFromComplete(requests); + // requests.resize(0); + + comm_proc = mpi_layout[mu]-1; + + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + Grid.StencilSendToRecvFromComplete(requests); + requests.resize(0); + + } + } + Grid.Barrier(); + + } + double stop=usecond(); + + double dbytes = bytes; + double xbytes = Nloop*dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + double time = stop-start; // microseconds + + std::cout< latt4 = GridDefaultLatt(); - const int Ls=16; + const int Ls=8; GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -138,7 +138,7 @@ int main (int argc, char ** argv) int ncall =100; if (1) { - + FGrid->Barrier(); Dw.ZeroCounters(); double t0=usecond(); for(int i=0;iBarrier(); double volume=Ls; for(int mu=0;muBarrier(); double t0=usecond(); sDw.ZeroCounters(); for(int i=0;iBarrier(); double volume=Ls; for(int mu=0;muBarrier(); sDw.ZeroCounters(); sDw.stat.init("DhopEO"); double t0=usecond(); @@ -278,6 +282,7 @@ int main (int argc, char ** argv) sDw.DhopEO(ssrc_o, sr_e, DaggerNo); } double t1=usecond(); + FGrid->Barrier(); sDw.stat.print(); double volume=Ls; for(int mu=0;mu1.0e-5) { + if(error>1.0e-4) { setCheckerboard(ssrc,ssrc_o); setCheckerboard(ssrc,ssrc_e); std::cout<< ssrc << std::endl; @@ -337,7 +342,7 @@ int main (int argc, char ** argv) std::cout<Barrier(); double t0=usecond(); for(int i=0;iBarrier(); double volume=Ls; for(int mu=0;mu &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index f340eb38..cf3872c8 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -154,7 +154,7 @@ class ConjugateGradient : public OperatorFunction { << LinalgTimer.Elapsed(); std::cout << std::endl; - if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0); + if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); return; } diff --git a/lib/qcd/hmc/HmcRunner.h b/lib/qcd/hmc/HmcRunner.h index a31ba784..53b127cf 100644 --- a/lib/qcd/hmc/HmcRunner.h +++ b/lib/qcd/hmc/HmcRunner.h @@ -116,7 +116,7 @@ class NerscHmcRunnerTemplate { NoSmearing SmearingPolicy; typedef MinimumNorm2, RepresentationsPolicy > IntegratorType; // change here to change the algorithm - IntegratorParameters MDpar(20, 1.0); + IntegratorParameters MDpar(40, 1.0); IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy); // Checkpoint strategy diff --git a/tests/hmc/Test_hmc_WilsonFermionGauge.cc b/tests/hmc/Test_hmc_WilsonFermionGauge.cc index 9dcf6343..351d1e68 100644 --- a/tests/hmc/Test_hmc_WilsonFermionGauge.cc +++ b/tests/hmc/Test_hmc_WilsonFermionGauge.cc @@ -68,7 +68,7 @@ class HmcRunner : public NerscHmcRunner { TwoFlavourPseudoFermionAction Nf2(FermOp, CG, CG); // Set smearing (true/false), default: false - Nf2.is_smeared = true; + Nf2.is_smeared = false; // Collect actions ActionLevel Level1(1);