diff --git a/Makefile.am b/Makefile.am index 049220e8..18b3ddc3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3,8 +3,8 @@ SUBDIRS = lib benchmarks tests .PHONY: tests -tests: - make -C tests tests +tests: all + $(MAKE) -C tests tests AM_CXXFLAGS += -I$(top_builddir)/include ACLOCAL_AMFLAGS = -I m4 diff --git a/README b/README deleted file mode 100644 index 17e92fa0..00000000 --- a/README +++ /dev/null @@ -1,44 +0,0 @@ -This library provides data parallel C++ container classes with internal memory layout -that is transformed to map efficiently to SIMD architectures. CSHIFT facilities -are provided, similar to HPF and cmfortran, and user control is given over the mapping of -array indices to both MPI tasks and SIMD processing elements. - -* Identically shaped arrays then be processed with perfect data parallelisation. -* Such identically shapped arrays are called conformable arrays. - -The transformation is based on the observation that Cartesian array processing involves -identical processing to be performed on different regions of the Cartesian array. - -The library will (eventually) both geometrically decompose into MPI tasks and across SIMD lanes. - -Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but -optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification -for most programmers. - -The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. -Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported. - -These are presented as - - vRealF, vRealD, vComplexF, vComplexD - -internal vector data types. These may be useful in themselves for other programmers. -The corresponding scalar types are named - - RealF, RealD, ComplexF, ComplexD - -MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is present in the library. - - You can give `configure' initial values for configuration parameters -by setting variables in the command line or in the environment. Here -is are examples: - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4 - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1 - - ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2 - - ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none - - diff --git a/README b/README new file mode 120000 index 00000000..42061c01 --- /dev/null +++ b/README @@ -0,0 +1 @@ +README.md \ No newline at end of file diff --git a/README.md b/README.md index 49f08237..f4a376f1 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ If you want to build all the tests at once just use `make tests`. ### Possible communication interfaces -The following options can be use with the `--enable-simd=` option to target different communication interfaces: +The following options can be use with the `--enable-comms=` option to target different communication interfaces: | `` | Description | | -------------- | ------------------------------------------------------------- | diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index de73bc81..969a2a42 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -193,6 +193,7 @@ int main (int argc, char ** argv) } } + Nloop=100; std::cout< latt_size ({lat*mpi_layout[0], + lat*mpi_layout[1], + lat*mpi_layout[2], + lat*mpi_layout[3]}); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + std::vector xbuf(8); + std::vector rbuf(8); + Grid.ShmBufferFreeAll(); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + int ncomm; + int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + double start=usecond(); + for(int i=0;i requests; + + ncomm=0; + for(int mu=0;mu<4;mu++){ + + if (mpi_layout[mu]>1 ) { + + ncomm++; + int comm_proc=1; + int xmit_to_rank; + int recv_from_rank; + + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + // Grid.StencilSendToRecvFromComplete(requests); + // requests.resize(0); + + comm_proc = mpi_layout[mu]-1; + + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.StencilSendToRecvFromBegin(requests, + (void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + Grid.StencilSendToRecvFromComplete(requests); + requests.resize(0); + + } + } + Grid.Barrier(); + + } + double stop=usecond(); + + double dbytes = bytes; + double xbytes = Nloop*dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + double time = stop-start; // microseconds + + std::cout< latt4 = GridDefaultLatt(); - const int Ls=16; + const int Ls=8; GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -138,7 +138,7 @@ int main (int argc, char ** argv) int ncall =100; if (1) { - + FGrid->Barrier(); Dw.ZeroCounters(); double t0=usecond(); for(int i=0;iBarrier(); double volume=Ls; for(int mu=0;muBarrier(); double t0=usecond(); sDw.ZeroCounters(); for(int i=0;iBarrier(); double volume=Ls; for(int mu=0;muBarrier(); sDw.ZeroCounters(); sDw.stat.init("DhopEO"); double t0=usecond(); @@ -278,6 +282,7 @@ int main (int argc, char ** argv) sDw.DhopEO(ssrc_o, sr_e, DaggerNo); } double t1=usecond(); + FGrid->Barrier(); sDw.stat.print(); double volume=Ls; for(int mu=0;mu1.0e-5) { + if(error>1.0e-4) { setCheckerboard(ssrc,ssrc_o); setCheckerboard(ssrc,ssrc_e); std::cout<< ssrc << std::endl; @@ -337,7 +342,7 @@ int main (int argc, char ** argv) std::cout<Barrier(); double t0=usecond(); for(int i=0;iBarrier(); double volume=Ls; for(int mu=0;mu #ifndef _GRID_FFT_H_ #define _GRID_FFT_H_ -#ifdef HAVE_FFTW +#ifdef HAVE_FFTW +#ifdef USE_MKL +#include +#else #include #endif +#endif namespace Grid { @@ -122,7 +126,8 @@ namespace Grid { double Flops(void) {return flops;} double MFlops(void) {return flops/usec;} - + double USec(void) {return (double)usec;} + FFT ( GridCartesian * grid ) : vgrid(grid), Nd(grid->_ndimension), diff --git a/lib/Init.cc b/lib/Init.cc index b5ef0a95..d6d6b9f8 100644 --- a/lib/Init.cc +++ b/lib/Init.cc @@ -369,7 +369,7 @@ void Grid_init(int *argc,char ***argv) void Grid_finalize(void) { -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) MPI_Finalize(); Grid_unquiesce_nodes(); #endif diff --git a/lib/Log.cc b/lib/Log.cc index 733bfc58..7521657b 100644 --- a/lib/Log.cc +++ b/lib/Log.cc @@ -93,7 +93,7 @@ void GridLogConfigure(std::vector &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/PerfCount.h b/lib/PerfCount.h index 9ac58883..5ab07c02 100644 --- a/lib/PerfCount.h +++ b/lib/PerfCount.h @@ -43,6 +43,9 @@ Author: paboyle #else #include #endif +#ifdef __x86_64__ +#include +#endif namespace Grid { @@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){ return tmp; } #elif defined __x86_64__ -#include inline uint64_t cyclecount(void){ return __rdtsc(); // unsigned int dummy; diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index bd90b864..1f9b9b58 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -191,7 +191,7 @@ class ConjugateGradient : public OperatorFunction { << LinalgTimer.Elapsed(); std::cout << std::endl; - if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0); + if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); if (!CGState.do_repro && ReproTest){ CGState.do_repro = true; diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 0f43f1f5..5e91b305 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) { return 0;} -void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor){ assert(0);} +void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor){ coor = _processor_coor ;} void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) { source =0; diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 74862400..d7a9edd3 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -10,6 +10,7 @@ Author: Peter Boyle Author: paboyle +Author: Guido Cossu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -53,24 +54,26 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo, } #if defined(AVX512) - +#include + /////////////////////////////////////////////////////////// // If we are AVX512 specialise the single precision routine /////////////////////////////////////////////////////////// - -#include + #include -static Vector signs; - - int setupSigns(void ){ - Vector bother(2); +static Vector signsF; + + template + int setupSigns(Vector& signs ){ + Vector bother(2); signs = bother; vrsign(signs[0]); visign(signs[1]); return 1; } - static int signInit = setupSigns(); + + static int signInitF = setupSigns(signsF); #define label(A) ilabel(A) #define ilabel(A) ".globl\n" #A ":\n" @@ -78,6 +81,8 @@ static Vector signs; #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define FX(A) WILSONASM_ ##A +#define COMPLEX_TYPE vComplexF +#define signs signsF #undef KERNEL_DAG template<> void @@ -98,8 +103,8 @@ WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder #undef FX #define FX(A) DWFASM_ ## A #define MAYBEPERM(A,B) -#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) -#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) +//#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) +//#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) #undef KERNEL_DAG @@ -113,8 +118,71 @@ template<> void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +#undef COMPLEX_TYPE +#undef signs +#undef VMOVRDUP +#undef MAYBEPERM +#undef MULT_2SPIN +#undef FX + +/////////////////////////////////////////////////////////// +// If we are AVX512 specialise the double precision routine +/////////////////////////////////////////////////////////// + +#include + +static Vector signsD; +#define signs signsD +static int signInitD = setupSigns(signsD); + +#define MAYBEPERM(A,perm) if (perm) { A ; } +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) +#define FX(A) WILSONASM_ ##A +#define COMPLEX_TYPE vComplexD + +#undef KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#define KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include -#endif +#undef VMOVIDUP +#undef VMOVRDUP +#undef MAYBEPERM +#undef MULT_2SPIN +#undef FX +#define FX(A) DWFASM_ ## A +#define MAYBEPERM(A,B) +//#define VMOVIDUP(A,B,C) VBCASTIDUPd(A,B,C) +//#define VMOVRDUP(A,B,C) VBCASTRDUPd(A,B,C) +#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) + +#undef KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#define KERNEL_DAG +template<> void +WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + +#undef COMPLEX_TYPE +#undef signs +#undef VMOVRDUP +#undef MAYBEPERM +#undef MULT_2SPIN +#undef FX + +#endif //AVX512 #define INSTANTIATE_ASM(A)\ template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 12579d8c..72e13754 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -5,7 +5,9 @@ const uint64_t plocal =(uint64_t) & in._odata[0]; // vComplexF isigns[2] = { signs[0], signs[1] }; - vComplexF *isigns = &signs[0]; + //COMPLEX_TYPE is vComplexF of vComplexD depending + //on the chosen precision + COMPLEX_TYPE *isigns = &signs[0]; MASK_REGS; int nmax=U._grid->oSites(); diff --git a/lib/qcd/hmc/HmcRunner.h b/lib/qcd/hmc/HmcRunner.h index a31ba784..53b127cf 100644 --- a/lib/qcd/hmc/HmcRunner.h +++ b/lib/qcd/hmc/HmcRunner.h @@ -116,7 +116,7 @@ class NerscHmcRunnerTemplate { NoSmearing SmearingPolicy; typedef MinimumNorm2, RepresentationsPolicy > IntegratorType; // change here to change the algorithm - IntegratorParameters MDpar(20, 1.0); + IntegratorParameters MDpar(40, 1.0); IntegratorType MDynamics(UGrid, MDpar, TheAction, SmearingPolicy); // Checkpoint strategy diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index 4f2779a8..93e4c9ad 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -382,7 +382,6 @@ namespace Optimization { // Some Template specialization // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases - #ifndef __INTEL_COMPILER #warning "Slow reduction due to incomplete reduce intrinsics" //Complex float Reduce diff --git a/tests/Makefile.am b/tests/Makefile.am index c98bc2d0..2e7c1f0a 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -9,4 +9,4 @@ endif include Make.inc subtests: - for d in $(SUBDIRS); do make -C $${d} tests; done + for d in $(SUBDIRS); do $(MAKE) -C $${d} tests; done diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 6a2868b0..c6b77a13 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -42,7 +42,7 @@ class FourierAcceleratedGaugeFixer : public Gimpl { static void GaugeLinkToLieAlgebraField(const std::vector &U,std::vector &A) { for(int mu=0;mu::avgPlaquette(Umu); - RealD org_link_trace=WilsonLoops::linkTrace(Umu); - RealD old_trace = org_link_trace; - RealD trG; + Real org_plaq =WilsonLoops::avgPlaquette(Umu); + Real org_link_trace=WilsonLoops::linkTrace(Umu); + Real old_trace = org_link_trace; + Real trG; std::vector U(Nd,grid); GaugeMat dmuAmu(grid); @@ -71,13 +71,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl { // Monitor progress and convergence test // infrequently to minimise cost overhead if ( i %20 == 0 ) { - RealD plaq =WilsonLoops::avgPlaquette(Umu); - RealD link_trace=WilsonLoops::linkTrace(Umu); + Real plaq =WilsonLoops::avgPlaquette(Umu); + Real link_trace=WilsonLoops::linkTrace(Umu); std::cout << GridLogMessage << " Iteration "< &U,RealD & alpha, GaugeMat & dmuAmu) { + static Real SteepestDescentStep(std::vector &U,Real & alpha, GaugeMat & dmuAmu) { GridBase *grid = U[0]._grid; std::vector A(Nd,grid); @@ -101,26 +101,26 @@ class FourierAcceleratedGaugeFixer : public Gimpl { ExpiAlphaDmuAmu(A,g,alpha,dmuAmu); - RealD vol = grid->gSites(); - RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + Real vol = grid->gSites(); + Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; SU::GaugeTransform(U,g); return trG; } - static RealD FourierAccelSteepestDescentStep(std::vector &U,RealD & alpha, GaugeMat & dmuAmu) { + static Real FourierAccelSteepestDescentStep(std::vector &U,Real & alpha, GaugeMat & dmuAmu) { GridBase *grid = U[0]._grid; - RealD vol = grid->gSites(); + Real vol = grid->gSites(); FFT theFFT((GridCartesian *)grid); LatticeComplex Fp(grid); LatticeComplex psq(grid); psq=zero; LatticeComplex pmu(grid); - LatticeComplex one(grid); one = ComplexD(1.0,0.0); + LatticeComplex one(grid); one = Complex(1.0,0.0); GaugeMat g(grid); GaugeMat dmuAmu_p(grid); @@ -139,13 +139,13 @@ class FourierAcceleratedGaugeFixer : public Gimpl { std::vector coor(grid->_ndimension,0); for(int mu=0;mu::taExp(ciadmam,g); - RealD trG = TensorRemove(sum(trace(g))).real()/vol/Nc; + Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc; SU::GaugeTransform(U,g); return trG; } - static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,RealD & alpha, GaugeMat &dmuAmu) { + static void ExpiAlphaDmuAmu(const std::vector &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) { GridBase *grid = g._grid; - ComplexD cialpha(0.0,-alpha); + Complex cialpha(0.0,-alpha); GaugeMat ciadmam(grid); DmuAmu(A,dmuAmu); ciadmam = dmuAmu*cialpha; @@ -193,11 +193,11 @@ class FourierAcceleratedGaugeFixer : public Gimpl { ComplexField pha(grid); GaugeMat Apha(grid); - ComplexD ci(0.0,1.0); + Complex ci(0.0,1.0); for(int mu=0;mu latt_size = GridDefaultLatt(); - std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); + std::vector simd_layout( { vComplex::Nsimd(),1,1,1}); std::vector mpi_layout = GridDefaultMpi(); int vol = 1; @@ -261,25 +261,25 @@ int main (int argc, char ** argv) std::cout<< "* Testing we can gauge fix steep descent a RGT of Unit gauge *" <::avgPlaquette(Umu); + Real plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); + Real alpha=0.1; + FourierAcceleratedGaugeFixer::SteepestDescentGaugeFix(Umu,alpha,10000,1.0e-10, 1.0e-10); - plaq=WilsonLoops::avgPlaquette(Umu); + plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Final plaquette "< Nf2(FermOp, CG, CG); // Set smearing (true/false), default: false - Nf2.is_smeared = true; + Nf2.is_smeared = false; // Collect actions ActionLevel Level1(1);