From 6103c29ee397c5231bb211d081c2f4623958c9a7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 May 2015 07:51:41 +0100 Subject: [PATCH] Threading support rework. Placed parallel pragmas as macros; implemented deterministic thread reduction in style of BFM. --- TODO | 93 ++++++++------- lib/Grid.h | 1 - lib/Grid_comparison.h | 1 + lib/Grid_simd.h | 12 +- lib/Makefile.am | 4 +- lib/cshift/Grid_cshift_common.h | 12 +- lib/lattice/Grid_lattice_arith.h | 26 ++-- lib/lattice/Grid_lattice_base.h | 18 +-- lib/lattice/Grid_lattice_comparison.h | 6 +- lib/lattice/Grid_lattice_local.h | 6 +- lib/lattice/Grid_lattice_overload.h | 14 +-- lib/lattice/Grid_lattice_peekpoke.h | 12 +- lib/lattice/Grid_lattice_reality.h | 8 +- lib/lattice/Grid_lattice_reduction.h | 111 ++++++++++-------- lib/lattice/Grid_lattice_trace.h | 4 +- lib/lattice/Grid_lattice_transfer.h | 4 +- lib/lattice/Grid_lattice_transpose.h | 4 +- .../Grid_lattice_where.h} | 6 +- lib/math/Grid_math_reality.h | 54 +++++++-- lib/qcd/Grid_qcd_2spinor.h | 6 - lib/qcd/Grid_qcd_wilson_dop.cc | 2 +- lib/simd/Grid_vComplexD.h | 24 +++- lib/simd/Grid_vComplexF.h | 21 +++- lib/simd/Grid_vRealD.h | 4 + lib/simd/Grid_vRealF.h | 4 + tests/Grid_gamma.cc | 3 +- 26 files changed, 276 insertions(+), 184 deletions(-) rename lib/{Grid_where.h => lattice/Grid_lattice_where.h} (95%) diff --git a/TODO b/TODO index 1f5be83f..6a1624f7 100644 --- a/TODO +++ b/TODO @@ -1,33 +1,35 @@ +================================================================ +*** Hacks and bug fixes to clean up and Audits +================================================================ +* Base class to share common code between vRealF, VComplexF etc... + +* Performance check on Guido's reimplementation strategy + +* Bug in SeedFixedIntegers gives same output on each site. -- Think I fixed but NOT checked for sure + +* FIXME audit +* const audit + +* Replace vset with a call to merge.; +* care in Gmerge,Gextract over vset . +* extract / merge extra implementation removal + +* Strong test for norm2, conj and all primitive types. -- tests/Grid_simd.cc is almost there + +================================================================ +*** New Functionality +================================================================ + +* Implement where to take template scheme. + * - BinaryWriter, TextWriter etc... - use protocol buffers? replace xmlReader/Writer ec.. - Binary use htonll, htonl -*** Hacks and bug fixes to clean up -* Had to hack assignment to 1.0 in the tests/Grid_gamma test -* norm2l is a hack. figure out syntax error and make this norm2 c.f. tests/Grid_gamma.cc - -* Reduce implemention is poor ; need threaded reductions; OMP isn't able to do it for generic objects. - -* Bug in SeedFixedIntegers gives same output on each site. -* Bug in RNG with complex numbers ; only filling real values; need helper function -- DONE -* Conformable test in Cshift routines. - -*** Functionality - -* Implement where to take template scheme. - -* Command line args for geometry, simd, etc. layout. Is it necessary to have - user pass these? Is this a QCD specific? - -* Strong test for norm2, conj and all primitive types. -- Grid_simd test is almost there - * Expression template engine: - - Audit - - Introduce base clase for Grid Tensors. - - Introduce norm2 unary op. - - Introduce conversion automatic from expression to Lattice - + -- Audit + -- Norm2(expression) problem: introduce norm2 unary op, or Introduce conversion automatic from expression to Lattice * CovariantShift support -----Use a class to store gauge field? (parallel transport?) @@ -48,12 +50,10 @@ - I have collated into single location at least. - Need to use _mm_*insert/extract routines. - * Flavour matrices? * Pauli, SU subgroup, etc.. * su3 exponentiation & log etc.. [Jamie's code?] * TaProj - * FFTnD ? * Parallel MPI2 IO @@ -62,20 +62,23 @@ * rb4d support for 5th dimension in Mobius. * Check for missing functionality - partially audited against QDP++ layout - // Base class to share common code between vRealF, VComplexF etc... - // // Unary functions // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar only arg // exp, log, sqrt, fabs - // // transposeColor, transposeSpin, // adjColor, adjSpin, - // // copyMask. - // // localMaxAbs - // // Fourier transform equivalent. +Actions +* Fermion + - Wilson + - Clover + - DomainWall + - Mobius + - z-Mobius +* Gauge + - Wilson, symanzik, iwasaki Algorithms * LinearOperator @@ -83,23 +86,21 @@ Algorithms * Polynomial * Eigen * Pcg +* Adef2 +* DeflCG * fPcg * MCR * HDCG +* HMC, Heatbath * etc.. -AUDITS: - -* FIXME audit -* const audit -* Replace vset with a call to merge.; -* care in Gmerge,Gextract over vset . -* extract / merge extra implementation removal - - +====================================================================================================== +FUNCTIONALITY: it pleases me to keep track of things I have done (keeps sane) ====================================================================================================== -FUNCTIONALITY: +* Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE + user pass these? Is this a QCD specific? + * Stencil -- DONE * Test infrastructure -- DONE * Fourspin, two spin project --- DONE @@ -115,6 +116,7 @@ FUNCTIONALITY: * How to do U[mu] ... lorentz part of type structure or not. more like chroma if not. -- DONE * Twospin/Fourspin/Gamma/Proj/Recon ----- DONE +* norm2l is a hack. figure out syntax error and make this norm2 c.f. tests/Grid_gamma.cc -- DONE * subdirs lib, tests ?? ----- DONE - lib/math @@ -149,3 +151,10 @@ FUNCTIONALITY: * Controling std::cout ------- DONE +* Had to hack assignment to 1.0 in the tests/Grid_gamma test -- DONE +* Reduce implemention is poor ; need threaded reductions; OMP isn't able to do it for generic objects. -- DONE +* Bug in RNG with complex numbers ; only filling real values; need helper function -- DONE +* Conformable test in Cshift routines. -- none needed ; there is only one +* Conformable testing in expression templates -- DONE (recursive) + + diff --git a/lib/Grid.h b/lib/Grid.h index da04438a..4512f443 100644 --- a/lib/Grid.h +++ b/lib/Grid.h @@ -53,7 +53,6 @@ #include #include #include -#include #include #include #include diff --git a/lib/Grid_comparison.h b/lib/Grid_comparison.h index 45311831..3f9c206d 100644 --- a/lib/Grid_comparison.h +++ b/lib/Grid_comparison.h @@ -142,5 +142,6 @@ namespace Grid { } #include +#include #endif diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index 6695eb0d..39265896 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -68,10 +68,14 @@ namespace Grid { inline ComplexF adj(const ComplexF& r ){ return(conj(r)); } //conj already supported for complex - inline ComplexF timesI(const ComplexF r) { return(r*ComplexF(0.0,1.0));} - inline ComplexD timesI(const ComplexD r) { return(r*ComplexD(0.0,1.0));} - inline ComplexF timesMinusI(const ComplexF r){ return(r*ComplexF(0.0,-1.0));} - inline ComplexD timesMinusI(const ComplexD r){ return(r*ComplexD(0.0,-1.0));} + inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} + inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} + inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} + inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} + inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);} + inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);} + inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);} + inline void timesMinusI(ComplexD &ret,const ComplexD &r){ ret = timesMinusI(r);} inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);} inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);} diff --git a/lib/Makefile.am b/lib/Makefile.am index 46795a33..f76b0b64 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -35,8 +35,7 @@ include_HEADERS =\ Grid_lattice.h\ Grid_math.h\ Grid_simd.h\ - Grid_stencil.h\ - Grid_where.h + Grid_stencil.h nobase_include_HEADERS=\ cartesian/Grid_cartesian_base.h\ @@ -56,6 +55,7 @@ nobase_include_HEADERS=\ lattice/Grid_lattice_trace.h\ lattice/Grid_lattice_transfer.h\ lattice/Grid_lattice_transpose.h\ + lattice/Grid_lattice_where.h\ math/Grid_math_arith.h\ math/Grid_math_arith_add.h\ math/Grid_math_arith_mac.h\ diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h index 8320d2d0..9864df73 100644 --- a/lib/cshift/Grid_cshift_common.h +++ b/lib/cshift/Grid_cshift_common.h @@ -28,7 +28,7 @@ Gather_plane_simple (const Lattice &rhs,std::vector_ostride[dimension]; // base offset for start of plane int bo = 0; // offset in buffer -#pragma omp parallel for collapse(2) +PARALLEL_NESTED_LOOP(2) for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ int o = n*rhs._grid->_slice_stride[dimension]; @@ -57,7 +57,7 @@ Gather_plane_extract(const Lattice &rhs,std::vector_ostride[dimension]; // base offset for start of plane int bo = 0; // offset in buffer -#pragma omp parallel for collapse(2) +PARALLEL_NESTED_LOOP(2) for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ @@ -106,7 +106,7 @@ template void Scatter_plane_simple (Lattice &rhs,std::vector_ostride[dimension]; // base offset for start of plane int bo = 0; // offset in buffer -#pragma omp parallel for collapse(2) +PARALLEL_NESTED_LOOP(2) for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ int o=n*rhs._grid->_slice_stride[dimension]; @@ -131,7 +131,7 @@ template void Scatter_plane_simple (Lattice &rhs,std::vector_ostride[dimension]; // base offset for start of plane -#pragma omp parallel for collapse(2) +PARALLEL_NESTED_LOOP(2) for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ @@ -160,7 +160,7 @@ template void Copy_plane(Lattice& lhs,Lattice &rhs, int int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane -#pragma omp parallel for collapse(2) +PARALLEL_NESTED_LOOP(2) for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ @@ -185,7 +185,7 @@ template void Copy_plane_permute(Lattice& lhs,Lattice &r int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane -#pragma omp parallel for collapse(2) +PARALLEL_NESTED_LOOP(2) for(int n=0;n_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ int o =n*rhs._grid->_slice_stride[dimension]; diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h index 93801be5..c1d58b86 100644 --- a/lib/lattice/Grid_lattice_arith.h +++ b/lib/lattice/Grid_lattice_arith.h @@ -10,7 +10,7 @@ namespace Grid { template void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); @@ -21,7 +21,7 @@ namespace Grid { template void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); @@ -32,7 +32,7 @@ namespace Grid { template void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); @@ -42,7 +42,7 @@ namespace Grid { template void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); @@ -56,7 +56,7 @@ namespace Grid { template void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ conformable(lhs,ret); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; mult(&tmp,&lhs._odata[ss],&rhs); @@ -67,7 +67,7 @@ namespace Grid { template void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ conformable(lhs,ret); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; mac(&tmp,&lhs._odata[ss],&rhs); @@ -78,7 +78,7 @@ namespace Grid { template void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ conformable(lhs,ret); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; sub(&tmp,&lhs._odata[ss],&rhs); @@ -88,7 +88,7 @@ namespace Grid { template void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ conformable(lhs,ret); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; add(&tmp,&lhs._odata[ss],&rhs); @@ -102,7 +102,7 @@ namespace Grid { template void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ conformable(ret,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; mult(&tmp,&lhs,&rhs._odata[ss]); @@ -113,7 +113,7 @@ namespace Grid { template void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ conformable(ret,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; mac(&tmp,&lhs,&rhs._odata[ss]); @@ -124,7 +124,7 @@ namespace Grid { template void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ conformable(ret,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; sub(&tmp,&lhs,&rhs._odata[ss]); @@ -134,7 +134,7 @@ namespace Grid { template void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ conformable(ret,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ obj1 tmp; add(&tmp,&lhs,&rhs._odata[ss]); @@ -145,7 +145,7 @@ namespace Grid { template inline void axpy(Lattice &ret,sobj a,const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ vobj tmp = a*lhs._odata[ss]; vstream(ret._odata[ss],tmp+rhs._odata[ss]); diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h index 248bf1f7..cd376b32 100644 --- a/lib/lattice/Grid_lattice_base.h +++ b/lib/lattice/Grid_lattice_base.h @@ -64,7 +64,7 @@ public: //////////////////////////////////////////////////////////////////////////////// template inline Lattice & operator=(const LatticeUnaryExpression &expr) { -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ vobj tmp= eval(ss,expr); vstream(_odata[ss] ,tmp); @@ -73,7 +73,7 @@ public: } template inline Lattice & operator=(const LatticeBinaryExpression &expr) { -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ vobj tmp= eval(ss,expr); vstream(_odata[ss] ,tmp); @@ -82,7 +82,7 @@ public: } template inline Lattice & operator=(const LatticeTrinaryExpression &expr) { -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ vobj tmp= eval(ss,expr); vstream(_odata[ss] ,tmp); @@ -95,7 +95,7 @@ public: GridFromExpression(_grid,expr); assert(_grid!=nullptr); _odata.resize(_grid->oSites()); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ _odata[ss] = eval(ss,expr); } @@ -105,7 +105,7 @@ public: GridFromExpression(_grid,expr); assert(_grid!=nullptr); _odata.resize(_grid->oSites()); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ _odata[ss] = eval(ss,expr); } @@ -115,7 +115,7 @@ public: GridFromExpression(_grid,expr); assert(_grid!=nullptr); _odata.resize(_grid->oSites()); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ _odata[ss] = eval(ss,expr); } @@ -133,7 +133,7 @@ public: } template inline Lattice & operator = (const sobj & r){ -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ss<_grid->oSites();ss++){ this->_odata[ss]=r; } @@ -142,7 +142,7 @@ public: template inline Lattice & operator = (const Lattice & r){ conformable(*this,r); std::cout<<"Lattice operator ="<oSites();ss++){ this->_odata[ss]=r._odata[ss]; } @@ -167,7 +167,7 @@ public: inline friend Lattice operator / (const Lattice &lhs,const Lattice &rhs){ conformable(lhs,rhs); Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss]; } diff --git a/lib/lattice/Grid_lattice_comparison.h b/lib/lattice/Grid_lattice_comparison.h index 82c98572..431cf911 100644 --- a/lib/lattice/Grid_lattice_comparison.h +++ b/lib/lattice/Grid_lattice_comparison.h @@ -15,7 +15,7 @@ namespace Grid { inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); } @@ -25,7 +25,7 @@ namespace Grid { inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) { Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=op(lhs._odata[ss],rhs); } @@ -35,7 +35,7 @@ namespace Grid { inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=op(lhs._odata[ss],rhs); } diff --git a/lib/lattice/Grid_lattice_local.h b/lib/lattice/Grid_lattice_local.h index 181f5be2..7b8bdf9b 100644 --- a/lib/lattice/Grid_lattice_local.h +++ b/lib/lattice/Grid_lattice_local.h @@ -16,7 +16,7 @@ namespace Grid { inline auto localNorm2 (const Lattice &rhs)-> Lattice { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); } @@ -29,7 +29,7 @@ namespace Grid { -> Lattice { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); } @@ -42,7 +42,7 @@ namespace Grid { inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); } diff --git a/lib/lattice/Grid_lattice_overload.h b/lib/lattice/Grid_lattice_overload.h index c4dd3373..24bdd70a 100644 --- a/lib/lattice/Grid_lattice_overload.h +++ b/lib/lattice/Grid_lattice_overload.h @@ -10,7 +10,7 @@ namespace Grid { inline Lattice operator -(const Lattice &r) { Lattice ret(r._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ vstream(ret._odata[ss], -r._odata[ss]); } @@ -47,7 +47,7 @@ namespace Grid { inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss]; vstream(ret._odata[ss],tmp); @@ -59,7 +59,7 @@ namespace Grid { inline auto operator + (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss]; vstream(ret._odata[ss],tmp); @@ -71,7 +71,7 @@ namespace Grid { inline auto operator - (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss]; vstream(ret._odata[ss],tmp); @@ -83,7 +83,7 @@ namespace Grid { inline auto operator * (const Lattice &lhs,const right &rhs) -> Lattice { Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs; vstream(ret._odata[ss],tmp); @@ -95,7 +95,7 @@ namespace Grid { inline auto operator + (const Lattice &lhs,const right &rhs) -> Lattice { Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs; vstream(ret._odata[ss],tmp); @@ -107,7 +107,7 @@ namespace Grid { inline auto operator - (const Lattice &lhs,const right &rhs) -> Lattice { Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs; vstream(ret._odata[ss],tmp); diff --git a/lib/lattice/Grid_lattice_peekpoke.h b/lib/lattice/Grid_lattice_peekpoke.h index ae87c5d8..ad7dfc50 100644 --- a/lib/lattice/Grid_lattice_peekpoke.h +++ b/lib/lattice/Grid_lattice_peekpoke.h @@ -15,7 +15,7 @@ namespace Grid { -> Lattice(lhs._odata[0]))> { Lattice(lhs._odata[0]))> ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = peekIndex(lhs._odata[ss]); } @@ -26,7 +26,7 @@ namespace Grid { -> Lattice(lhs._odata[0],i))> { Lattice(lhs._odata[0],i))> ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = peekIndex(lhs._odata[ss],i); } @@ -37,7 +37,7 @@ namespace Grid { -> Lattice(lhs._odata[0],i,j))> { Lattice(lhs._odata[0],i,j))> ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = peekIndex(lhs._odata[ss],i,j); } @@ -50,7 +50,7 @@ namespace Grid { template inline void pokeIndex(Lattice &lhs,const Lattice(lhs._odata[0]))> & rhs) { -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ pokeIndex(lhs._odata[ss],rhs._odata[ss]); } @@ -58,7 +58,7 @@ namespace Grid { template inline void pokeIndex(Lattice &lhs,const Lattice(lhs._odata[0],0))> & rhs,int i) { -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ pokeIndex(lhs._odata[ss],rhs._odata[ss],i); } @@ -66,7 +66,7 @@ namespace Grid { template inline void pokeIndex(Lattice &lhs,const Lattice(lhs._odata[0],0,0))> & rhs,int i,int j) { -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ pokeIndex(lhs._odata[ss],rhs._odata[ss],i,j); } diff --git a/lib/lattice/Grid_lattice_reality.h b/lib/lattice/Grid_lattice_reality.h index a3a7a27f..cf86d700 100644 --- a/lib/lattice/Grid_lattice_reality.h +++ b/lib/lattice/Grid_lattice_reality.h @@ -11,7 +11,7 @@ namespace Grid { template inline Lattice adj(const Lattice &lhs){ Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = adj(lhs._odata[ss]); } @@ -20,7 +20,7 @@ namespace Grid { template inline Lattice conj(const Lattice &lhs){ Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = conj(lhs._odata[ss]); } @@ -30,7 +30,7 @@ namespace Grid { template inline auto real(const Lattice &z) -> Lattice { Lattice ret(z._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = real(z._odata[ss]); } @@ -40,7 +40,7 @@ namespace Grid { template inline auto imag(const Lattice &z) -> Lattice { Lattice ret(z._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = imag(z._odata[ss]); } diff --git a/lib/lattice/Grid_lattice_reduction.h b/lib/lattice/Grid_lattice_reduction.h index 679effb0..a6d3e0dc 100644 --- a/lib/lattice/Grid_lattice_reduction.h +++ b/lib/lattice/Grid_lattice_reduction.h @@ -7,69 +7,85 @@ namespace Grid { #endif //////////////////////////////////////////////////////////////////////////////////////////////////// - // Reduction operations + // Deterministic Reduction operations //////////////////////////////////////////////////////////////////////////////////////////////////// - template - inline RealD norm2(const Lattice &arg){ - - typedef typename vobj::scalar_type scalar; - typedef typename vobj::vector_type vector; - decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm; - scalar nrm; - //FIXME make this loop parallelisable - vnrm=zero; - for(int ss=0;ssoSites(); ss++){ - vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]); - } - vector vvnrm =TensorRemove(vnrm) ; - nrm = Reduce(vvnrm); - arg._grid->GlobalSum(nrm); - return real(nrm); - } + template inline RealD norm2(const Lattice &arg){ + ComplexD nrm = innerProduct(arg,arg); + return real(nrm); + } template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) - // inline auto innerProduct(const Lattice &left,const Lattice &right) - //->decltype(innerProduct(left._odata[0],right._odata[0])) { - typedef typename vobj::scalar_type scalar; - decltype(innerProduct(left._odata[0],right._odata[0])) vnrm; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + vector_type vnrm; + scalar_type nrm; - scalar nrm; - //FIXME make this loop parallelisable - vnrm=zero; - for(int ss=0;ssoSites(); ss++){ - vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]); + GridBase *grid = left._grid; + + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; } - nrm = Reduce(vnrm); + +PARALLEL_FOR_LOOP + for(int thr=0;thrSumArraySize();thr++){ + + int nwork, mywork, myoff; + GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); + + decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation + for(int ss=myoff;ssSumArraySize();i++){ + vvnrm = vvnrm+sumarray[i]; + } + nrm = Reduce(vvnrm);// sum across simd right._grid->GlobalSum(nrm); return nrm; } template - inline typename vobj::scalar_object sum(const Lattice &arg){ + inline typename vobj::scalar_object sum(const Lattice &arg){ GridBase *grid=arg._grid; int Nsimd = grid->Nsimd(); - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - - vobj vsum; - sobj ssum; - - vsum=zero; - ssum=zero; - //FIXME make this loop parallelisable - for(int ss=0;ssoSites(); ss++){ - vsum = vsum + arg._odata[ss]; + std::vector > sumarray(grid->SumArraySize()); + for(int i=0;iSumArraySize();i++){ + sumarray[i]=zero; } - + +PARALLEL_FOR_LOOP + for(int thr=0;thrSumArraySize();thr++){ + int nwork, mywork, myoff; + GridThread::GetWork(grid->oSites(),thr,mywork,myoff); + + vobj vvsum=zero; + for(int ss=myoff;ssSumArraySize();i++){ + vsum = vsum+sumarray[i]; + } + + typedef typename vobj::scalar_object sobj; + sobj ssum=zero; + std::vector buf(Nsimd); extract(vsum,buf); for(int i=0;iGlobalSum(ssum); return ssum; @@ -77,13 +93,15 @@ namespace Grid { - template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) { typedef typename vobj::scalar_object sobj; - GridBase *grid = Data._grid; assert(grid!=NULL); + + // FIXME + std::cout<<"WARNING ! SliceSum is unthreaded "<SumArraySize()<<" threads "<_ndimension; const int Nsimd = grid->Nsimd(); @@ -94,10 +112,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - sobj szero; szero=zero; - std::vector > lvSum(rd); // will locally sum vectors first - std::vector lsSum(ld,szero); // sum across these down to scalars + std::vector lsSum(ld,zero); // sum across these down to scalars std::vector extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file @@ -108,6 +124,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< std::vector coor(Nd); // sum over reduced dimension planes, breaking out orthog dir + for(int ss=0;ssoSites();ss++){ GridBase::CoorFromIndex(coor,ss,grid->_rdimensions); int r = coor[orthogdim]; diff --git a/lib/lattice/Grid_lattice_trace.h b/lib/lattice/Grid_lattice_trace.h index 8c7607d8..72f984f9 100644 --- a/lib/lattice/Grid_lattice_trace.h +++ b/lib/lattice/Grid_lattice_trace.h @@ -15,7 +15,7 @@ namespace Grid { -> Lattice { Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = trace(lhs._odata[ss]); } @@ -30,7 +30,7 @@ namespace Grid { -> Lattice(lhs._odata[0]))> { Lattice(lhs._odata[0]))> ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = traceIndex(lhs._odata[ss]); } diff --git a/lib/lattice/Grid_lattice_transfer.h b/lib/lattice/Grid_lattice_transfer.h index 53973741..27ae27e1 100644 --- a/lib/lattice/Grid_lattice_transfer.h +++ b/lib/lattice/Grid_lattice_transfer.h @@ -23,7 +23,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine) template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ half.checkerboard = cb; int ssh=0; -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; @@ -41,7 +41,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine) template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.checkerboard; int ssh=0; -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ std::vector coor; int cbos; diff --git a/lib/lattice/Grid_lattice_transpose.h b/lib/lattice/Grid_lattice_transpose.h index 7166d2b5..c8e7433f 100644 --- a/lib/lattice/Grid_lattice_transpose.h +++ b/lib/lattice/Grid_lattice_transpose.h @@ -13,7 +13,7 @@ namespace Grid { template inline Lattice transpose(const Lattice &lhs){ Lattice ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = transpose(lhs._odata[ss]); } @@ -28,7 +28,7 @@ namespace Grid { -> Lattice(lhs._odata[0]))> { Lattice(lhs._odata[0]))> ret(lhs._grid); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ ret._odata[ss] = transposeIndex(lhs._odata[ss]); } diff --git a/lib/Grid_where.h b/lib/lattice/Grid_lattice_where.h similarity index 95% rename from lib/Grid_where.h rename to lib/lattice/Grid_lattice_where.h index 588c0596..84d7471c 100644 --- a/lib/Grid_where.h +++ b/lib/lattice/Grid_lattice_where.h @@ -1,5 +1,5 @@ -#ifndef GRID_WHERE_H -#define GRID_WHERE_H +#ifndef GRID_LATTICE_WHERE_H +#define GRID_LATTICE_WHERE_H namespace Grid { // Must implement the predicate gating the // Must be able to reduce the predicate down to a single vInteger per site. @@ -27,7 +27,7 @@ inline void where(Lattice &ret,const Lattice &predicate,Lattice truevals (Nsimd); std::vector falsevals(Nsimd); -#pragma omp parallel for +PARALLEL_FOR_LOOP for(int ss=0;ssoSites(); ss++){ extract(iftrue._odata[ss] ,truevals); diff --git a/lib/math/Grid_math_reality.h b/lib/math/Grid_math_reality.h index 1bf34f2a..18a2f89b 100644 --- a/lib/math/Grid_math_reality.h +++ b/lib/math/Grid_math_reality.h @@ -6,23 +6,20 @@ namespace Grid { /////////////////////////////////////////// CONJ /////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef GRID_WARN_SUBOPTIMAL -#warning "Optimisation alert switch over to two argument form to avoid copy back in perf critical timesI " -#endif /////////////////////////////////////////////// // multiply by I; make recursive. /////////////////////////////////////////////// template inline iScalar timesI(const iScalar&r) { iScalar ret; - ret._internal = timesI(r._internal); + timesI(ret._internal,r._internal); return ret; } template inline iVector timesI(const iVector&r) { iVector ret; for(int i=0;i inline iMatrix timesI(const iMatrix ret; for(int i=0;i inline void timesI(iScalar &ret,const iScalar&r) +{ + timesI(ret._internal,r._internal); +} +template inline void timesI(iVector &ret,const iVector&r) +{ + for(int i=0;i inline void timesI(iMatrix &ret,const iMatrix&r) +{ + for(int i=0;i inline iScalar timesMinusI(const iScalar&r) { iScalar ret; - ret._internal = timesMinusI(r._internal); + timesMinusI(ret._internal,r._internal); return ret; } template inline iVector timesMinusI(const iVector&r) { iVector ret; for(int i=0;i inline iMatrix timesMinusI(const iMatrix ret; for(int i=0;i inline void timesMinusI(iScalar &ret,const iScalar&r) +{ + timesMinusI(ret._internal,r._internal); +} +template inline void timesMinusI(iVector &ret,const iVector&r) +{ + for(int i=0;i inline void timesMinusI(iMatrix &ret,const iMatrix&r) +{ + for(int i=0;ioSites();sss++){ int ss = sss; diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h index ef5da859..f0bc2da4 100644 --- a/lib/simd/Grid_vComplexD.h +++ b/lib/simd/Grid_vComplexD.h @@ -13,6 +13,9 @@ namespace Grid { vzero(*this); return (*this); } + vComplexD( Zero & z){ + vzero(*this); + } vComplexD()=default; vComplexD(ComplexD a){ vsplat(*this,a); @@ -286,8 +289,8 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ return ret; } - friend inline vComplexD timesMinusI(const vComplexD &in){ - vComplexD ret; vzero(ret); + friend inline void timesMinusI(vComplexD &ret,const vComplexD &in){ + vzero(ret); vComplexD tmp; #if defined (AVX1)|| defined (AVX2) tmp.v =_mm256_addsub_pd(ret.v,in.v); // r,-i @@ -304,11 +307,10 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ #ifdef QPX assert(0); #endif - return ret; } - friend inline vComplexD timesI(const vComplexD &in){ - vComplexD ret; vzero(ret); + friend inline void timesI(vComplexD &ret, const vComplexD &in){ + vzero(ret); vComplexD tmp; #if defined (AVX1)|| defined (AVX2) tmp.v =_mm256_shuffle_pd(in.v,in.v,0x5); @@ -325,9 +327,21 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ #ifdef QPX assert(0); #endif + } + + friend inline vComplexD timesMinusI(const vComplexD &in){ + vComplexD ret; + timesMinusI(ret,in); return ret; } + friend inline vComplexD timesI(const vComplexD &in){ + vComplexD ret; + timesI(ret,in); + return ret; + } + + // REDUCE FIXME must be a cleaner implementation friend inline ComplexD Reduce(const vComplexD & in) { diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h index 34510333..202dce43 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Grid_vComplexF.h @@ -28,6 +28,9 @@ namespace Grid { vzero(*this); return (*this); } + vComplexF( Zero & z){ + vzero(*this); + } vComplexF()=default; vComplexF(ComplexF a){ vsplat(*this,a); @@ -363,8 +366,7 @@ namespace Grid { #endif return ret; } - friend inline vComplexF timesMinusI(const vComplexF &in){ - vComplexF ret; + friend inline void timesMinusI( vComplexF &ret,const vComplexF &in){ vzero(ret); #if defined (AVX1)|| defined (AVX2) cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i @@ -381,10 +383,9 @@ namespace Grid { #ifdef QPX assert(0); #endif - return ret; } - friend inline vComplexF timesI(const vComplexF &in){ - vComplexF ret; vzero(ret); + friend inline void timesI(vComplexF &ret,const vComplexF &in){ + vzero(ret); #if defined (AVX1)|| defined (AVX2) cvec tmp =_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));//i,r ret.v =_mm256_addsub_ps(ret.v,tmp); //i,-r @@ -400,8 +401,18 @@ namespace Grid { #ifdef QPX assert(0); #endif + } + friend inline vComplexF timesMinusI(const vComplexF &in){ + vComplexF ret; + timesMinusI(ret,in); return ret; } + friend inline vComplexF timesI(const vComplexF &in){ + vComplexF ret; + timesI(ret,in); + return ret; + } + // Unary negation friend inline vComplexF operator -(const vComplexF &r) { diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h index 4d5e2f57..aef143f8 100644 --- a/lib/simd/Grid_vRealD.h +++ b/lib/simd/Grid_vRealD.h @@ -17,6 +17,10 @@ namespace Grid { vRealD(Zero &zero){ zeroit(*this); } + vRealD & operator = ( Zero & z){ + vzero(*this); + return (*this); + } friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Grid_vRealF.h index 79e1b91f..559a9eda 100644 --- a/lib/simd/Grid_vRealF.h +++ b/lib/simd/Grid_vRealF.h @@ -18,6 +18,10 @@ namespace Grid { vRealF(Zero &zero){ zeroit(*this); } + vRealF & operator = ( Zero & z){ + vzero(*this); + return (*this); + } //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// diff --git a/tests/Grid_gamma.cc b/tests/Grid_gamma.cc index ba298b19..f5582955 100644 --- a/tests/Grid_gamma.cc +++ b/tests/Grid_gamma.cc @@ -50,9 +50,8 @@ int main (int argc, char ** argv) // std::cout << " Is triv Scalar " < >::value << std::endl; // std::cout << " Is triv Scalar "< >::value << std::endl; - std::complex c(1.0); for(int a=0;a