diff --git a/benchmarks/Grid_wilson.cc b/benchmarks/Grid_wilson.cc index d1d21cf0..adfb6fd4 100644 --- a/benchmarks/Grid_wilson.cc +++ b/benchmarks/Grid_wilson.cc @@ -49,6 +49,13 @@ int main (int argc, char ** argv) volume=volume*latt_size[mu]; } + // Only one non-zero (y) + Umu=zero; + for(int nn=0;nn(Umu,U[nn],nn); + } + for(int mu=0;mu(Umu,mu); } @@ -56,7 +63,6 @@ int main (int argc, char ** argv) { // Naive wilson implementation ref = zero; for(int mu=0;mu + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + + LatticeFermion src(&Grid); random(pRNG,src); + std::cout << "src norm" << norm2(src)< U(4,&Grid); + + double volume=1; + for(int mu=0;mu(Umu,mu); + } + + RealD mass=0.5; + WilsonMatrix Dw(Umu,mass); + HermitianOperator HermOp(Dw); + ConjugateGradient CG(1.0e-8,10000); + CG(HermOp,src,result); + + + Grid_finalize(); +} diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index 2fe40feb..77ce7663 100644 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -5,12 +5,14 @@ AM_LDFLAGS = -L$(top_builddir)/lib # # Test code # -bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 - +bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec Grid_wilson_SOURCES = Grid_wilson.cc Grid_wilson_LDADD = -lGrid +Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc +Grid_wilson_cg_unprec_LDADD = -lGrid + Grid_comms_SOURCES = Grid_comms.cc Grid_comms_LDADD = -lGrid diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index af77591c..6d7411d3 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -50,15 +50,20 @@ namespace Grid { typedef std::complex Complex; inline RealF adj(const RealF & r){ return r; } - inline RealF conj(const RealF & r){ return r; } + inline RealF conjugate(const RealF & r){ return r; } inline RealF real(const RealF & r){ return r; } inline RealD adj(const RealD & r){ return r; } - inline RealD conj(const RealD & r){ return r; } + inline RealD conjugate(const RealD & r){ return r; } inline RealD real(const RealD & r){ return r; } - inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } - inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } + inline ComplexD conjugate(const ComplexD& r){ return(conj(r)); } + inline ComplexD adj(const ComplexD& r){ return(conjugate(r)); } + inline ComplexF conjugate(const ComplexF& r ){ return(conj(r)); } + inline ComplexF adj(const ComplexF& r ){ return(conjugate(r)); } + + inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conjugate(l)*r; } + inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; } inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; } @@ -70,15 +75,14 @@ namespace Grid { inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);} inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);} inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);} - inline ComplexD adj(const ComplexD& r){ return(conj(r)); } - // conj already supported for complex + // conjugate already supported for complex inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); } inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); } inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); } inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } - inline ComplexF adj(const ComplexF& r ){ return(conj(r)); } - //conj already supported for complex + + //conjugate already supported for complex inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index bb948b95..bdb6c387 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -7,8 +7,8 @@ namespace Grid { // LinearOperators Take a something and return a something. ///////////////////////////////////////////////////////////////////////////////////////////// // - // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real): - // + // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real): + //SBase // i) F(a x + b y) = aF(x) + b F(y). // ii) = ^\ast // @@ -25,12 +25,13 @@ namespace Grid { ///////////////////////////////////////////////////////////////////////////////////////////// template class HermitianOperatorBase : public LinearOperatorBase { public: - virtual RealD OpAndNorm(const Field &in, Field &out); + virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2); void AdjOp(const Field &in, Field &out) { Op(in,out); }; void Op(const Field &in, Field &out) { - OpAndNorm(in,out); + double n1,n2; + OpAndNorm(in,out,n1,n2); }; }; @@ -80,8 +81,8 @@ namespace Grid { Matrix &_Mat; public: HermitianOperator(Matrix &Mat): _Mat(Mat) {}; - RealD OpAndNorm(const Field &in, Field &out){ - return _Mat.MdagM(in,out); + void OpAndNorm(const Field &in, Field &out,double &n1,double &n2){ + return _Mat.MdagM(in,out,n1,n2); } }; diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h index bcb6ab60..f7f8559e 100644 --- a/lib/algorithms/iterative/ConjugateGradient.h +++ b/lib/algorithms/iterative/ConjugateGradient.h @@ -18,6 +18,7 @@ public: std::cout << Tolerance< &Linop,const Field &src, Field &psi) {assert(0);}; void operator() (HermitianOperatorBase &Linop,const Field &src, Field &psi){ RealD cp,c,a,d,b,ssq,qq,b_pred; @@ -61,21 +62,27 @@ public: Linop.OpAndNorm(p,mmp,d,qq); + // std::cout < struct name\ GridUnopClass(UnarySub,-a); GridUnopClass(UnaryAdj,adj(a)); -GridUnopClass(UnaryConj,conj(a)); +GridUnopClass(UnaryConj,conjugate(a)); GridUnopClass(UnaryTrace,trace(a)); GridUnopClass(UnaryTranspose,transpose(a)); @@ -178,7 +178,7 @@ template inline auto op(const T1 &pred,con GRID_DEF_UNOP(operator -,UnarySub); GRID_DEF_UNOP(adj,UnaryAdj); -GRID_DEF_UNOP(conj,UnaryConj); +GRID_DEF_UNOP(conjugate,UnaryConj); GRID_DEF_UNOP(trace,UnaryTrace); GRID_DEF_UNOP(transpose,UnaryTranspose); diff --git a/lib/lattice/Grid_lattice_arith.h b/lib/lattice/Grid_lattice_arith.h index c0bbb2b6..f1e566a2 100644 --- a/lib/lattice/Grid_lattice_arith.h +++ b/lib/lattice/Grid_lattice_arith.h @@ -144,14 +144,44 @@ PARALLEL_FOR_LOOP } template strong_inline - void axpy(Lattice &ret,sobj a,const Lattice &lhs,const Lattice &rhs){ - conformable(lhs,rhs); + void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ + conformable(x,y); #pragma omp parallel for - for(int ss=0;ssoSites();ss++){ - vobj tmp = a*lhs._odata[ss]; - vstream(ret._odata[ss],tmp+rhs._odata[ss]); + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+y._odata[ss]; + vstream(ret._odata[ss],tmp); } } + template strong_inline + void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ + conformable(x,y); +#pragma omp parallel for + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+b*y._odata[ss]; + vstream(ret._odata[ss],tmp); + } + } + + template strong_inline + RealD axpy_norm(Lattice &ret,sobj a,const Lattice &x,const Lattice &y){ + conformable(x,y); +#pragma omp parallel for + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+y._odata[ss]; + vstream(ret._odata[ss],tmp); + } + return norm2(ret); + } + template strong_inline + RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice &y){ + conformable(x,y); +#pragma omp parallel for + for(int ss=0;ssoSites();ss++){ + vobj tmp = a*x._odata[ss]+b*y._odata[ss]; + vstream(ret._odata[ss],tmp); + } + return norm2(ret); // FIXME implement parallel norm in ss loop + } } #endif diff --git a/lib/lattice/Grid_lattice_base.h b/lib/lattice/Grid_lattice_base.h index ae606ae7..0016bafa 100644 --- a/lib/lattice/Grid_lattice_base.h +++ b/lib/lattice/Grid_lattice_base.h @@ -9,7 +9,7 @@ namespace Grid { // Functionality: // -=,+=,*=,() // add,+,sub,-,mult,mac,* -// adj,conj +// adj,conjugate // real,imag // transpose,transposeIndex // trace,traceIndex diff --git a/lib/lattice/Grid_lattice_reality.h b/lib/lattice/Grid_lattice_reality.h index cf86d700..2f52ed22 100644 --- a/lib/lattice/Grid_lattice_reality.h +++ b/lib/lattice/Grid_lattice_reality.h @@ -18,11 +18,11 @@ PARALLEL_FOR_LOOP return ret; }; - template inline Lattice conj(const Lattice &lhs){ + template inline Lattice conjugate(const Lattice &lhs){ Lattice ret(lhs._grid); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ - ret._odata[ss] = conj(lhs._odata[ss]); + ret._odata[ss] = conjugate(lhs._odata[ss]); } return ret; }; diff --git a/lib/math/Grid_math_reality.h b/lib/math/Grid_math_reality.h index 18a2f89b..40d4c624 100644 --- a/lib/math/Grid_math_reality.h +++ b/lib/math/Grid_math_reality.h @@ -98,26 +98,26 @@ template inline void timesMinusI(iMatrix &ret,const /////////////////////////////////////////////// // Conj function for scalar, vector, matrix /////////////////////////////////////////////// -template inline iScalar conj(const iScalar&r) +template inline iScalar conjugate(const iScalar&r) { iScalar ret; - ret._internal = conj(r._internal); + ret._internal = conjugate(r._internal); return ret; } -template inline iVector conj(const iVector&r) +template inline iVector conjugate(const iVector&r) { iVector ret; for(int i=0;i inline iMatrix conj(const iMatrix&r) +template inline iMatrix conjugate(const iMatrix&r) { iMatrix ret; for(int i=0;i(in,comm_buf,compressor); PARALLEL_FOR_LOOP @@ -140,13 +159,13 @@ PARALLEL_FOR_LOOP int ssu= ss; // Xp - offset = Stencil._offsets [Xp][ss]; - local = Stencil._is_local[Xp][ss]; - perm = Stencil._permute[Xp][ss]; - ptype = Stencil._permute_type[Xp]; + int idx = (Xp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; - if ( local && perm ) - { + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjXp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -154,18 +173,17 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); - //prefetch(Umu._odata[ssu](Yp)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); spReconXp(result,Uchi); // Yp - offset = Stencil._offsets [Yp][ss]; - local = Stencil._is_local[Yp][ss]; - perm = Stencil._permute[Yp][ss]; - ptype = Stencil._permute_type[Yp]; + idx = (Yp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; - if ( local && perm ) - { + if ( local && perm ) { spProjYp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -173,18 +191,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); - // prefetch(Umu._odata[ssu](Zp)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconYp(result,Uchi); // Zp - offset = Stencil._offsets [Zp][ss]; - local = Stencil._is_local[Zp][ss]; - perm = Stencil._permute[Zp][ss]; - ptype = Stencil._permute_type[Zp]; - - if ( local && perm ) - { + idx = (Zp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjZp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -192,18 +208,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); - // prefetch(Umu._odata[ssu](Tp)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconZp(result,Uchi); // Tp - offset = Stencil._offsets [Tp][ss]; - local = Stencil._is_local[Tp][ss]; - perm = Stencil._permute[Tp][ss]; - ptype = Stencil._permute_type[Tp]; - - if ( local && perm ) - { + idx = (Tp+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjTp(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -211,15 +225,15 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); - // prefetch(Umu._odata[ssu](Xm)); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconTp(result,Uchi); // Xm - offset = Stencil._offsets [Xm][ss]; - local = Stencil._is_local[Xm][ss]; - perm = Stencil._permute[Xm][ss]; - ptype = Stencil._permute_type[Xm]; + idx = (Xm+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; if ( local && perm ) { @@ -230,18 +244,18 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconXm(result,Uchi); // Ym - offset = Stencil._offsets [Ym][ss]; - local = Stencil._is_local[Ym][ss]; - perm = Stencil._permute[Ym][ss]; - ptype = Stencil._permute_type[Ym]; + idx = (Ym+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; - if ( local && perm ) - { + if ( local && perm ) { spProjYm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -249,17 +263,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconYm(result,Uchi); // Zm - offset = Stencil._offsets [Zm][ss]; - local = Stencil._is_local[Zm][ss]; - perm = Stencil._permute[Zm][ss]; - ptype = Stencil._permute_type[Zm]; - - if ( local && perm ) - { + idx = (Zm+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjZm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -267,17 +280,16 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconZm(result,Uchi); // Tm - offset = Stencil._offsets [Tm][ss]; - local = Stencil._is_local[Tm][ss]; - perm = Stencil._permute[Tm][ss]; - ptype = Stencil._permute_type[Tm]; - - if ( local && perm ) - { + idx = (Tm+dag*4)%8; + offset = Stencil._offsets [idx][ss]; + local = Stencil._is_local[idx][ss]; + perm = Stencil._permute[idx][ss]; + ptype = Stencil._permute_type[idx]; + if ( local && perm ) { spProjTm(tmp,in._odata[offset]); permute(chi,tmp,ptype); } else if ( local ) { @@ -285,7 +297,7 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); + mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); accumReconTm(result,Uchi); vstream(out._odata[ss],result); @@ -294,10 +306,6 @@ PARALLEL_FOR_LOOP } -void WilsonMatrix::Dw(const LatticeFermion &in, LatticeFermion &out) -{ - return; -} diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h index 1098d330..1db95b33 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.h +++ b/lib/qcd/Grid_qcd_wilson_dop.h @@ -45,10 +45,7 @@ namespace Grid { virtual void MooeeInvDag (const LatticeFermion &in, LatticeFermion &out); // non-hermitian hopping term; half cb or both - void Dhop(const LatticeFermion &in, LatticeFermion &out); - - // m+4r -1/2 Dhop; both cb's - void Dw(const LatticeFermion &in, LatticeFermion &out); + void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag); typedef iScalar > matrix; diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h index 41c8509f..2b03c825 100644 --- a/lib/simd/Grid_vComplexD.h +++ b/lib/simd/Grid_vComplexD.h @@ -32,7 +32,7 @@ namespace Grid { friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);} friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);} - friend inline vComplexD adj(const vComplexD &in){ return conj(in); } + friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); } ////////////////////////////////// // Initialise to 1,0,i @@ -166,11 +166,11 @@ namespace Grid { // all subtypes; may not be a good assumption, but could // add the vector width as a template param for BG/Q for example //////////////////////////////////////////////////////////////////// - /* friend inline void permute(vComplexD &y,vComplexD b,int perm) { Gpermute(y,b,perm); } + /* friend inline void merge(vComplexD &y,std::vector &extracted) { Gmerge(y,extracted); @@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ //////////////////////// // Conjugate //////////////////////// - friend inline vComplexD conj(const vComplexD &in){ + friend inline vComplexD conjugate(const vComplexD &in){ vComplexD ret ; vzero(ret); #if defined (AVX1)|| defined (AVX2) // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ... @@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ // REDUCE FIXME must be a cleaner implementation friend inline ComplexD Reduce(const vComplexD & in) { -#if defined SSE4 - return ComplexD(in.v[0], in.v[1]); // inefficient +#ifdef SSE4 + return ComplexD(in.v[0],in.v[1]); +#endif +#if defined(AVX1) || defined (AVX2) + vComplexD v1; + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + return ComplexD(v1.v[0],v1.v[1]); #endif -#if defined (AVX1) || defined(AVX2) - // return std::complex(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v)); - __attribute__ ((aligned(32))) double c_[4]; - _mm256_store_pd(c_,in.v); - return ComplexD(c_[0]+c_[2],c_[1]+c_[3]); -#endif #ifdef AVX512 - return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v)); + return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v)); #endif #ifdef QPX #endif @@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ }; - inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } + inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; } typedef vComplexD vDComplex; diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h index 8a892d24..713bbdd8 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Grid_vComplexF.h @@ -47,7 +47,7 @@ namespace Grid { friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); } friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); } - friend inline vComplexF adj(const vComplexF &in){ return conj(in); } + friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); } ////////////////////////////////// // Initialise to 1,0,i @@ -228,42 +228,25 @@ namespace Grid { ret.v = {a,b,a,b}; #endif } - friend inline ComplexF Reduce(const vComplexF & in) - { + friend inline void permute(vComplexF &y,vComplexF b,int perm) + { + Gpermute(y,b,perm); + } + friend inline ComplexF Reduce(const vComplexF & in) + { #ifdef SSE4 - union { - cvec v1; // SSE 4 x float vector - float f[4]; // scalar array of 4 floats - } u128; - u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros - return ComplexF(u128.f[0], u128.f[1]); + vComplexF v1; + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + return ComplexF(v1.v[0],v1.v[1]); #endif -#ifdef AVX1 - //it would be better passing 2 arguments to saturate the vector lanes - union { - __m256 v1; - float f[8]; - } u256; - //SWAP lanes - // FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large - __m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1); - __m256 t1 = _mm256_permute_ps(in.v , 0b11011000);//real (0,2,1,3) - __m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2) - t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1) - t1 = _mm256_hadd_ps(t0,t0); - u256.v1 = _mm256_hadd_ps(t1, t1); - return ComplexF(u256.f[0], u256.f[4]); -#endif -#ifdef AVX2 - union { - __m256 v1; - float f[8]; - } u256; - const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0); - __m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask); - __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1); - u256.v1 = _mm256_hadd_ps(tmp2, tmp2); - return ComplexF(u256.f[0], u256.f[4]); +#if defined(AVX1) || defined (AVX2) + vComplexF v1,v2; + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + permute(v2,v1,1); // avx 256; quad complex single + v1=v1+v2; + return ComplexF(v1.v[0],v1.v[1]); #endif #ifdef AVX512 return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v)); @@ -345,13 +328,10 @@ namespace Grid { // Conjugate /////////////////////// - friend inline vComplexF conj(const vComplexF &in){ + friend inline vComplexF conjugate(const vComplexF &in){ vComplexF ret ; vzero(ret); #if defined (AVX1)|| defined (AVX2) - cvec tmp; - tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi - ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); - + ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f)); #endif #ifdef SSE4 ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f)); @@ -433,10 +413,6 @@ namespace Grid { return *this; } - friend inline void permute(vComplexF &y,vComplexF b,int perm) - { - Gpermute(y,b,perm); - } /* friend inline void merge(vComplexF &y,std::vector &extracted) { @@ -460,7 +436,7 @@ namespace Grid { inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) { - return conj(l)*r; + return conjugate(l)*r; } inline void zeroit(vComplexF &z){ vzero(z);} diff --git a/lib/simd/Grid_vInteger.h b/lib/simd/Grid_vInteger.h index 63e8c720..ee4a3633 100644 --- a/lib/simd/Grid_vInteger.h +++ b/lib/simd/Grid_vInteger.h @@ -117,7 +117,7 @@ namespace Grid { }; /////////////////////////////////////////////// - // mult, sub, add, adj,conj, mac functions + // mult, sub, add, adj,conjugate, mac functions /////////////////////////////////////////////// friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);} diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h index dbfacb0c..83979d14 100644 --- a/lib/simd/Grid_vRealD.h +++ b/lib/simd/Grid_vRealD.h @@ -26,7 +26,7 @@ namespace Grid { friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);} friend inline vRealD adj(const vRealD &in) { return in; } - friend inline vRealD conj(const vRealD &in){ return in; } + friend inline vRealD conjugate(const vRealD &in){ return in; } friend inline void mac (vRealD &y,const vRealD a,const vRealD x){ #if defined (AVX1) || defined (SSE4) @@ -112,11 +112,12 @@ namespace Grid { // all subtypes; may not be a good assumption, but could // add the vector width as a template param for BG/Q for example //////////////////////////////////////////////////////////////////// - /* + friend inline void permute(vRealD &y,vRealD b,int perm) { Gpermute(y,b,perm); } + /* friend inline void merge(vRealD &y,std::vector &extracted) { Gmerge(y,extracted); @@ -209,48 +210,26 @@ namespace Grid { friend inline RealD Reduce(const vRealD & in) { -#if defined (SSE4) - // FIXME Hack - const RealD * ptr =(const RealD *) ∈ - RealD ret = 0; - for(int i=0;i(y,b,perm); } + /* friend inline void merge(vRealF &y,std::vector &extracted) { Gmerge(y,extracted); @@ -155,7 +156,6 @@ namespace Grid { Gextract(y,extracted); } */ - ///////////////////////////////////////////////////// // Broadcast a value across Nsimd copies. @@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){ } friend inline RealF Reduce(const vRealF & in) { -#if defined (SSE4) - // FIXME Hack - const RealF * ptr = (const RealF *) ∈ - RealF ret = 0; - for(int i=0;i inline Grid_simd< scalar_type, vector_type> innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r) { - return conj(l)*r; + return conjugate(l)*r; } template diff --git a/scripts/build-all b/scripts/build-all index d3a978b7..b97dca19 100755 --- a/scripts/build-all +++ b/scripts/build-all @@ -1,6 +1,6 @@ -#!/bin/bash +#!/bin/bash -e -DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi" +DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse" EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512" BLACK="\033[30m" RED="\033[31m" diff --git a/scripts/configure-all b/scripts/configure-all index 2f80b60a..f42b1960 100755 --- a/scripts/configure-all +++ b/scripts/configure-all @@ -1,6 +1,6 @@ #!/bin/bash -DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx" +DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse" for D in $DIRS do diff --git a/scripts/configure-commands b/scripts/configure-commands index 89b72294..c7c35e1c 100755 --- a/scripts/configure-commands +++ b/scripts/configure-commands @@ -34,6 +34,9 @@ icpc-avx512) icpc-mic) CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none ;; +clang-sse) +CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none + ;; clang-avx) CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none ;; @@ -47,16 +50,16 @@ clang-avx2-openmp) CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none ;; clang-avx-openmp-mpi) -CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi ;; clang-avx2-openmp-mpi) -CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi ;; clang-avx-mpi) -CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx " LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi ;; clang-avx2-mpi) -CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx " LIBS="-lgmp -lmpfr" --enable-comms=mpi +CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi ;; esac echo -e $NORMAL diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index 45778922..1aa4fc43 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -140,7 +140,7 @@ int main (int argc, char ** argv) // -=,+=,*=,() // add,+,sub,-,mult,mac,* -// adj,conj +// adj,conjugate // real,imag // transpose,transposeIndex // trace,traceIndex @@ -437,7 +437,7 @@ int main (int argc, char ** argv) for(int r=0;r<3;r++){ for(int c=0;c<3;c++){ diff =shifted1()()(r,c)-shifted2()()(r,c); - nn=real(conj(diff)*diff); + nn=real(conjugate(diff)*diff); if ( nn > 0 ) cout<<"Shift fail (shifted1/shifted2-ref) "< 0 ) cout<<"Shift rb fail (shifted3/shifted2-ref) "< +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + std::vector seeds({1,2,3,4}); + + GridSerialRNG fsRNG; fsRNG.SeedFixedIntegers(seeds); + GridParallelRNG fpRNG(&Grid); fpRNG.SeedFixedIntegers(seeds); + + vComplexF tmp; random(fsRNG,tmp); + std::cout<<"Random vComplexF (fixed seed)\n"<< tmp< void operator()(vec &rr,vec &i1,vec &i2) const { rr = conj(i1);} + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = conjugate(i1);} std::string name(void) const { return std::string("Conj"); } }; class funcAdj { @@ -49,6 +49,34 @@ public: template void operator()(vec &rr,vec &i1,vec &i2) const { rr = timesMinusI(i1);} std::string name(void) const { return std::string("timesMinusI"); } }; +class funcInnerProduct { +public: + funcInnerProduct() {}; + template void operator()(vec &rr,vec &i1,vec &i2) const { rr = innerProduct(i1,i2);} + std::string name(void) const { return std::string("innerProduct"); } +}; + +// FIXME still to test: +// +// innerProduct, +// norm2, +// Reduce, +// +// mac,mult,sub,add, vone,vzero,vcomplex_i, =zero, +// vset,vsplat,vstore,vstream,vload, scalar*vec, vec*scalar +// unary -, +// *= , -=, += +// outerproduct, +// zeroit +// permute + +class funcReduce { +public: + funcReduce() {}; +template void vfunc(reduce &rr,vec &i1,vec &i2) const { rr = Reduce(i1);} +template void sfunc(reduce &rr,scal &i1,scal &i2) const { rr = i1;} + std::string name(void) const { return std::string("Reduce"); } +}; template void Tester(const functor &func) @@ -96,8 +124,59 @@ void Tester(const functor &func) ok++; } } - if ( ok==0 ) std::cout << " OK!" < +void ReductionTester(const functor &func) +{ + GridSerialRNG sRNG; + sRNG.SeedRandomDevice(); + + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + reduced result(0); + reduced reference(0); + reduced tmp; + + std::vector > buf(3); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + + func.template vfunc(result,v_input1,v_input2); + + for(int i=0;i(tmp,input1[i],input2[i]); + reference+=tmp; + } + + std::cout << " " << func.name()< 1.0e-6 ){ // rounding is possible for reduce order + std::cout<< "*****" << std::endl; + std::cout<< abs(reference-result) << " " <(funcTimes()); Tester(funcConj()); Tester(funcAdj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); std::cout << "==================================="<< std::endl; std::cout << "Testing vComplexD "<(funcTimes()); Tester(funcConj()); Tester(funcAdj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); std::cout << "==================================="<< std::endl; std::cout << "Testing vRealF "<(funcMinus()); Tester(funcTimes()); Tester(funcAdj()); + Tester(funcConj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); std::cout << "==================================="<< std::endl; std::cout << "Testing vRealD "<(funcMinus()); Tester(funcTimes()); Tester(funcAdj()); + Tester(funcConj()); + Tester(funcInnerProduct()); + ReductionTester(funcReduce()); Grid_finalize(); } diff --git a/tests/Grid_stencil.cc b/tests/Grid_stencil.cc index 1fdb7265..9770ed7c 100644 --- a/tests/Grid_stencil.cc +++ b/tests/Grid_stencil.cc @@ -93,7 +93,7 @@ int main (int argc, char ** argv) for(int r=0;r<3;r++){ for(int c=0;c<3;c++){ diff =check()()(r,c)-bar()()(r,c); - double nn=real(conj(diff)*diff); + double nn=real(conjugate(diff)*diff); if ( nn > 0){ printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n", coor[0],coor[1],coor[2],coor[3],r,c, @@ -103,8 +103,8 @@ int main (int argc, char ** argv) real(bar()()(r,c)) ); } - snrmC=snrmC+real(conj(check()()(r,c))*check()()(r,c)); - snrmB=snrmB+real(conj(bar()()(r,c))*bar()()(r,c)); + snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c)); + snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c)); snrm=snrm+nn; }} diff --git a/tests/Makefile.am b/tests/Makefile.am index 82d1b3be..80f3a34c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -5,7 +5,7 @@ AM_LDFLAGS = -L$(top_builddir)/lib # # Test code # -bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez +bin_PROGRAMS = Grid_main Grid_stencil Grid_nersc_io Grid_cshift Grid_gamma Grid_simd Grid_rng Grid_remez Grid_rng_fixed Grid_main_SOURCES = Grid_main.cc Grid_main_LDADD = -lGrid @@ -13,6 +13,9 @@ Grid_main_LDADD = -lGrid Grid_rng_SOURCES = Grid_rng.cc Grid_rng_LDADD = -lGrid +Grid_rng_fixed_SOURCES = Grid_rng_fixed.cc +Grid_rng_fixed_LDADD = -lGrid + Grid_remez_SOURCES = Grid_remez.cc Grid_remez_LDADD = -lGrid diff --git a/tests/test_Grid_jacobi.cc b/tests/test_Grid_jacobi.cc index 95e808f1..0cf15a6f 100644 --- a/tests/test_Grid_jacobi.cc +++ b/tests/test_Grid_jacobi.cc @@ -186,7 +186,7 @@ int main (int argc, char ** argv) for(int r=0;r<3;r++){ for(int c=0;c<3;c++){ diff =check._internal._internal[r][c]-bar._internal._internal[r][c]; - double nn=real(conj(diff)*diff); + double nn=real(conjugate(diff)*diff); if ( nn > 0 ){ printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n", coor[0],coor[1],coor[2],coor[3],r,c,