From 8f5281563e66092010f7b83e2d8c73124f3b09e3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 9 Apr 2015 08:06:03 +0200 Subject: [PATCH] "where" and integer comparisons logic implemented for conditional assignment. LatticeCoordinate helper to get global (reduced) coordinate. Some more work of similar type perhaps needed, but the bulk of the required structure for masked array assignment is now in place. --- Grid.h | 2 +- Grid_Lattice.h | 36 ++-- Grid_QCD.h | 27 ++- Grid_comparison.h | 264 +++++++++++++++++++++++++ Grid_cshift_common.h | 1 - Grid_main.cc | 73 ++++++- Grid_math_type_mapper.h | 83 ++++++++ Grid_math_types.h | 414 +++++++++++++++++++++++++++++----------- Grid_predicated.h | 62 ++++++ Grid_simd.h | 90 +++++---- Grid_stencil.h | 352 +++++++++++++++++++++++++++++++++- Grid_vComplexD.h | 23 ++- Grid_vComplexF.h | 67 ++++++- Grid_vInteger.h | 69 ++++--- Grid_vRealD.h | 15 +- Grid_vRealF.h | 15 +- TODO | 67 +++++-- 17 files changed, 1420 insertions(+), 240 deletions(-) create mode 100644 Grid_comparison.h create mode 100644 Grid_math_type_mapper.h create mode 100644 Grid_predicated.h diff --git a/Grid.h b/Grid.h index 0a4024bf..caa4ed5e 100644 --- a/Grid.h +++ b/Grid.h @@ -41,12 +41,12 @@ #include #endif - #include #include #include #include #include +#include #include namespace Grid { diff --git a/Grid_Lattice.h b/Grid_Lattice.h index ebf68560..01b95755 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -19,16 +19,14 @@ public: typedef typename vobj::vector_type vector_type; public: - Lattice(GridBase *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); assert((((uint64_t)&_odata[0])&0xF) ==0); checkerboard=0; } - #include - + template friend void conformable(const Lattice &lhs,const Lattice &rhs); @@ -156,23 +154,23 @@ public: v_ptr[i]=drand48(); } }; - + // FIXME for debug; deprecate this friend void lex_sites(Lattice &l){ - Real *v_ptr = (Real *)&l._odata[0]; - size_t o_len = l._grid->oSites(); - size_t v_len = sizeof(vobj)/sizeof(vRealF); - size_t vec_len = vRealF::Nsimd(); + Real *v_ptr = (Real *)&l._odata[0]; + size_t o_len = l._grid->oSites(); + size_t v_len = sizeof(vobj)/sizeof(vRealF); + size_t vec_len = vRealF::Nsimd(); - for(int i=0;i &l){ // Zero mean, unit variance. 
@@ -195,7 +193,7 @@ public: } return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit behvour from correspond */+/- operation template inline Lattice &operator *=(const T &r) { *this = (*this)*r; @@ -351,7 +349,6 @@ public: inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); - #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs*rhs._odata[ss]; @@ -383,7 +380,7 @@ public: { Lattice ret(lhs._grid); #pragma omp parallel for - for(int ss=0;ssoSites(); ss++){ + for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]*rhs; } return ret; @@ -409,5 +406,6 @@ public: return ret; } + } #endif diff --git a/Grid_QCD.h b/Grid_QCD.h index 3e852633..1d52f911 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -45,7 +45,7 @@ namespace QCD { typedef Lattice LatticeComplex; - typedef Lattice LatticeInteger; // Predicates for "where" + typedef Lattice LatticeInteger; // Predicates for "where" typedef Lattice LatticeColourMatrix; typedef Lattice LatticeSpinMatrix; @@ -92,6 +92,31 @@ namespace QCD { } return ret; } + + // FIXME for debug; deprecate this + inline void LatticeCoordinate(LatticeInteger &l,int mu){ + GridBase *grid = l._grid; + int Nsimd = grid->iSites(); + std::vector gcoor; + std::vector mergebuf(Nsimd); + std::vector mergeptr(Nsimd); + for(int o=0;ooSites();o++){ + for(int i=0;iiSites();i++){ + // RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); + grid->RankIndexToGlobalCoor(0,o,i,gcoor); + mergebuf[i]=gcoor[mu]; + mergeptr[i]=&mergebuf[i]; + } + merge(l._odata[o],mergeptr); + } + }; + +#include + +#if 0 + +#endif + } //namespace QCD } // Grid #endif diff --git a/Grid_comparison.h b/Grid_comparison.h new file mode 100644 index 00000000..c890d342 --- /dev/null +++ b/Grid_comparison.h @@ -0,0 +1,264 @@ +#ifndef GRID_COMPARISON_H +#define GRID_COMPARISON_H +namespace Grid { + + // Generic list of functors + template class veq { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class vne { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class vlt { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class vle { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class vgt { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class vge { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + // Generic list of functors + template class seq { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class sne { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class slt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class sle { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class sgt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class sge { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Integer gets extra relational 
functions. Could also implement these for RealF, RealD etc.. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + template + inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs) + { + std::vector vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation + std::vector vrhs(vInteger::Nsimd()); + vInteger ret; + extract(lhs,vlhs); + extract(rhs,vrhs); + for(int s=0;s(),lhs,rhs); + } + inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sle(),lhs,rhs); + } + inline vInteger operator > (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sgt(),lhs,rhs); + } + inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sge(),lhs,rhs); + } + inline vInteger operator == (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(seq(),lhs,rhs); + } + inline vInteger operator != (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sne(),lhs,rhs); + } + + ////////////////////////////////////////////////////////////////////////// + // relational operators + // + // Support <,>,<=,>=,==,!= + // + //Query supporting bitwise &, |, ^, ! + //Query supporting logical &&, ||, + ////////////////////////////////////////////////////////////////////////// + template + inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); + } + return ret; + } + template + inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) + { + Lattice ret(lhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + template + inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + + // Less than + template + inline Lattice operator < (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const Lattice & lhs, const robj & rhs) { + return LSComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vlt(),lhs,rhs); + } + + // Less than equal + template + inline Lattice operator <= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const Lattice & lhs, const robj & rhs) { + return LSComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vle(),lhs,rhs); + } + + // Greater than + template + inline Lattice operator > (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const Lattice & lhs, const robj & rhs) { + return LSComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vgt(),lhs,rhs); + } + + + // Greater than equal + template + inline Lattice operator >= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const Lattice & lhs, const robj & rhs) { + return 
LSComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vge(),lhs,rhs); + } + + + // equal + template + inline Lattice operator == (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const Lattice & lhs, const robj & rhs) { + return LSComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const lobj & lhs, const Lattice & rhs) { + return SLComparison(veq(),lhs,rhs); + } + + + // not equal + template + inline Lattice operator != (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const Lattice & lhs, const robj & rhs) { + return LSComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vne(),lhs,rhs); + } + + +} +#endif diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h index 08e75cff..2910151c 100644 --- a/Grid_cshift_common.h +++ b/Grid_cshift_common.h @@ -265,7 +265,6 @@ friend void Copy_plane_permute(Lattice& lhs,Lattice &rhs, int dimens ////////////////////////////////////////////////////// // Local to node Cshift ////////////////////////////////////////////////////// - friend void Cshift_local(Lattice& ret,Lattice &rhs,int dimension,int shift) { int sshift[2]; diff --git a/Grid_main.cc b/Grid_main.cc index 62356436..1676a07b 100644 --- a/Grid_main.cc +++ b/Grid_main.cc @@ -19,10 +19,10 @@ int main (int argc, char ** argv) std::vector simd_layout(4); std::vector mpi_layout(4); - mpi_layout[0]=2; - mpi_layout[1]=2; - mpi_layout[2]=2; - mpi_layout[3]=2; + mpi_layout[0]=1; + mpi_layout[1]=1; + mpi_layout[2]=1; + mpi_layout[3]=1; #ifdef AVX512 for(int omp=128;omp<236;omp+=16){ @@ -121,12 +121,34 @@ int main (int argc, char ** argv) // Non-lattice (const objects) * Lattice ColourMatrix cm; SpinColourMatrix scm; - + vSpinColourMatrix vscm; + Complex cplx(1.0); + Integer myint=1; + double mydouble=1.0; + + // vSpinColourMatrix vscm; scMat = cMat*scMat; scm = cm * scm; // SpinColourMatrix = ColourMatrix * SpinColourMatrix scm = scm *cm; // SpinColourMatrix = SpinColourMartix * ColourMatrix scm = GammaFive * scm ; // SpinColourMatrix = SpinMatrix * SpinColourMatrix scm = scm* GammaFive ; // SpinColourMatrix = SpinColourMatrix * SpinMatrix + + scm = scm*cplx; + vscm = vscm*cplx; + scMat = scMat*cplx; + + scm = cplx*scm; + vscm = cplx*vscm; + scMat = cplx*scMat; + scm = myint*scm; + vscm = myint*vscm; + scMat = scMat*myint; + + scm = scm*mydouble; + vscm = vscm*mydouble; + scMat = scMat*mydouble; + scMat = mydouble*scMat; + cMat = mydouble*cMat; sMat = adj(sMat); // LatticeSpinMatrix adjoint sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix @@ -160,7 +182,35 @@ int main (int argc, char ** argv) */ lex_sites(Foo); + Integer mm[4]; + mm[0]=1; + mm[1]=Fine._rdimensions[0]; + mm[2]=Fine._ldimensions[0]*Fine._ldimensions[1]; + mm[3]=Fine._ldimensions[0]*Fine._ldimensions[1]*Fine._ldimensions[2]; + LatticeInteger lex(&Fine); + lex=zero; + for(int d=0;d<4;d++){ + LatticeInteger coor(&Fine); + LatticeCoordinate(coor,d); + lex = lex + coor*mm[d]; + } + Bar = zero; + Bar = where(lex<10,Foo,Bar); + { + std::vector coor(4); + for(coor[3]=0;coor[3] diff; @@ -305,7 +361,8 @@ int main (int argc, char ** argv) double nn=Ttr._internal._internal; if ( nn > 0 ) cout<<"Shift real trace fail "< >::scalar_type == ComplexD. 
+////////////////////////////////////////////////////////////////////////////////// + + template class GridTypeMapper { + public: + typedef typename T::scalar_type scalar_type; + typedef typename T::vector_type vector_type; + typedef typename T::tensor_reduced tensor_reduced; + }; + +////////////////////////////////////////////////////////////////////////////////// +// Recursion stops with these template specialisations +////////////////////////////////////////////////////////////////////////////////// + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef RealF vector_type; + typedef RealF tensor_reduced ; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef RealD vector_type; + typedef RealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef ComplexF vector_type; + typedef ComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef ComplexD vector_type; + typedef ComplexD tensor_reduced; + }; + + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef vRealF vector_type; + typedef vRealF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef vRealD vector_type; + typedef vRealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef vComplexF vector_type; + typedef vComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef vComplexD vector_type; + typedef vComplexD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef Integer scalar_type; + typedef vInteger vector_type; + typedef vInteger tensor_reduced; + }; + + // Again terminate the recursion. + inline vRealD TensorRemove(vRealD arg){ return arg;} + inline vRealF TensorRemove(vRealF arg){ return arg;} + inline vComplexF TensorRemove(vComplexF arg){ return arg;} + inline vComplexD TensorRemove(vComplexD arg){ return arg;} + inline vInteger TensorRemove(vInteger arg){ return arg;} + +} + +#endif diff --git a/Grid_math_types.h b/Grid_math_types.h index 25524966..a2a3f97c 100644 --- a/Grid_math_types.h +++ b/Grid_math_types.h @@ -1,63 +1,11 @@ #ifndef GRID_MATH_TYPES_H #define GRID_MATH_TYPES_H + +#include + namespace Grid { - -////////////////////////////////////////////////////////////////////////////////// -// Want to recurse: GridTypeMapper >::scalar_type == ComplexD. 
-////////////////////////////////////////////////////////////////////////////////// - - template class GridTypeMapper { - public: - typedef typename T::scalar_type scalar_type; - typedef typename T::vector_type vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef RealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef RealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef ComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef ComplexD vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef vRealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef vRealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef vComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef vComplexD vector_type; - }; - - - /////////////////////////////////////////////////// // Scalar, Vector, Matrix objects. // These can be composed to form tensor products of internal indices. @@ -70,9 +18,16 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iScalar(){}; + + iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type + iScalar(Zero &z){ *this = zero; }; + iScalar & operator= (const Zero &hero){ zeroit(*this); return *this; @@ -80,22 +35,27 @@ public: friend void zeroit(iScalar &that){ zeroit(that._internal); } - friend void permute(iScalar &out,iScalar &in,int permutetype){ + friend void permute(iScalar &out,const iScalar &in,int permutetype){ permute(out._internal,in._internal,permutetype); } - friend void extract(iScalar &in,std::vector &out){ + friend void extract(const iScalar &in,std::vector &out){ extract(in._internal,out); // extract advances the pointers in out } friend void merge(iScalar &in,std::vector &out){ merge(in._internal,out); // extract advances the pointers in out } + friend inline iScalar::vector_type TensorRemove(iScalar arg) + { + return TensorRemove(arg._internal); + } + // Unary negation friend inline iScalar operator -(const iScalar &r) { iScalar ret; ret._internal= -r._internal; return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour inline iScalar &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -117,6 +77,9 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iVector(Zero &z){ *this = zero; }; iVector() {}; @@ -129,12 +92,12 @@ public: zeroit(that._internal[i]); } } - friend void permute(iVector &out,iVector &in,int permutetype){ + friend void permute(iVector &out,const iVector &in,int permutetype){ for(int i=0;i &in,std::vector &out){ + friend void extract(const iVector &in,std::vector &out){ for(int i=0;i &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -163,10 +126,8 @@ public: *this = (*this)+r; return *this; } - }; - template class iMatrix { public: @@ -174,62 +135,64 @@ public: 
typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; - iMatrix(Zero &z){ *this = zero; }; - iMatrix() {}; - iMatrix & operator= (Zero &hero){ - zeroit(*this); - return *this; - } - friend void zeroit(iMatrix &that){ - for(int i=0;i &out,iMatrix &in,int permutetype){ - for(int i=0;i & operator= (Zero &hero){ + zeroit(*this); + return *this; + } + friend void zeroit(iMatrix &that){ + for(int i=0;i &out,const iMatrix &in,int permutetype){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i operator -(const iMatrix &r) { - iMatrix ret; - for(int i=0;i - inline iMatrix &operator *=(const T &r) { - *this = (*this)*r; - return *this; - } - template - inline iMatrix &operator -=(const T &r) { - *this = (*this)-r; - return *this; - } - template - inline iMatrix &operator +=(const T &r) { - *this = (*this)+r; - return *this; - } + }} + } + // Unary negation + friend inline iMatrix operator -(const iMatrix &r) { + iMatrix ret; + for(int i=0;i + inline iMatrix &operator *=(const T &r) { + *this = (*this)*r; + return *this; + } + template + inline iMatrix &operator -=(const T &r) { + *this = (*this)-r; + return *this; + } + template + inline iMatrix &operator +=(const T &r) { + *this = (*this)+r; + return *this; + } }; @@ -642,7 +605,8 @@ iVector operator * (const iVector& lhs,const iScalar& r // mat x vec = vec // vec x scal = vec // scal x vec = vec - + // + // We can special case scalar_type ?? template inline auto operator * (const iScalar& lhs,const iScalar& rhs) -> iScalar { @@ -715,6 +679,229 @@ auto operator * (const iVector& lhs,const iScalar& rhs) -> iVector inline iScalar operator * (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iScalar operator * (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,const typename iScalar::scalar_type rhs) +{ + typename iVector::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iVector operator * (const typename iScalar::scalar_type lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,const typename iScalar::scalar_type &rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iMatrix operator * (const typename iScalar::scalar_type & lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (double lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (double lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,double rhs) +{ + typename 
iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (double lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (Integer lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (Integer lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (Integer lhs,const iMatrix& rhs) { return rhs*lhs; } + + + +/////////////////////////////////////////////////////////////////////////////////////////////// +// addition by fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iScalar operator + (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iMatrix operator + (const typename iScalar::scalar_type lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (double lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (double lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (Integer lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (Integer lhs,const iMatrix& rhs) { return rhs+lhs; } + + 
+/////////////////////////////////////////////////////////////////////////////////////////////// +// subtraction of fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iScalar operator - (const typename iScalar::scalar_type lhs,const iScalar& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iMatrix operator - (const typename iScalar::scalar_type lhs,const iMatrix& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (double lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (double lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (Integer lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} +template inline iMatrix operator - (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (Integer lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + + + + /////////////////////////////////////////////////////////////////////////////////////// // localInnerProduct Scalar x Scalar -> Scalar // localInnerProduct Vector x Vector -> Scalar @@ -907,6 +1094,7 @@ inline auto trace(const iScalar &arg) -> iScalar +inline void where(Lattice &ret,const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + conformable(iftrue,ret); + + GridBase *grid=iftrue._grid; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const int Nsimd = grid->Nsimd(); + const int words = sizeof(vobj)/sizeof(vector_type); + + std::vector mask(Nsimd); + std::vector > truevals (Nsimd,std::vector(words) ); + std::vector 
> falsevals(Nsimd,std::vector(words) ); + std::vector pointers(Nsimd); + +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + + for(int s=0;s +inline Lattice where(const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + + Lattice ret(iftrue._grid); + + where(ret,predicate,iftrue,iffalse); + + return ret; +} + +#endif diff --git a/Grid_simd.h b/Grid_simd.h index 53cc8909..1229d328 100644 --- a/Grid_simd.h +++ b/Grid_simd.h @@ -11,28 +11,15 @@ // Vector types are arch dependent //////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////// - // SIMD Alignment controls - //////////////////////////////////////////////////////////// -#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED -#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A))) -#else -#define ALIGN_DIRECTIVE(A) __declspec(align(A)) -#endif #ifdef SSE2 #include -#define SIMDalign ALIGN_DIRECTIVE(16) #endif - #if defined(AVX1) || defined (AVX2) #include -#define SIMDalign ALIGN_DIRECTIVE(32) #endif - #ifdef AVX512 #include -#define SIMDalign ALIGN_DIRECTIVE(64) #endif namespace Grid { @@ -137,41 +124,66 @@ namespace Grid { // Generic extract/merge/permute ///////////////////////////////////////////////////////////////// template -inline void Gextract(vsimd &y,std::vector &extracted){ -#if 1 +inline void Gextract(const vsimd &y,std::vector &extracted){ // FIXME: bounce off stack is painful // temporary hack while I figure out better way. // There are intrinsics to do this work without the storage. - int Nsimd = extracted.size(); - { - std::vector > buf(Nsimd); - vstore(y,&buf[0]); - for(int i=0;i > buf(Nsimd); + vstore(y,&buf[0]); + for(int i=0;i inline void Gmerge(vsimd &y,std::vector &extracted){ -#if 1 - int Nsimd = extracted.size(); + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + std::vector buf(Nsimd); - for(int i=0;i +inline void Gextract(const vsimd &y,std::vector &extracted){ + // FIXME: bounce off stack is painful + // temporary hack while I figure out better way. + // There are intrinsics to do this work without the storage. + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector > buf(Nsimd); + + vstore(y,&buf[0]); + + for(int i=0;i +inline void Gmerge(vsimd &y,std::vector &extracted){ + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector buf(Nsimd); + for(int i=0;i &extracted){ // Permute 4 possible on half precision @512bit vectors. ////////////////////////////////////////////////////////// template -inline void Gpermute(vsimd &y,vsimd b,int perm){ +inline void Gpermute(vsimd &y,const vsimd &b,int perm){ switch (perm){ #if defined(AVX1)||defined(AVX2) // 8x32 bits=>3 permutes @@ -214,10 +226,10 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){ }; }; +#include #include #include #include #include -#include #endif diff --git a/Grid_stencil.h b/Grid_stencil.h index d93204fc..4d73f436 100644 --- a/Grid_stencil.h +++ b/Grid_stencil.h @@ -8,5 +8,355 @@ // Lattice could also allocate haloes which get used for stencil code. // // Grid could create a neighbour index table for a given stencil. -// Could also implement CovariantCshift. +// +// Could also implement CovariantCshift, to fuse the loops and enhance performance. 
+// +// +// General stencil computation: +// +// Generic services +// 0) Prebuild neighbour tables +// 1) Compute sizes of all haloes/comms buffers; allocate them. +// +// 2) Gather all faces, and communicate. +// 3) Loop over result sites, giving nbr index/offnode info for each +// +// Could take a +// SpinProjectFaces +// start comms +// complete comms +// Reconstruct Umu +// +// Approach. +// ////////////////////////////////////////////////////////////////////////////////////////// + +namespace Grid { + + class Stencil { + public: + + Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances); + + void Stencil_local (int dimension,int shift,int cbmask); + void Stencil_comms (int dimension,int shift,int cbmask); + void Stencil_comms_simd(int dimension,int shift,int cbmask); + // Will need to implement actions for + // + Copy_plane; + Copy_plane_permute; + Gather_plane; + + + + // The offsets to all neibours in stencil in each direction + int _checkerboard; + int _npoints; // Move to template param? + GridBase * _grid; + + // Store these as SIMD Integer needed + // + // std::vector< iVector > _offsets; + // std::vector< iVector > _local; + // std::vector< iVector > _comm_buf_size; + // std::vector< iVector > _permute; + + std::vector > _offsets; + std::vector > _local; + std::vector _comm_buf_size; + std::vector _permute; + + }; + + Stencil::Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances){ + + _npoints = npoints; + _grid = grid; + + for(int i=0;i_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int sshift[2]; + + if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_local(dimension,shift,0x3); + } else { + Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_local(dimension,shift,0x2);// both with block stride loop iteration + } + } else if ( splice_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_comms_simd(dimension,shift,0x3); + } else { + Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration + } + } else { + // Cshift_comms(ret,rhs,dimension,shift); + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + if ( sshift[0] == sshift[1] ) { + Stencil_comms(dimension,shift,0x3); + } else { + Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration + } + } + } + } + + + void Stencil::Stencil_local (int dimension,int shift,int cbmask) + { + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. 
+ shift = (shift+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + int permute_type=_grid->PermuteType(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? 1 : 0; + + int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int permute_slice=0; + if(permute_dim){ + int wrap = sshift/rd; + int num = sshift%rd; + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + } + + if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type); + else Copy_plane (dimension,x,sx,cbmask); + + } + } + + void Stencil::Stencil_comms (int dimension,int shift,int cbmask) + { + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=_grid; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + // FIXME: Do something with buffer_size?? + + int cb= (cbmask==0x2)? 1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + for(int x=0;x= rd ); + int sx = (x+sshift)%rd; + int comm_proc = (x+sshift)/rd; + + if (!offnode) { + + Copy_plane(dimension,x,sx,cbmask); + + } else { + + int words = send_buf.size(); + if (cbmask != 0x3) words=words>>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (dimension,sx,cbmask); + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + /* + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + */ + Scatter_plane_simple (dimension,x,cbmask); + } + } + } + + void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask) + { + GridBase *grid=_grid; + const int Nsimd = _grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // FIXME do something with buffer size + + std::vector pointers(Nsimd); // + std::vector rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? 
1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + std::vector comm_offnode(simd_layout); + std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension + std::vector icoor(grid->Nd()); + + for(int x=0;x= ld; + comm_any = comm_any | comm_offnode[s]; + comm_proc[s] = shifted_x/ld; + } + + int o = 0; + int bo = x*grid->_ostride[dimension]; + int sx = (x+sshift)%rd; + + if ( comm_any ) { + + for(int i=0;iiCoorFromIindex(icoor,i); + s = icoor[dimension]; + + if(comm_offnode[s]){ + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); + + /* + grid->SendToRecvFrom((void *)&send_buf_extract[i][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + */ + + rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; + + } else { + + rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; + + } + + } + + // Permute by swizzling pointers in merge + int permute_slice=0; + int lshift=sshift%ld; + int wrap =lshift/rd; + int num =lshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + int toggle_bit = (Nsimd>>(permute_type+1)); + int PermuteMap; + for(int i=0;i(y,extracted); } - friend inline void extract(vComplexD &y,std::vector &extracted) + friend inline void extract(const vComplexD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -184,6 +198,11 @@ namespace Grid { /////////////////////// // Splat /////////////////////// + friend inline void vsplat(vComplexD &ret,ComplexD c){ + float a= real(c); + float b= imag(c); + vsplat(ret,a,b); + } friend inline void vsplat(vComplexD &ret,double rl,double ig){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(ig,rl,ig,rl); @@ -215,7 +234,7 @@ namespace Grid { #endif } -friend inline void vstore(vComplexD &ret, ComplexD *a){ +friend inline void vstore(const vComplexD &ret, ComplexD *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd((double *)a,ret.v); #endif diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index b7fb3d6a..b6b7ebe9 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -20,6 +20,12 @@ namespace Grid { return (*this); } vComplexF(){}; + vComplexF(ComplexF a){ + vsplat(*this,a); + }; + vComplexF(double a){ + vsplat(*this,ComplexF(a)); + }; /////////////////////////////////////////////// // mac, mult, sub, add, adj @@ -161,7 +167,7 @@ namespace Grid { vsplat(ret,a,b); } -friend inline void vstore(vComplexF &ret, ComplexF *a){ +friend inline void vstore(const vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps((float *)a,ret.v); #endif @@ -210,27 +216,47 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ #endif } - friend inline vComplexF operator * (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va*b; } friend inline vComplexF operator * (vComplexF b,const Complex &a){ + return a*b; + } + + /* + template + friend inline vComplexF operator * (vComplexF b,const real &a){ vComplexF va; - vsplat(va,a); + Complex ca(a,0); + vsplat(va,ca); return va*b; } + template + friend inline vComplexF operator * (const real &a,vComplexF b){ + return a*b; + } + friend inline vComplexF operator + (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va+b; } friend inline vComplexF 
operator + (vComplexF b,const Complex &a){ - vComplexF va; - vsplat(va,a); - return b+va; + return a+b; } + template + friend inline vComplexF operator + (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va+b; + } + template + friend inline vComplexF operator + (const real &a,vComplexF b){ + return a+b; + } friend inline vComplexF operator - (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); @@ -241,7 +267,24 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ vsplat(va,a); return b-va; } - // NB: Template the following on "type Complex" and then implement *,+,- for ComplexF, ComplexD, RealF, RealD above to + template + friend inline vComplexF operator - (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return b-va; + } + template + friend inline vComplexF operator - (const real &a,vComplexF b){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va-b; + } + */ + + // NB: Template the following on "type Complex" and then implement *,+,- for + // ComplexF, ComplexD, RealF, RealD above to // get full generality of binops with scalars. friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } @@ -304,7 +347,15 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ { Gmerge(y,extracted); } - friend inline void extract(vComplexF &y,std::vector &extracted) + friend inline void extract(const vComplexF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexF &y,std::vector &extracted) { Gextract(y,extracted); } diff --git a/Grid_vInteger.h b/Grid_vInteger.h index 6ddce191..82adbd8e 100644 --- a/Grid_vInteger.h +++ b/Grid_vInteger.h @@ -10,7 +10,7 @@ namespace Grid { typedef uint32_t Integer; - class vInteger { + class vInteger { protected: public: @@ -21,6 +21,13 @@ namespace Grid { typedef Integer scalar_type; vInteger(){}; + vInteger & operator = (const Zero & z){ + vzero(*this); + return (*this); + } + vInteger(Integer a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -166,18 +173,18 @@ namespace Grid { #endif } -friend inline void vstore(vInteger &ret, Integer *a){ + friend inline void vstore(const vInteger &ret, Integer *a){ #if defined (AVX1)|| defined (AVX2) - _mm256_store_si256((__m256i*)a,ret.v); + _mm256_store_si256((__m256i*)a,ret.v); #endif #ifdef SSE2 - _mm_store_si128(a,ret.v); + _mm_store_si128(a,ret.v); #endif #ifdef AVX512 - _mm512_store_si512(a,ret.v); + _mm512_store_si512(a,ret.v); #endif #ifdef QPX - assert(0); + assert(0); #endif } @@ -185,6 +192,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { _mm_prefetch((const char*)&v.v,_MM_HINT_T0); } + // Unary negation friend inline vInteger operator -(const vInteger &r) { vInteger ret; @@ -210,9 +218,32 @@ friend inline void vstore(vInteger &ret, Integer *a){ *this = *this-r; return *this; } + + friend inline void permute(vInteger &y,const vInteger b,int perm) + { + Gpermute(y,b,perm); + } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); 
+ } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); + } + + public: - static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} - }; + static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);} + }; inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; } @@ -222,27 +253,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { return l*r; } - - - class vIntegerF : public vInteger - { - public: - static inline int Nsimd(void) { return sizeof(ivec)/sizeof(float);} - - friend inline void permute(vIntegerF &y,vIntegerF b,int perm) - { - Gpermute(y,b,perm); - } - friend inline void merge(vIntegerF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(vIntegerF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - }; - + } #endif diff --git a/Grid_vRealD.h b/Grid_vRealD.h index 13ceedbe..a5b59be3 100644 --- a/Grid_vRealD.h +++ b/Grid_vRealD.h @@ -13,6 +13,9 @@ namespace Grid { typedef RealD scalar_type; vRealD(){}; + vRealD(RealD a){ + vsplat(*this,a); + }; friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} @@ -112,7 +115,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealD &y,std::vector &extracted) + friend inline void extract(const vRealD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -157,7 +168,7 @@ namespace Grid { #endif } - friend inline void vstore(vRealD &ret, double *a){ + friend inline void vstore(const vRealD &ret, double *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd(a,ret.v); #endif diff --git a/Grid_vRealF.h b/Grid_vRealF.h index 185f4da8..0fe68f43 100644 --- a/Grid_vRealF.h +++ b/Grid_vRealF.h @@ -14,6 +14,9 @@ namespace Grid { typedef RealF scalar_type; vRealF(){}; + vRealF(RealF a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -133,7 +136,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealF &y,std::vector &extracted) + friend inline void extract(const vRealF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealF &y,std::vector &extracted) { Gextract(y,extracted); } @@ -180,7 +191,7 @@ namespace Grid { //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch //////////////////////////////////////////////////////////////////////// -friend inline void vstore(vRealF &ret, float *a){ +friend inline void vstore(const vRealF &ret, float *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps(a,ret.v); #endif diff --git a/TODO b/TODO index d2373ba1..fb7d177a 100644 --- a/TODO +++ b/TODO @@ -1,29 +1,68 @@ + * FIXME audit * Remove vload/store etc.. * Replace vset with a call to merge. * Replace vset with a call to merge. 
+* Const audit
+* extract / merge extra implementation removal
-* Conditional execution Subset, where etc...
-* Coordinate information, integers etc...
-* Integer type padding/union to vector.
-* LatticeCoordinate[mu]
+* Conditional execution, where etc...       -----DONE, simple test
+* Integer relational support                -----DONE
+* Coordinate information, integers etc...   -----DONE
+* Integer type padding/union to vector.     -----DONE
+* LatticeCoordinate[mu]                     -----DONE
-* Optimise the extract/merge SIMD routines
-* Broadcast, reduction tests.
+* Stencil operator support                  -----Initial thoughts
+
+* Subset support, slice sums etc...         -----Only need slice sum?
+                                            -----Generic cartesian subslicing?
+                                            -----Array ranges / boost extents?
+                                            -----Multigrid grid transferral.
+                                            -----Suggests generalised cartesian subblocking
+                                                 sums, returning modified grid.
+
+   Two classes of subset;
+i)  red black parity subsetting.
+    (pick checkerboard).
+
+ii) Need to be able to project one Grid to another Grid.
+    Generic concept is to subdivide (based on RD so applies to red/black or full).
+    Return a type on SUB-grid from CellSum TOP-grid
+    SUB-grid need not distribute but be replicated in some dims if that is how the
+    cartesian communicator works.
+
+iii) No general permutation map.
+
+
+* Consider switch std::vector to boost arrays.
+  boost::multi_array A()... to replace multi1d, multi2d etc..
+
+*? Cell definition <-> sliceSum.
+   ? Replicated arrays.
+
+
+
+* Check for missing functionality - partially audited against QDP++ layout
+
+* Optimise the extract/merge SIMD routines; Azusa??
+
+  -- I have collated into single location at least.
+
+  -- Need to use _mm_*insert/extract routines.
+
+* Conformable test in Cshift routines.
+
+* Gamma/Dirac structures
+
+* Fourspin, two spin project
+
+* Broadcast, reduction tests. innerProduct, localInnerProduct
 * QDP++ regression suite and comparative benchmark
 * NERSC Lattice loading, plaquette test
-* Conformable test in Cshift routines.
-
-* Gamma/Dirac structures
-* Fourspin, two spin project
-
-* Stencil operator support
-
-* Check for missing functionality
 * I/O support - MPI IO?
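
For orientation, a minimal sketch of how the new pieces combine for masked array assignment: LatticeCoordinate builds a per-site integer, the integer relational operators form a LatticeInteger predicate, and where() performs the conditional copy. It assumes the GridCartesian "Fine" and the LatticeColourMatrix fields Foo and Bar constructed in the Grid_main.cc hunk above; the stride array mm[] follows that test code and all names are taken from it.

  Integer mm[4];
  mm[0]=1;
  mm[1]=Fine._rdimensions[0];
  mm[2]=Fine._ldimensions[0]*Fine._ldimensions[1];
  mm[3]=Fine._ldimensions[0]*Fine._ldimensions[1]*Fine._ldimensions[2];

  LatticeInteger lex(&Fine);               // lexicographic site index
  lex = zero;
  for(int d=0;d<4;d++){
    LatticeInteger coor(&Fine);
    LatticeCoordinate(coor,d);             // global (reduced) coordinate in direction d
    lex = lex + coor*mm[d];
  }

  Bar = zero;
  Bar = where(lex<10,Foo,Bar);             // lex<10 yields a LatticeInteger predicate;
                                           // where() assigns Foo only on the selected sites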