diff --git a/Grid.h b/Grid.h index 0a4024bf..caa4ed5e 100644 --- a/Grid.h +++ b/Grid.h @@ -41,12 +41,12 @@ #include #endif - #include #include #include #include #include +#include #include namespace Grid { diff --git a/Grid_Lattice.h b/Grid_Lattice.h index ebf68560..01b95755 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -19,16 +19,14 @@ public: typedef typename vobj::vector_type vector_type; public: - Lattice(GridBase *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); assert((((uint64_t)&_odata[0])&0xF) ==0); checkerboard=0; } - #include - + template friend void conformable(const Lattice &lhs,const Lattice &rhs); @@ -156,23 +154,23 @@ public: v_ptr[i]=drand48(); } }; - + // FIXME for debug; deprecate this friend void lex_sites(Lattice &l){ - Real *v_ptr = (Real *)&l._odata[0]; - size_t o_len = l._grid->oSites(); - size_t v_len = sizeof(vobj)/sizeof(vRealF); - size_t vec_len = vRealF::Nsimd(); + Real *v_ptr = (Real *)&l._odata[0]; + size_t o_len = l._grid->oSites(); + size_t v_len = sizeof(vobj)/sizeof(vRealF); + size_t vec_len = vRealF::Nsimd(); - for(int i=0;i &l){ // Zero mean, unit variance. @@ -195,7 +193,7 @@ public: } return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit behvour from correspond */+/- operation template inline Lattice &operator *=(const T &r) { *this = (*this)*r; @@ -351,7 +349,6 @@ public: inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); - #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs*rhs._odata[ss]; @@ -383,7 +380,7 @@ public: { Lattice ret(lhs._grid); #pragma omp parallel for - for(int ss=0;ssoSites(); ss++){ + for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]*rhs; } return ret; @@ -409,5 +406,6 @@ public: return ret; } + } #endif diff --git a/Grid_QCD.h b/Grid_QCD.h index 3e852633..1d52f911 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -45,7 +45,7 @@ namespace QCD { typedef Lattice LatticeComplex; - typedef Lattice LatticeInteger; // Predicates for "where" + typedef Lattice LatticeInteger; // Predicates for "where" typedef Lattice LatticeColourMatrix; typedef Lattice LatticeSpinMatrix; @@ -92,6 +92,31 @@ namespace QCD { } return ret; } + + // FIXME for debug; deprecate this + inline void LatticeCoordinate(LatticeInteger &l,int mu){ + GridBase *grid = l._grid; + int Nsimd = grid->iSites(); + std::vector gcoor; + std::vector mergebuf(Nsimd); + std::vector mergeptr(Nsimd); + for(int o=0;ooSites();o++){ + for(int i=0;iiSites();i++){ + // RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); + grid->RankIndexToGlobalCoor(0,o,i,gcoor); + mergebuf[i]=gcoor[mu]; + mergeptr[i]=&mergebuf[i]; + } + merge(l._odata[o],mergeptr); + } + }; + +#include + +#if 0 + +#endif + } //namespace QCD } // Grid #endif diff --git a/Grid_comparison.h b/Grid_comparison.h new file mode 100644 index 00000000..c890d342 --- /dev/null +++ b/Grid_comparison.h @@ -0,0 +1,264 @@ +#ifndef GRID_COMPARISON_H +#define GRID_COMPARISON_H +namespace Grid { + + // Generic list of functors + template class veq { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class vne { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class vlt { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class vle { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class vgt { + 
public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class vge { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + // Generic list of functors + template class seq { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class sne { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class slt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class sle { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class sgt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class sge { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Integer gets extra relational functions. Could also implement these for RealF, RealD etc.. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + template + inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs) + { + std::vector vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation + std::vector vrhs(vInteger::Nsimd()); + vInteger ret; + extract(lhs,vlhs); + extract(rhs,vrhs); + for(int s=0;s(),lhs,rhs); + } + inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sle(),lhs,rhs); + } + inline vInteger operator > (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sgt(),lhs,rhs); + } + inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sge(),lhs,rhs); + } + inline vInteger operator == (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(seq(),lhs,rhs); + } + inline vInteger operator != (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sne(),lhs,rhs); + } + + ////////////////////////////////////////////////////////////////////////// + // relational operators + // + // Support <,>,<=,>=,==,!= + // + //Query supporting bitwise &, |, ^, ! 
+ //Query supporting logical &&, ||, + ////////////////////////////////////////////////////////////////////////// + template + inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); + } + return ret; + } + template + inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) + { + Lattice ret(lhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + template + inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + + // Less than + template + inline Lattice operator < (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const Lattice & lhs, const robj & rhs) { + return LSComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vlt(),lhs,rhs); + } + + // Less than equal + template + inline Lattice operator <= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const Lattice & lhs, const robj & rhs) { + return LSComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vle(),lhs,rhs); + } + + // Greater than + template + inline Lattice operator > (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const Lattice & lhs, const robj & rhs) { + return LSComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vgt(),lhs,rhs); + } + + + // Greater than equal + template + inline Lattice operator >= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const Lattice & lhs, const robj & rhs) { + return LSComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vge(),lhs,rhs); + } + + + // equal + template + inline Lattice operator == (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const Lattice & lhs, const robj & rhs) { + return LSComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const lobj & lhs, const Lattice & rhs) { + return SLComparison(veq(),lhs,rhs); + } + + + // not equal + template + inline Lattice operator != (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const Lattice & lhs, const robj & rhs) { + return LSComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vne(),lhs,rhs); + } + + +} +#endif diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h index 08e75cff..2910151c 100644 --- a/Grid_cshift_common.h +++ b/Grid_cshift_common.h @@ -265,7 +265,6 @@ friend void Copy_plane_permute(Lattice& lhs,Lattice &rhs, int dimens ////////////////////////////////////////////////////// // Local to node Cshift 
////////////////////////////////////////////////////// - friend void Cshift_local(Lattice& ret,Lattice &rhs,int dimension,int shift) { int sshift[2]; diff --git a/Grid_main.cc b/Grid_main.cc index 62356436..1676a07b 100644 --- a/Grid_main.cc +++ b/Grid_main.cc @@ -19,10 +19,10 @@ int main (int argc, char ** argv) std::vector simd_layout(4); std::vector mpi_layout(4); - mpi_layout[0]=2; - mpi_layout[1]=2; - mpi_layout[2]=2; - mpi_layout[3]=2; + mpi_layout[0]=1; + mpi_layout[1]=1; + mpi_layout[2]=1; + mpi_layout[3]=1; #ifdef AVX512 for(int omp=128;omp<236;omp+=16){ @@ -121,12 +121,34 @@ int main (int argc, char ** argv) // Non-lattice (const objects) * Lattice ColourMatrix cm; SpinColourMatrix scm; - + vSpinColourMatrix vscm; + Complex cplx(1.0); + Integer myint=1; + double mydouble=1.0; + + // vSpinColourMatrix vscm; scMat = cMat*scMat; scm = cm * scm; // SpinColourMatrix = ColourMatrix * SpinColourMatrix scm = scm *cm; // SpinColourMatrix = SpinColourMartix * ColourMatrix scm = GammaFive * scm ; // SpinColourMatrix = SpinMatrix * SpinColourMatrix scm = scm* GammaFive ; // SpinColourMatrix = SpinColourMatrix * SpinMatrix + + scm = scm*cplx; + vscm = vscm*cplx; + scMat = scMat*cplx; + + scm = cplx*scm; + vscm = cplx*vscm; + scMat = cplx*scMat; + scm = myint*scm; + vscm = myint*vscm; + scMat = scMat*myint; + + scm = scm*mydouble; + vscm = vscm*mydouble; + scMat = scMat*mydouble; + scMat = mydouble*scMat; + cMat = mydouble*cMat; sMat = adj(sMat); // LatticeSpinMatrix adjoint sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix @@ -160,7 +182,35 @@ int main (int argc, char ** argv) */ lex_sites(Foo); + Integer mm[4]; + mm[0]=1; + mm[1]=Fine._rdimensions[0]; + mm[2]=Fine._ldimensions[0]*Fine._ldimensions[1]; + mm[3]=Fine._ldimensions[0]*Fine._ldimensions[1]*Fine._ldimensions[2]; + LatticeInteger lex(&Fine); + lex=zero; + for(int d=0;d<4;d++){ + LatticeInteger coor(&Fine); + LatticeCoordinate(coor,d); + lex = lex + coor*mm[d]; + } + Bar = zero; + Bar = where(lex<10,Foo,Bar); + { + std::vector coor(4); + for(coor[3]=0;coor[3] diff; @@ -305,7 +361,8 @@ int main (int argc, char ** argv) double nn=Ttr._internal._internal; if ( nn > 0 ) cout<<"Shift real trace fail "< >::scalar_type == ComplexD. 
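// A quick illustration of the recursion this header is after (a sketch only, not
// part of the patch; assumes C++11 static_assert and <type_traits> are available):
//
//   typedef iScalar<iScalar<vComplexD> > nested;
//   static_assert(std::is_same<GridTypeMapper<nested>::scalar_type, ComplexD>::value,
//                 "scalar_type recurses down to the fundamental ComplexD");
//   static_assert(std::is_same<GridTypeMapper<nested>::vector_type, vComplexD>::value,
//                 "vector_type recurses down to the SIMD type vComplexD");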
+////////////////////////////////////////////////////////////////////////////////// + + template class GridTypeMapper { + public: + typedef typename T::scalar_type scalar_type; + typedef typename T::vector_type vector_type; + typedef typename T::tensor_reduced tensor_reduced; + }; + +////////////////////////////////////////////////////////////////////////////////// +// Recursion stops with these template specialisations +////////////////////////////////////////////////////////////////////////////////// + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef RealF vector_type; + typedef RealF tensor_reduced ; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef RealD vector_type; + typedef RealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef ComplexF vector_type; + typedef ComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef ComplexD vector_type; + typedef ComplexD tensor_reduced; + }; + + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef vRealF vector_type; + typedef vRealF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef vRealD vector_type; + typedef vRealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef vComplexF vector_type; + typedef vComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef vComplexD vector_type; + typedef vComplexD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef Integer scalar_type; + typedef vInteger vector_type; + typedef vInteger tensor_reduced; + }; + + // Again terminate the recursion. + inline vRealD TensorRemove(vRealD arg){ return arg;} + inline vRealF TensorRemove(vRealF arg){ return arg;} + inline vComplexF TensorRemove(vComplexF arg){ return arg;} + inline vComplexD TensorRemove(vComplexD arg){ return arg;} + inline vInteger TensorRemove(vInteger arg){ return arg;} + +} + +#endif diff --git a/Grid_math_types.h b/Grid_math_types.h index 25524966..a2a3f97c 100644 --- a/Grid_math_types.h +++ b/Grid_math_types.h @@ -1,63 +1,11 @@ #ifndef GRID_MATH_TYPES_H #define GRID_MATH_TYPES_H + +#include + namespace Grid { - -////////////////////////////////////////////////////////////////////////////////// -// Want to recurse: GridTypeMapper >::scalar_type == ComplexD. 
-////////////////////////////////////////////////////////////////////////////////// - - template class GridTypeMapper { - public: - typedef typename T::scalar_type scalar_type; - typedef typename T::vector_type vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef RealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef RealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef ComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef ComplexD vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef vRealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef vRealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef vComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef vComplexD vector_type; - }; - - - /////////////////////////////////////////////////// // Scalar, Vector, Matrix objects. // These can be composed to form tensor products of internal indices. @@ -70,9 +18,16 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iScalar(){}; + + iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type + iScalar(Zero &z){ *this = zero; }; + iScalar & operator= (const Zero &hero){ zeroit(*this); return *this; @@ -80,22 +35,27 @@ public: friend void zeroit(iScalar &that){ zeroit(that._internal); } - friend void permute(iScalar &out,iScalar &in,int permutetype){ + friend void permute(iScalar &out,const iScalar &in,int permutetype){ permute(out._internal,in._internal,permutetype); } - friend void extract(iScalar &in,std::vector &out){ + friend void extract(const iScalar &in,std::vector &out){ extract(in._internal,out); // extract advances the pointers in out } friend void merge(iScalar &in,std::vector &out){ merge(in._internal,out); // extract advances the pointers in out } + friend inline iScalar::vector_type TensorRemove(iScalar arg) + { + return TensorRemove(arg._internal); + } + // Unary negation friend inline iScalar operator -(const iScalar &r) { iScalar ret; ret._internal= -r._internal; return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour inline iScalar &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -117,6 +77,9 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iVector(Zero &z){ *this = zero; }; iVector() {}; @@ -129,12 +92,12 @@ public: zeroit(that._internal[i]); } } - friend void permute(iVector &out,iVector &in,int permutetype){ + friend void permute(iVector &out,const iVector &in,int permutetype){ for(int i=0;i &in,std::vector &out){ + friend void extract(const iVector &in,std::vector &out){ for(int i=0;i &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -163,10 +126,8 @@ public: *this = (*this)+r; return *this; } - }; - template class iMatrix { public: @@ -174,62 +135,64 @@ public: 
typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; - iMatrix(Zero &z){ *this = zero; }; - iMatrix() {}; - iMatrix & operator= (Zero &hero){ - zeroit(*this); - return *this; - } - friend void zeroit(iMatrix &that){ - for(int i=0;i &out,iMatrix &in,int permutetype){ - for(int i=0;i & operator= (Zero &hero){ + zeroit(*this); + return *this; + } + friend void zeroit(iMatrix &that){ + for(int i=0;i &out,const iMatrix &in,int permutetype){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i operator -(const iMatrix &r) { - iMatrix ret; - for(int i=0;i - inline iMatrix &operator *=(const T &r) { - *this = (*this)*r; - return *this; - } - template - inline iMatrix &operator -=(const T &r) { - *this = (*this)-r; - return *this; - } - template - inline iMatrix &operator +=(const T &r) { - *this = (*this)+r; - return *this; - } + }} + } + // Unary negation + friend inline iMatrix operator -(const iMatrix &r) { + iMatrix ret; + for(int i=0;i + inline iMatrix &operator *=(const T &r) { + *this = (*this)*r; + return *this; + } + template + inline iMatrix &operator -=(const T &r) { + *this = (*this)-r; + return *this; + } + template + inline iMatrix &operator +=(const T &r) { + *this = (*this)+r; + return *this; + } }; @@ -642,7 +605,8 @@ iVector operator * (const iVector& lhs,const iScalar& r // mat x vec = vec // vec x scal = vec // scal x vec = vec - + // + // We can special case scalar_type ?? template inline auto operator * (const iScalar& lhs,const iScalar& rhs) -> iScalar { @@ -715,6 +679,229 @@ auto operator * (const iVector& lhs,const iScalar& rhs) -> iVector inline iScalar operator * (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iScalar operator * (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,const typename iScalar::scalar_type rhs) +{ + typename iVector::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iVector operator * (const typename iScalar::scalar_type lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,const typename iScalar::scalar_type &rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iMatrix operator * (const typename iScalar::scalar_type & lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (double lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (double lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,double rhs) +{ + typename 
iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (double lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (Integer lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (Integer lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (Integer lhs,const iMatrix& rhs) { return rhs*lhs; } + + + +/////////////////////////////////////////////////////////////////////////////////////////////// +// addition by fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iScalar operator + (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iMatrix operator + (const typename iScalar::scalar_type lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (double lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (double lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (Integer lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (Integer lhs,const iMatrix& rhs) { return rhs+lhs; } + + 
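// Usage sketch for the fundamental-scalar overloads above (mirrors the tests added
// in Grid_main.cc; illustrative only, not part of this header):
//
//   SpinColourMatrix scm;
//   Complex cplx(1.0);
//   Integer myint    = 1;
//   double  mydouble = 1.0;
//
//   scm = scm * cplx;       // scalar_type is wrapped into tensor_reduced, then tensor * tensor
//   scm = myint * scm;      // Integer is cast to scalar_type through its constructor
//   scm = scm * mydouble;   // double follows the same path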
+/////////////////////////////////////////////////////////////////////////////////////////////// +// subtraction of fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iScalar operator - (const typename iScalar::scalar_type lhs,const iScalar& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iMatrix operator - (const typename iScalar::scalar_type lhs,const iMatrix& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (double lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (double lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (Integer lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} +template inline iMatrix operator - (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (Integer lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + + + + /////////////////////////////////////////////////////////////////////////////////////// // localInnerProduct Scalar x Scalar -> Scalar // localInnerProduct Vector x Vector -> Scalar @@ -907,6 +1094,7 @@ inline auto trace(const iScalar &arg) -> iScalar +inline void where(Lattice &ret,const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + conformable(iftrue,ret); + + GridBase *grid=iftrue._grid; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const int Nsimd = grid->Nsimd(); + const int words = sizeof(vobj)/sizeof(vector_type); + + std::vector mask(Nsimd); + std::vector > truevals (Nsimd,std::vector(words) ); + std::vector 
> falsevals(Nsimd,std::vector(words) ); + std::vector pointers(Nsimd); + +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + + for(int s=0;s +inline Lattice where(const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + + Lattice ret(iftrue._grid); + + where(ret,predicate,iftrue,iffalse); + + return ret; +} + +#endif diff --git a/Grid_simd.h b/Grid_simd.h index 53cc8909..1229d328 100644 --- a/Grid_simd.h +++ b/Grid_simd.h @@ -11,28 +11,15 @@ // Vector types are arch dependent //////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////// - // SIMD Alignment controls - //////////////////////////////////////////////////////////// -#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED -#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A))) -#else -#define ALIGN_DIRECTIVE(A) __declspec(align(A)) -#endif #ifdef SSE2 #include -#define SIMDalign ALIGN_DIRECTIVE(16) #endif - #if defined(AVX1) || defined (AVX2) #include -#define SIMDalign ALIGN_DIRECTIVE(32) #endif - #ifdef AVX512 #include -#define SIMDalign ALIGN_DIRECTIVE(64) #endif namespace Grid { @@ -137,41 +124,66 @@ namespace Grid { // Generic extract/merge/permute ///////////////////////////////////////////////////////////////// template -inline void Gextract(vsimd &y,std::vector &extracted){ -#if 1 +inline void Gextract(const vsimd &y,std::vector &extracted){ // FIXME: bounce off stack is painful // temporary hack while I figure out better way. // There are intrinsics to do this work without the storage. - int Nsimd = extracted.size(); - { - std::vector > buf(Nsimd); - vstore(y,&buf[0]); - for(int i=0;i > buf(Nsimd); + vstore(y,&buf[0]); + for(int i=0;i inline void Gmerge(vsimd &y,std::vector &extracted){ -#if 1 - int Nsimd = extracted.size(); + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + std::vector buf(Nsimd); - for(int i=0;i +inline void Gextract(const vsimd &y,std::vector &extracted){ + // FIXME: bounce off stack is painful + // temporary hack while I figure out better way. + // There are intrinsics to do this work without the storage. + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector > buf(Nsimd); + + vstore(y,&buf[0]); + + for(int i=0;i +inline void Gmerge(vsimd &y,std::vector &extracted){ + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector buf(Nsimd); + for(int i=0;i &extracted){ // Permute 4 possible on half precision @512bit vectors. ////////////////////////////////////////////////////////// template -inline void Gpermute(vsimd &y,vsimd b,int perm){ +inline void Gpermute(vsimd &y,const vsimd &b,int perm){ switch (perm){ #if defined(AVX1)||defined(AVX2) // 8x32 bits=>3 permutes @@ -214,10 +226,10 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){ }; }; +#include #include #include #include #include -#include #endif diff --git a/Grid_stencil.h b/Grid_stencil.h index d93204fc..4d73f436 100644 --- a/Grid_stencil.h +++ b/Grid_stencil.h @@ -8,5 +8,355 @@ // Lattice could also allocate haloes which get used for stencil code. // // Grid could create a neighbour index table for a given stencil. -// Could also implement CovariantCshift. +// +// Could also implement CovariantCshift, to fuse the loops and enhance performance. 
+// +// +// General stencil computation: +// +// Generic services +// 0) Prebuild neighbour tables +// 1) Compute sizes of all haloes/comms buffers; allocate them. +// +// 2) Gather all faces, and communicate. +// 3) Loop over result sites, giving nbr index/offnode info for each +// +// Could take a +// SpinProjectFaces +// start comms +// complete comms +// Reconstruct Umu +// +// Approach. +// ////////////////////////////////////////////////////////////////////////////////////////// + +namespace Grid { + + class Stencil { + public: + + Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances); + + void Stencil_local (int dimension,int shift,int cbmask); + void Stencil_comms (int dimension,int shift,int cbmask); + void Stencil_comms_simd(int dimension,int shift,int cbmask); + // Will need to implement actions for + // + Copy_plane; + Copy_plane_permute; + Gather_plane; + + + + // The offsets to all neibours in stencil in each direction + int _checkerboard; + int _npoints; // Move to template param? + GridBase * _grid; + + // Store these as SIMD Integer needed + // + // std::vector< iVector > _offsets; + // std::vector< iVector > _local; + // std::vector< iVector > _comm_buf_size; + // std::vector< iVector > _permute; + + std::vector > _offsets; + std::vector > _local; + std::vector _comm_buf_size; + std::vector _permute; + + }; + + Stencil::Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances){ + + _npoints = npoints; + _grid = grid; + + for(int i=0;i_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int sshift[2]; + + if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_local(dimension,shift,0x3); + } else { + Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_local(dimension,shift,0x2);// both with block stride loop iteration + } + } else if ( splice_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_comms_simd(dimension,shift,0x3); + } else { + Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration + } + } else { + // Cshift_comms(ret,rhs,dimension,shift); + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + if ( sshift[0] == sshift[1] ) { + Stencil_comms(dimension,shift,0x3); + } else { + Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration + } + } + } + } + + + void Stencil::Stencil_local (int dimension,int shift,int cbmask) + { + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. 
+ shift = (shift+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + int permute_type=_grid->PermuteType(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? 1 : 0; + + int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int permute_slice=0; + if(permute_dim){ + int wrap = sshift/rd; + int num = sshift%rd; + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + } + + if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type); + else Copy_plane (dimension,x,sx,cbmask); + + } + } + + void Stencil::Stencil_comms (int dimension,int shift,int cbmask) + { + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=_grid; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + // FIXME: Do something with buffer_size?? + + int cb= (cbmask==0x2)? 1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + for(int x=0;x= rd ); + int sx = (x+sshift)%rd; + int comm_proc = (x+sshift)/rd; + + if (!offnode) { + + Copy_plane(dimension,x,sx,cbmask); + + } else { + + int words = send_buf.size(); + if (cbmask != 0x3) words=words>>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (dimension,sx,cbmask); + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + /* + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + */ + Scatter_plane_simple (dimension,x,cbmask); + } + } + } + + void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask) + { + GridBase *grid=_grid; + const int Nsimd = _grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // FIXME do something with buffer size + + std::vector pointers(Nsimd); // + std::vector rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? 
1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + std::vector comm_offnode(simd_layout); + std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension + std::vector icoor(grid->Nd()); + + for(int x=0;x= ld; + comm_any = comm_any | comm_offnode[s]; + comm_proc[s] = shifted_x/ld; + } + + int o = 0; + int bo = x*grid->_ostride[dimension]; + int sx = (x+sshift)%rd; + + if ( comm_any ) { + + for(int i=0;iiCoorFromIindex(icoor,i); + s = icoor[dimension]; + + if(comm_offnode[s]){ + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); + + /* + grid->SendToRecvFrom((void *)&send_buf_extract[i][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + */ + + rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; + + } else { + + rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; + + } + + } + + // Permute by swizzling pointers in merge + int permute_slice=0; + int lshift=sshift%ld; + int wrap =lshift/rd; + int num =lshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + int toggle_bit = (Nsimd>>(permute_type+1)); + int PermuteMap; + for(int i=0;i(y,extracted); } - friend inline void extract(vComplexD &y,std::vector &extracted) + friend inline void extract(const vComplexD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -184,6 +198,11 @@ namespace Grid { /////////////////////// // Splat /////////////////////// + friend inline void vsplat(vComplexD &ret,ComplexD c){ + float a= real(c); + float b= imag(c); + vsplat(ret,a,b); + } friend inline void vsplat(vComplexD &ret,double rl,double ig){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(ig,rl,ig,rl); @@ -215,7 +234,7 @@ namespace Grid { #endif } -friend inline void vstore(vComplexD &ret, ComplexD *a){ +friend inline void vstore(const vComplexD &ret, ComplexD *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd((double *)a,ret.v); #endif diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index b7fb3d6a..b6b7ebe9 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -20,6 +20,12 @@ namespace Grid { return (*this); } vComplexF(){}; + vComplexF(ComplexF a){ + vsplat(*this,a); + }; + vComplexF(double a){ + vsplat(*this,ComplexF(a)); + }; /////////////////////////////////////////////// // mac, mult, sub, add, adj @@ -161,7 +167,7 @@ namespace Grid { vsplat(ret,a,b); } -friend inline void vstore(vComplexF &ret, ComplexF *a){ +friend inline void vstore(const vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps((float *)a,ret.v); #endif @@ -210,27 +216,47 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ #endif } - friend inline vComplexF operator * (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va*b; } friend inline vComplexF operator * (vComplexF b,const Complex &a){ + return a*b; + } + + /* + template + friend inline vComplexF operator * (vComplexF b,const real &a){ vComplexF va; - vsplat(va,a); + Complex ca(a,0); + vsplat(va,ca); return va*b; } + template + friend inline vComplexF operator * (const real &a,vComplexF b){ + return a*b; + } + friend inline vComplexF operator + (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va+b; } friend inline vComplexF 
operator + (vComplexF b,const Complex &a){ - vComplexF va; - vsplat(va,a); - return b+va; + return a+b; } + template + friend inline vComplexF operator + (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va+b; + } + template + friend inline vComplexF operator + (const real &a,vComplexF b){ + return a+b; + } friend inline vComplexF operator - (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); @@ -241,7 +267,24 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ vsplat(va,a); return b-va; } - // NB: Template the following on "type Complex" and then implement *,+,- for ComplexF, ComplexD, RealF, RealD above to + template + friend inline vComplexF operator - (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return b-va; + } + template + friend inline vComplexF operator - (const real &a,vComplexF b){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va-b; + } + */ + + // NB: Template the following on "type Complex" and then implement *,+,- for + // ComplexF, ComplexD, RealF, RealD above to // get full generality of binops with scalars. friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } @@ -304,7 +347,15 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ { Gmerge(y,extracted); } - friend inline void extract(vComplexF &y,std::vector &extracted) + friend inline void extract(const vComplexF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexF &y,std::vector &extracted) { Gextract(y,extracted); } diff --git a/Grid_vInteger.h b/Grid_vInteger.h index 6ddce191..82adbd8e 100644 --- a/Grid_vInteger.h +++ b/Grid_vInteger.h @@ -10,7 +10,7 @@ namespace Grid { typedef uint32_t Integer; - class vInteger { + class vInteger { protected: public: @@ -21,6 +21,13 @@ namespace Grid { typedef Integer scalar_type; vInteger(){}; + vInteger & operator = (const Zero & z){ + vzero(*this); + return (*this); + } + vInteger(Integer a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -166,18 +173,18 @@ namespace Grid { #endif } -friend inline void vstore(vInteger &ret, Integer *a){ + friend inline void vstore(const vInteger &ret, Integer *a){ #if defined (AVX1)|| defined (AVX2) - _mm256_store_si256((__m256i*)a,ret.v); + _mm256_store_si256((__m256i*)a,ret.v); #endif #ifdef SSE2 - _mm_store_si128(a,ret.v); + _mm_store_si128(a,ret.v); #endif #ifdef AVX512 - _mm512_store_si512(a,ret.v); + _mm512_store_si512(a,ret.v); #endif #ifdef QPX - assert(0); + assert(0); #endif } @@ -185,6 +192,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { _mm_prefetch((const char*)&v.v,_MM_HINT_T0); } + // Unary negation friend inline vInteger operator -(const vInteger &r) { vInteger ret; @@ -210,9 +218,32 @@ friend inline void vstore(vInteger &ret, Integer *a){ *this = *this-r; return *this; } + + friend inline void permute(vInteger &y,const vInteger b,int perm) + { + Gpermute(y,b,perm); + } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); 
+ } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); + } + + public: - static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} - }; + static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);} + }; inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; } @@ -222,27 +253,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { return l*r; } - - - class vIntegerF : public vInteger - { - public: - static inline int Nsimd(void) { return sizeof(ivec)/sizeof(float);} - - friend inline void permute(vIntegerF &y,vIntegerF b,int perm) - { - Gpermute(y,b,perm); - } - friend inline void merge(vIntegerF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(vIntegerF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - }; - + } #endif diff --git a/Grid_vRealD.h b/Grid_vRealD.h index 13ceedbe..a5b59be3 100644 --- a/Grid_vRealD.h +++ b/Grid_vRealD.h @@ -13,6 +13,9 @@ namespace Grid { typedef RealD scalar_type; vRealD(){}; + vRealD(RealD a){ + vsplat(*this,a); + }; friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} @@ -112,7 +115,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealD &y,std::vector &extracted) + friend inline void extract(const vRealD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -157,7 +168,7 @@ namespace Grid { #endif } - friend inline void vstore(vRealD &ret, double *a){ + friend inline void vstore(const vRealD &ret, double *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd(a,ret.v); #endif diff --git a/Grid_vRealF.h b/Grid_vRealF.h index 185f4da8..0fe68f43 100644 --- a/Grid_vRealF.h +++ b/Grid_vRealF.h @@ -14,6 +14,9 @@ namespace Grid { typedef RealF scalar_type; vRealF(){}; + vRealF(RealF a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -133,7 +136,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealF &y,std::vector &extracted) + friend inline void extract(const vRealF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealF &y,std::vector &extracted) { Gextract(y,extracted); } @@ -180,7 +191,7 @@ namespace Grid { //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch //////////////////////////////////////////////////////////////////////// -friend inline void vstore(vRealF &ret, float *a){ +friend inline void vstore(const vRealF &ret, float *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps(a,ret.v); #endif diff --git a/TODO b/TODO index d2373ba1..fb7d177a 100644 --- a/TODO +++ b/TODO @@ -1,29 +1,68 @@ + * FIXME audit * Remove vload/store etc.. * Replace vset with a call to merge. * Replace vset with a call to merge. 
+* Const audit +* extract / merge extra implementation removal -* Conditional execution Subset, where etc... -* Coordinate information, integers etc... -* Integer type padding/union to vector. -* LatticeCoordinate[mu] +* Conditional execution, where etc... -----DONE, simple test +* Integer relational support -----DONE +* Coordinate information, integers etc... -----DONE +* Integer type padding/union to vector. -----DONE +* LatticeCoordinate[mu] -----DONE -* Optimise the extract/merge SIMD routines -* Broadcast, reduction tests. +* Stencil operator support -----Initial thoughts + +* Subset support, slice sums etc... -----Only need slice sum? + -----Generic cartesian subslicing? + -----Array ranges / boost extents? + -----Multigrid grid transferral. + -----Suggests generalised cartesian subblocking + sums, returning modified grid. + + Two classes of subset; +i) red black parit subsetting. + (pick checkerboard). + +ii) Need to be able to project one Grid to another Grid. + Generic concept is to subdivide (based on RD so applies to red/black or full). + Return a type on SUB-grid from CellSum TOP-grid + SUB-grid need not distribute but be replicated in some dims if that is how the + cartesian communicator works. + +iii) No general permutation map. + + +* Consider switch std::vector to boost arrays. + boost::multi_array A()... to replace multi1d, multi2d etc.. + +*? Cell definition <-> sliceSum. + ? Replicated arrays. + + + +* Check for missing functionality - partially audited against QDP++ layout + +* Optimise the extract/merge SIMD routines; Azusa?? + + -- I have collated into single location at least. + + -- Need to use _mm_*insert/extract routines. + +* Conformable test in Cshift routines. + +* Gamma/Dirac structures + +* Fourspin, two spin project + +* Broadcast, reduction tests. innerProduct, localInnerProduct * QDP++ regression suite and comparative benchmark * NERSC Lattice loading, plaquette test -* Conformable test in Cshift routines. - -* Gamma/Dirac structures -* Fourspin, two spin project - -* Stencil operator support - -* Check for missing functionality * I/O support - MPI IO?
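The three items marked DONE above (integer relational support, LatticeCoordinate, conditional where) compose as in the test added to Grid_main.cc. A minimal sketch, assuming a GridCartesian named Fine as constructed in that test:

    LatticeColourMatrix Foo(&Fine), Bar(&Fine);
    LatticeInteger      coor(&Fine);

    LatticeCoordinate(coor, 3);          // fill each site with its global coordinate in direction 3
    Bar = zero;
    Bar = where(coor < 10, Foo, Bar);    // Foo is copied only on sites whose coordinate is below 10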