diff --git a/Grid_Communicator.h b/Grid_Communicator.h index c7e71f12..0e15765c 100644 --- a/Grid_Communicator.h +++ b/Grid_Communicator.h @@ -42,13 +42,31 @@ class CartesianCommunicator { //////////////////////////////////////////////////////////// // Reduction //////////////////////////////////////////////////////////// - void GlobalSum(float &); - void GlobalSumVector(float *,int N); + void GlobalSum(RealF &); + void GlobalSumVector(RealF *,int N); - void GlobalSum(double &); - void GlobalSumVector(double *,int N); + void GlobalSum(RealD &); + void GlobalSumVector(RealD *,int N); + + void GlobalSum(ComplexF &c) + { + GlobalSumVector((float *)&c,2); + } + void GlobalSumVector(ComplexF *c,int N) + { + GlobalSumVector((float *)c,2*N); + } + + void GlobalSum(ComplexD &c) + { + GlobalSumVector((double *)&c,2); + } + void GlobalSumVector(ComplexD *c,int N) + { + GlobalSumVector((double *)c,2*N); + } - template void GlobalSumObj(obj &o){ + template void GlobalSum(obj &o){ typedef typename obj::scalar_type scalar_type; int words = sizeof(obj)/sizeof(scalar_type); diff --git a/Grid_Lattice.h b/Grid_Lattice.h index 1e2c820e..a94a13a4 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -407,18 +407,83 @@ public: return ret; } + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Reduction operations + //////////////////////////////////////////////////////////////////////////////////////////////////// template inline RealD norm2(const Lattice &arg){ typedef typename vobj::scalar_type scalar; - decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero; + decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero; scalar nrm; + //FIXME make this loop parallelisable for(int ss=0;ssoSites(); ss++){ - vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]); + vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]); } nrm = Reduce(TensorRemove(vnrm)); + arg._grid->GlobalSum(nrm); return real(nrm); } + + template + inline auto innerProduct(const Lattice &left,const Lattice &right) ->decltype(innerProduct(left._odata[0],right._odata[0])) + { + typedef typename vobj::scalar_type scalar; + decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; + + scalar nrm; + //FIXME make this loop parallelisable + for(int ss=0;ssoSites(); ss++){ + vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]); + } + nrm = Reduce(TensorRemove(vnrm)); + right._grid->GlobalSum(nrm); + return nrm; + } + + ///////////////////////////////////////////////////// + // Non site reduced routines + ///////////////////////////////////////////////////// + + // localNorm2, + template + inline auto localNorm2 (const Lattice &rhs)-> Lattice + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=innerProduct(rhs,rhs); + } + return ret; + } + + // localInnerProduct + template + inline auto localInnerProduct (const Lattice &lhs,const Lattice &rhs) + -> Lattice + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); + } + return ret; + } + + // outerProduct Scalar x Scalar -> Scalar + // Vector x Vector -> Matrix + template + inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); + } + return ret; + } + + } #endif diff --git a/Grid_QCD.h b/Grid_QCD.h index 1d52f911..72a8ee67 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -57,41 +57,6 @@ namespace QCD { typedef Lattice LatticeSpinVector; typedef Lattice LatticeColourVector; - // localNorm2, - template - inline LatticeComplex localNorm2 (const Lattice &rhs) - { - LatticeComplex ret(rhs._grid); -#pragma omp parallel for - for(int ss=0;ssoSites(); ss++){ - ret._odata[ss]=trace(adj(rhs)*rhs); - } - return ret; - } - // localInnerProduct - template - inline LatticeComplex localInnerProduct (const Lattice &lhs,const Lattice &rhs) - { - LatticeComplex ret(rhs._grid); -#pragma omp parallel for - for(int ss=0;ssoSites(); ss++){ - ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]); - } - return ret; - } - - // outerProduct Scalar x Scalar -> Scalar - // Vector x Vector -> Matrix - template - inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice - { - Lattice ret(rhs._grid); -#pragma omp parallel for - for(int ss=0;ssoSites(); ss++){ - ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); - } - return ret; - } // FIXME for debug; deprecate this inline void LatticeCoordinate(LatticeInteger &l,int mu){ diff --git a/Grid_communicator_mpi.cc b/Grid_communicator_mpi.cc index a4943478..dba5f039 100644 --- a/Grid_communicator_mpi.cc +++ b/Grid_communicator_mpi.cc @@ -29,21 +29,20 @@ CartesianCommunicator::CartesianCommunicator(std::vector &processors) } void CartesianCommunicator::GlobalSum(float &f){ - MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator); + MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); } void CartesianCommunicator::GlobalSumVector(float *f,int N) { - MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator); + MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); } void CartesianCommunicator::GlobalSum(double &d) { - MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator); + MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); } void CartesianCommunicator::GlobalSumVector(double *d,int N) { - MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator); + MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); } - void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) { MPI_Cart_shift(communicator,dim,shift,&source,&dest); diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h index e962f6ea..2910151c 100644 --- a/Grid_cshift_common.h +++ b/Grid_cshift_common.h @@ -8,7 +8,6 @@ friend void Gather_plane_simple (Lattice &rhs,std::vector_rdimensions[dimension]; - printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout); if ( !rhs._grid->CheckerBoarded(dimension) ) { int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane @@ -19,7 +18,6 @@ friend void Gather_plane_simple (Lattice &rhs,std::vector_slice_nblock[dimension];n++){ for(int b=0;b_slice_block[dimension];b++){ - printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout); buffer[bo++]=rhs._odata[so+o+b]; } o +=rhs._grid->_slice_stride[dimension]; diff --git a/Grid_math_types.h b/Grid_math_types.h index e3432645..01fc7bf2 100644 --- a/Grid_math_types.h +++ b/Grid_math_types.h @@ -903,38 +903,38 @@ template inline iMatrix operator - (Integer lhs,const iMatri /////////////////////////////////////////////////////////////////////////////////////// - // localInnerProduct Scalar x Scalar -> Scalar - // localInnerProduct Vector x Vector -> Scalar - // localInnerProduct Matrix x Matrix -> Scalar + // innerProduct Scalar x Scalar -> Scalar + // innerProduct Vector x Vector -> Scalar + // innerProduct Matrix x Matrix -> Scalar /////////////////////////////////////////////////////////////////////////////////////// template inline - auto localInnerProduct (const iVector& lhs,const iVector& rhs) -> iScalar + auto innerProduct (const iVector& lhs,const iVector& rhs) -> iScalar { - typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t; + typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; iScalar ret=zero; for(int c1=0;c1 inline - auto localInnerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar + auto innerProduct (const iMatrix& lhs,const iMatrix& rhs) -> iScalar { - typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; iScalar ret=zero; iScalar tmp; for(int c1=0;c1 inline - auto localInnerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar + auto innerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar { - typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t; + typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; iScalar ret; - ret._internal = localInnerProduct(lhs._internal,rhs._internal); + ret._internal = innerProduct(lhs._internal,rhs._internal); return ret; } diff --git a/Grid_simd.h b/Grid_simd.h index 23061632..4f9f4b0d 100644 --- a/Grid_simd.h +++ b/Grid_simd.h @@ -33,10 +33,10 @@ namespace Grid { inline RealF adj(const RealF & r){ return r; } inline RealF conj(const RealF & r){ return r; } - inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } - inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } - inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; } - inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; } + inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } + inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } + inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } + inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; } //////////////////////////////////////////////////////////////////////////////// //Provide support functions for basic real and complex data types required by Grid diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h index f1c335e8..133e354b 100644 --- a/Grid_vComplexD.h +++ b/Grid_vComplexD.h @@ -313,7 +313,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ }; - inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } + inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } typedef vComplexD vDComplex; diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index 18cf1749..9dc5f051 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -362,7 +362,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ }; - inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) + inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; } diff --git a/Grid_vRealD.h b/Grid_vRealD.h index c51e41cf..84c80698 100644 --- a/Grid_vRealD.h +++ b/Grid_vRealD.h @@ -240,7 +240,7 @@ namespace Grid { static int Nsimd(void) { return sizeof(dvec)/sizeof(double);} }; - inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; } + inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; } inline void zeroit(vRealD &z){ vzero(z);} inline vRealD outerProduct(const vRealD &l, const vRealD& r) diff --git a/Grid_vRealF.h b/Grid_vRealF.h index 0fe68f43..aeb454f0 100644 --- a/Grid_vRealF.h +++ b/Grid_vRealF.h @@ -260,7 +260,7 @@ friend inline void vstore(const vRealF &ret, float *a){ public: static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} }; - inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; } + inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; } inline void zeroit(vRealF &z){ vzero(z);} inline vRealF outerProduct(const vRealF &l, const vRealF& r) diff --git a/TODO b/TODO index 2c8fdcf8..010a06b7 100644 --- a/TODO +++ b/TODO @@ -1,15 +1,14 @@ +AUDITS: * FIXME audit - +* const audit * Replace vset with a call to merge.; - * care in Gmerge,Gextract over vset . - -* Const audit - * extract / merge extra implementation removal +FUNCTIONALITY: + * Conditional execution, where etc... -----DONE, simple test * Integer relational support -----DONE @@ -22,6 +21,11 @@ * Stencil operator support -----Initial thoughts, trial implementation DONE. -----some simple tests that Stencil matches Cshift. + -----do all permute in comms phase, so that copy permute + -----cases move into a buffer. + +* CovariantShift support -----Use a class to store gauge field? (parallel transport?) + * Subset support, slice sums etc... -----Only need slice sum? -----Generic cartesian subslicing?