mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-24 17:54:47 +01:00 
			
		
		
		
	Reduce now going through MPI.
This commit is contained in:
		| @@ -42,13 +42,31 @@ class CartesianCommunicator { | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     // Reduction | ||||
|     //////////////////////////////////////////////////////////// | ||||
|     void GlobalSum(float &); | ||||
|     void GlobalSumVector(float *,int N); | ||||
|     void GlobalSum(RealF &); | ||||
|     void GlobalSumVector(RealF *,int N); | ||||
|  | ||||
|     void GlobalSum(double &); | ||||
|     void GlobalSumVector(double *,int N); | ||||
|     void GlobalSum(RealD &); | ||||
|     void GlobalSumVector(RealD *,int N); | ||||
|  | ||||
|     template<class obj> void GlobalSumObj(obj &o){ | ||||
|     void GlobalSum(ComplexF &c) | ||||
|     { | ||||
|       GlobalSumVector((float *)&c,2); | ||||
|     } | ||||
|     void GlobalSumVector(ComplexF *c,int N) | ||||
|     { | ||||
|       GlobalSumVector((float *)c,2*N); | ||||
|     } | ||||
|  | ||||
|     void GlobalSum(ComplexD &c) | ||||
|     { | ||||
|       GlobalSumVector((double *)&c,2); | ||||
|     } | ||||
|     void GlobalSumVector(ComplexD *c,int N) | ||||
|     { | ||||
|       GlobalSumVector((double *)c,2*N); | ||||
|     } | ||||
|      | ||||
|     template<class obj> void GlobalSum(obj &o){ | ||||
|  | ||||
|       typedef typename obj::scalar_type scalar_type; | ||||
|       int words = sizeof(obj)/sizeof(scalar_type); | ||||
|   | ||||
| @@ -407,18 +407,83 @@ public: | ||||
|         return ret; | ||||
|     } | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Reduction operations | ||||
|     //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|     template<class vobj> | ||||
|     inline RealD norm2(const Lattice<vobj> &arg){ | ||||
|  | ||||
|       typedef typename vobj::scalar_type scalar; | ||||
|       decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero; | ||||
|       decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero; | ||||
|  | ||||
|       scalar nrm; | ||||
|       //FIXME make this loop parallelisable | ||||
|       for(int ss=0;ss<arg._grid->oSites(); ss++){ | ||||
| 	vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]); | ||||
| 	vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]); | ||||
|       } | ||||
|       nrm = Reduce(TensorRemove(vnrm)); | ||||
|       arg._grid->GlobalSum(nrm); | ||||
|       return real(nrm); | ||||
|     } | ||||
|  | ||||
|     template<class vobj> | ||||
|     inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0])) | ||||
|     { | ||||
|       typedef typename vobj::scalar_type scalar; | ||||
|       decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; | ||||
|  | ||||
|       scalar nrm; | ||||
|       //FIXME make this loop parallelisable | ||||
|       for(int ss=0;ss<left._grid->oSites(); ss++){ | ||||
| 	vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]); | ||||
|       } | ||||
|       nrm = Reduce(TensorRemove(vnrm)); | ||||
|       right._grid->GlobalSum(nrm); | ||||
|       return nrm; | ||||
|     } | ||||
|  | ||||
|     ///////////////////////////////////////////////////// | ||||
|     // Non site reduced routines | ||||
|     ///////////////////////////////////////////////////// | ||||
|  | ||||
|     // localNorm2, | ||||
|     template<class vobj> | ||||
|     inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> | ||||
|     { | ||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||
| #pragma omp parallel for | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	  ret._odata[ss]=innerProduct(rhs,rhs); | ||||
|         } | ||||
|         return ret; | ||||
|     } | ||||
|  | ||||
|     // localInnerProduct | ||||
|     template<class vobj> | ||||
|     inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) | ||||
|       -> Lattice<typename vobj::tensor_reduced> | ||||
|     { | ||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||
| #pragma omp parallel for | ||||
|       for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
| 	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|       } | ||||
|       return ret; | ||||
|     } | ||||
|      | ||||
|     // outerProduct Scalar x Scalar -> Scalar | ||||
|     //              Vector x Vector -> Matrix | ||||
|     template<class ll,class rr> | ||||
|     inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> | ||||
|     { | ||||
|         Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); | ||||
| #pragma omp parallel for | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|             ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|      } | ||||
|  | ||||
|  | ||||
| } | ||||
| #endif | ||||
|   | ||||
							
								
								
									
										35
									
								
								Grid_QCD.h
									
									
									
									
									
								
							
							
						
						
									
										35
									
								
								Grid_QCD.h
									
									
									
									
									
								
							| @@ -57,41 +57,6 @@ namespace QCD { | ||||
|     typedef Lattice<vSpinVector>       LatticeSpinVector; | ||||
|     typedef Lattice<vColourVector>     LatticeColourVector; | ||||
|  | ||||
|     // localNorm2, | ||||
|     template<class tt> | ||||
|     inline LatticeComplex localNorm2 (const Lattice<tt> &rhs) | ||||
|     { | ||||
|         LatticeComplex ret(rhs._grid); | ||||
| #pragma omp parallel for | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|             ret._odata[ss]=trace(adj(rhs)*rhs); | ||||
|         } | ||||
|         return ret; | ||||
|     } | ||||
|     // localInnerProduct | ||||
|     template<class tt> | ||||
|     inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs) | ||||
|     { | ||||
|         LatticeComplex ret(rhs._grid); | ||||
| #pragma omp parallel for | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|             ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|     } | ||||
|      | ||||
|     // outerProduct Scalar x Scalar -> Scalar | ||||
|     //              Vector x Vector -> Matrix | ||||
|     template<class ll,class rr> | ||||
|     inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> | ||||
|     { | ||||
|         Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); | ||||
| #pragma omp parallel for | ||||
|         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||
|             ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||
|         } | ||||
|         return ret; | ||||
|      } | ||||
|  | ||||
|     // FIXME for debug; deprecate this | ||||
|    inline void LatticeCoordinate(LatticeInteger &l,int mu){ | ||||
|   | ||||
| @@ -29,21 +29,20 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors) | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::GlobalSum(float &f){ | ||||
|   MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||
|   MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||
| } | ||||
| void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||
| { | ||||
|   MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||
|   MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||
| } | ||||
| void CartesianCommunicator::GlobalSum(double &d) | ||||
| { | ||||
|   MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||
|   MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||
| } | ||||
| void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||
| { | ||||
|   MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||
|   MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||
| { | ||||
|   MPI_Cart_shift(communicator,dim,shift,&source,&dest); | ||||
|   | ||||
| @@ -8,7 +8,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo | ||||
| { | ||||
|   int rd = rhs._grid->_rdimensions[dimension]; | ||||
|  | ||||
|   printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout); | ||||
|   if ( !rhs._grid->CheckerBoarded(dimension) ) { | ||||
|  | ||||
|     int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||
| @@ -19,7 +18,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo | ||||
| #pragma omp parallel for collapse(2) | ||||
|     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){ | ||||
|       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){ | ||||
| 	printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout); | ||||
| 	buffer[bo++]=rhs._odata[so+o+b]; | ||||
|       } | ||||
|       o +=rhs._grid->_slice_stride[dimension]; | ||||
|   | ||||
| @@ -903,38 +903,38 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri | ||||
|  | ||||
|  | ||||
|     /////////////////////////////////////////////////////////////////////////////////////// | ||||
|     // localInnerProduct Scalar x Scalar -> Scalar | ||||
|     // localInnerProduct Vector x Vector -> Scalar | ||||
|     // localInnerProduct Matrix x Matrix -> Scalar | ||||
|     // innerProduct Scalar x Scalar -> Scalar | ||||
|     // innerProduct Vector x Vector -> Scalar | ||||
|     // innerProduct Matrix x Matrix -> Scalar | ||||
|     /////////////////////////////////////////////////////////////////////////////////////// | ||||
|     template<class l,class r,int N> inline | ||||
|     auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))> | ||||
|     auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))> | ||||
|     { | ||||
|         typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t; | ||||
|         typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; | ||||
|         iScalar<ret_t> ret=zero; | ||||
|         for(int c1=0;c1<N;c1++){ | ||||
|             ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]); | ||||
|             ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]); | ||||
|         } | ||||
|         return ret; | ||||
|     } | ||||
|     template<class l,class r,int N> inline | ||||
|     auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))> | ||||
|     auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))> | ||||
|     { | ||||
|         typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; | ||||
|         typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t; | ||||
|         iScalar<ret_t> ret=zero; | ||||
|         iScalar<ret_t> tmp; | ||||
|         for(int c1=0;c1<N;c1++){ | ||||
|         for(int c2=0;c2<N;c2++){ | ||||
| 	  ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]); | ||||
| 	  ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]); | ||||
|         }} | ||||
|         return ret; | ||||
|     } | ||||
|     template<class l,class r> inline | ||||
|     auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))> | ||||
|     auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))> | ||||
|     { | ||||
|         typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t; | ||||
|         typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; | ||||
|         iScalar<ret_t> ret; | ||||
|         ret._internal = localInnerProduct(lhs._internal,rhs._internal); | ||||
|         ret._internal = innerProduct(lhs._internal,rhs._internal); | ||||
|         return ret; | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -33,10 +33,10 @@ namespace Grid { | ||||
|  | ||||
|   inline RealF adj(const RealF  & r){ return r; } | ||||
|   inline RealF conj(const RealF  & r){ return r; } | ||||
|   inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } | ||||
|   inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } | ||||
|   inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; } | ||||
|   inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; } | ||||
|   inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; } | ||||
|   inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; } | ||||
|   inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; } | ||||
|   inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; } | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////////// | ||||
|     //Provide support functions for basic real and complex data types required by Grid | ||||
|   | ||||
| @@ -313,7 +313,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ | ||||
|     }; | ||||
|  | ||||
|  | ||||
|     inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } | ||||
|     inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; } | ||||
|  | ||||
|  | ||||
|     typedef  vComplexD vDComplex; | ||||
|   | ||||
| @@ -362,7 +362,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ | ||||
|  | ||||
|     }; | ||||
|  | ||||
|     inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r)  | ||||
|     inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)  | ||||
|     { | ||||
|       return conj(l)*r;  | ||||
|     } | ||||
|   | ||||
| @@ -240,7 +240,7 @@ namespace Grid { | ||||
|         static int Nsimd(void) { return sizeof(dvec)/sizeof(double);} | ||||
|     }; | ||||
|  | ||||
|    inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; } | ||||
|    inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; } | ||||
|     inline void zeroit(vRealD &z){ vzero(z);} | ||||
|  | ||||
|     inline vRealD outerProduct(const vRealD &l, const vRealD& r) | ||||
|   | ||||
| @@ -260,7 +260,7 @@ friend inline void vstore(const vRealF &ret, float *a){ | ||||
|     public: | ||||
|         static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} | ||||
|     }; | ||||
|     inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; } | ||||
|     inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; } | ||||
|     inline void  zeroit(vRealF &z){ vzero(z);} | ||||
|  | ||||
|     inline vRealF outerProduct(const vRealF &l, const vRealF& r) | ||||
|   | ||||
							
								
								
									
										14
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								TODO
									
									
									
									
									
								
							| @@ -1,15 +1,14 @@ | ||||
|  | ||||
| AUDITS: | ||||
| * FIXME audit | ||||
|  | ||||
| * const audit | ||||
| * Replace vset with a call to merge.;  | ||||
|  | ||||
| * care in Gmerge,Gextract over vset . | ||||
|  | ||||
| * Const audit | ||||
|  | ||||
| * extract / merge extra implementation removal       | ||||
|  | ||||
|  | ||||
| FUNCTIONALITY: | ||||
|  | ||||
| * Conditional execution, where etc...                -----DONE, simple test | ||||
|  | ||||
| * Integer relational support                         -----DONE | ||||
| @@ -22,6 +21,11 @@ | ||||
|  | ||||
| * Stencil operator support                           -----Initial thoughts, trial implementation DONE. | ||||
|                                                      -----some simple tests that Stencil matches Cshift. | ||||
|                                                      -----do all permute in comms phase, so that copy permute | ||||
| 						     -----cases move into a buffer. | ||||
|  | ||||
| * CovariantShift support                             -----Use a class to store gauge field? (parallel transport?) | ||||
|  | ||||
|  | ||||
| * Subset support, slice sums etc...                  -----Only need slice sum? | ||||
|                                                      -----Generic cartesian subslicing? | ||||
|   | ||||
		Reference in New Issue
	
	Block a user