mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Reduce now going through MPI.

Peter Boyle 2015-04-14 22:40:40 +01:00
parent f1876b7e95
commit ab9a764bb1
12 changed files with 123 additions and 74 deletions

View File

@@ -42,13 +42,31 @@ class CartesianCommunicator {
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(float &);
void GlobalSumVector(float *,int N);
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(double &);
void GlobalSumVector(double *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
template<class obj> void GlobalSumObj(obj &o){
void GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
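
The new declarations funnel every reduction through the two real GlobalSumVector primitives: a ComplexF is summed as two floats, a ComplexD as two doubles, and the generic template (cut off at the end of this hunk) decomposes an arbitrary object into sizeof(obj)/sizeof(scalar_type) scalar words. A minimal stand-alone sketch of that word-wise pattern, with no-op single-rank stand-ins for the MPI-backed primitives and a hypothetical completion of the truncated template body:

#include <complex>
#include <iostream>

// Single-rank stand-ins for the MPI-backed primitives declared above,
// so this sketch compiles and runs without MPI; they are intentionally no-ops.
void GlobalSumVector(float  *f,int N) { (void)f; (void)N; }
void GlobalSumVector(double *d,int N) { (void)d; (void)N; }

// Complex sums reuse the real vector sums by viewing each complex number
// as two contiguous reals, exactly as the inline bodies in the header do.
void GlobalSum(std::complex<float>  &c) { GlobalSumVector(reinterpret_cast<float  *>(&c),2); }
void GlobalSum(std::complex<double> &c) { GlobalSumVector(reinterpret_cast<double *>(&c),2); }

// Hypothetical completion of the truncated generic template: decompose an
// arbitrary object into words copies of its scalar_type and reduce those.
template<class obj> void GlobalSumGeneric(obj &o){
  typedef typename obj::scalar_type scalar_type;
  int words = sizeof(obj)/sizeof(scalar_type);
  GlobalSumVector(reinterpret_cast<scalar_type *>(&o),words);
}

int main(){
  std::complex<double> z(1.0,2.0);
  GlobalSum(z);                      // unchanged on a single rank
  std::cout << z << std::endl;
  return 0;
}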

View File

@@ -407,18 +407,83 @@ public:
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline RealD norm2(const Lattice<vobj> &arg){
typedef typename vobj::scalar_type scalar;
decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
for(int ss=0;ss<arg._grid->oSites(); ss++){
vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
}
nrm = Reduce(TensorRemove(vnrm));
arg._grid->GlobalSum(nrm);
return real(nrm);
}
template<class vobj>
inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
{
typedef typename vobj::scalar_type scalar;
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
for(int ss=0;ss<left._grid->oSites(); ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
nrm = Reduce(TensorRemove(vnrm));
right._grid->GlobalSum(nrm);
return nrm;
}
/////////////////////////////////////////////////////
// Non site reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
}
#endif
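
Both norm2 and innerProduct above follow the same three-stage reduction: accumulate a vectorised partial sum over the local outer sites, collapse the SIMD lanes with Reduce(TensorRemove(...)), then call GlobalSum so every rank holds the lattice-wide result. A toy, single-process sketch of that pipeline; vComplex, latticeInnerProduct and the no-op GlobalSum here are illustrative stand-ins rather than Grid types:

#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

// Toy 4-lane "SIMD" complex vector standing in for Grid's vComplex types.
struct vComplex { std::complex<double> lane[4]; };

// Lane-wise conj(l)*r, the same contraction the vComplexD overload performs.
vComplex innerProduct(const vComplex &l,const vComplex &r){
  vComplex out;
  for(int i=0;i<4;i++) out.lane[i] = std::conj(l.lane[i])*r.lane[i];
  return out;
}

// Stage 2: collapse the SIMD lanes to one scalar (Reduce(TensorRemove(...))).
std::complex<double> Reduce(const vComplex &v){
  std::complex<double> s = 0;
  for(int i=0;i<4;i++) s += v.lane[i];
  return s;
}

// Stage 3: rank-wide sum; a no-op here, MPI_Allreduce in the real code.
void GlobalSum(std::complex<double> &) {}

std::complex<double> latticeInnerProduct(const std::vector<vComplex> &left,
                                         const std::vector<vComplex> &right){
  vComplex vnrm{};                                  // per-lane partial sums
  for(std::size_t ss=0; ss<left.size(); ss++){      // stage 1: loop over outer sites
    vComplex t = innerProduct(left[ss],right[ss]);
    for(int i=0;i<4;i++) vnrm.lane[i] += t.lane[i];
  }
  std::complex<double> nrm = Reduce(vnrm);          // stage 2: sum the SIMD lanes
  GlobalSum(nrm);                                   // stage 3: sum across ranks
  return nrm;
}

int main(){
  std::vector<vComplex> v(8), w(8);
  for(std::size_t ss=0; ss<v.size(); ss++)
    for(int i=0;i<4;i++){ v[ss].lane[i]={1,1}; w[ss].lane[i]={1,0}; }
  std::cout << latticeInnerProduct(v,w) << std::endl;  // 32 lanes of conj(1+i)*1 = (32,-32)
  return 0;
}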

View File

@@ -57,41 +57,6 @@ namespace QCD {
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vColourVector> LatticeColourVector;
// localNorm2,
template<class tt>
inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
{
LatticeComplex ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=trace(adj(rhs)*rhs);
}
return ret;
}
// localInnerProduct
template<class tt>
inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs)
{
LatticeComplex ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// FIXME for debug; deprecate this
inline void LatticeCoordinate(LatticeInteger &l,int mu){
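
What disappears here is only duplication: the QCD-specific localNorm2 computed trace(adj(x)*x) per site, while the generic replacement computes innerProduct(x,x) = sum_ij conj(x_ij)*x_ij, and the two coincide for vectors and matrices alike, so LatticeColourVector and friends can use the generic templates directly. A small self-contained check of that identity on a toy 2x2 complex matrix (illustrative types, not Grid's):

#include <array>
#include <complex>
#include <iostream>

typedef std::complex<double> Cplx;
typedef std::array<std::array<Cplx,2>,2> Mat;

// trace(adj(x)*x): the contraction the removed QCD localNorm2 performed.
Cplx traceAdjTimes(const Mat &x){
  Cplx t = 0;
  for(int i=0;i<2;i++)
    for(int k=0;k<2;k++)
      t += std::conj(x[k][i])*x[k][i];   // (adj(x)*x)_{ii}, summed over i
  return t;
}

// innerProduct(x,x): the contraction the generic replacement performs.
Cplx innerProduct(const Mat &x,const Mat &y){
  Cplx t = 0;
  for(int i=0;i<2;i++)
    for(int j=0;j<2;j++)
      t += std::conj(x[i][j])*y[i][j];
  return t;
}

int main(){
  Mat m;
  m[0][0]={1,2}; m[0][1]={3,-1};
  m[1][0]={0,1}; m[1][1]={2,2};
  std::cout << traceAdjTimes(m) << " == " << innerProduct(m,m) << std::endl;  // both (24,0)
  return 0;
}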

View File

@@ -29,21 +29,20 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
}
void CartesianCommunicator::GlobalSum(float &f){
MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSum(double &d)
{
MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
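
The substantive fix in this file is the aliasing rule: MPI_Allreduce must not be given the same pointer for its send and receive buffers unless the send side is MPI_IN_PLACE, so the old (&f,&f,...) calls were non-conforming and some MPI implementations reject them at runtime. A minimal stand-alone illustration of the corrected in-place reduction (plain MPI, independent of Grid):

#include <mpi.h>
#include <cstdio>

// Minimal illustration of the in-place reduction used above: every rank
// contributes its value and all ranks receive the sum, reusing one buffer.
int main(int argc, char **argv){
  MPI_Init(&argc,&argv);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  double f = 1.0 + rank;                 // per-rank contribution
  // The receive buffer doubles as the send buffer, so MPI_IN_PLACE is
  // required; passing (&f,&f,...) as before violates the aliasing rules.
  MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);

  printf("rank %d sees global sum %f\n",rank,f);
  MPI_Finalize();
  return 0;
}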

View File

@@ -8,7 +8,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
{
int rd = rhs._grid->_rdimensions[dimension];
printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
@@ -19,7 +18,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
buffer[bo++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];

View File

@@ -903,38 +903,38 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
///////////////////////////////////////////////////////////////////////////////////////
// localInnerProduct Scalar x Scalar -> Scalar
// localInnerProduct Vector x Vector -> Scalar
// localInnerProduct Matrix x Matrix -> Scalar
// innerProduct Scalar x Scalar -> Scalar
// innerProduct Vector x Vector -> Scalar
// innerProduct Matrix x Matrix -> Scalar
///////////////////////////////////////////////////////////////////////////////////////
template<class l,class r,int N> inline
auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret=zero;
for(int c1=0;c1<N;c1++){
ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
}
return ret;
}
template<class l,class r,int N> inline
auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret=zero;
iScalar<ret_t> tmp;
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}
template<class l,class r> inline
auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
{
typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
ret._internal = localInnerProduct(lhs._internal,rhs._internal);
ret._internal = innerProduct(lhs._internal,rhs._internal);
return ret;
}
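
After the rename, the tensor-level inner products form one recursion: innerProduct on an iVector or iMatrix sums the component-wise inner products and bottoms out in the scalar overloads conj(l)*r and l*r defined in the next file. A stripped-down sketch of that recursion using std::array and std::complex in place of Grid's tensor classes:

#include <array>
#include <complex>
#include <cstddef>
#include <iostream>

// Scalar base case, mirroring the ComplexD overload: conj(l)*r.
inline std::complex<double> innerProduct(const std::complex<double> &l,
                                         const std::complex<double> &r){
  return std::conj(l)*r;
}

// Vector case, mirroring the iVector overload: sum the component-wise
// inner products, so the recursion terminates in the scalar overload.
template<class T,std::size_t N>
auto innerProduct(const std::array<T,N> &l,const std::array<T,N> &r)
  -> decltype(innerProduct(l[0],r[0]))
{
  decltype(innerProduct(l[0],r[0])) ret{};
  for(std::size_t c=0;c<N;c++) ret += innerProduct(l[c],r[c]);
  return ret;
}

int main(){
  std::array<std::complex<double>,3> v = {{ {1,0},{0,1},{2,-1} }};
  std::cout << innerProduct(v,v) << std::endl;   // |1|^2+|i|^2+|2-i|^2 = (7,0)
  return 0;
}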

View File

@@ -33,10 +33,10 @@ namespace Grid {
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid

View File

@@ -313,7 +313,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
};
inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
typedef vComplexD vDComplex;

View File

@@ -362,7 +362,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
};
inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r)
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
{
return conj(l)*r;
}

View File

@@ -240,7 +240,7 @@ namespace Grid {
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
};
inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline void zeroit(vRealD &z){ vzero(z);}
inline vRealD outerProduct(const vRealD &l, const vRealD& r)

View File

@@ -260,7 +260,7 @@ friend inline void vstore(const vRealF &ret, float *a){
public:
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
};
inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);}
inline vRealF outerProduct(const vRealF &l, const vRealF& r)

TODO
View File

@@ -1,15 +1,14 @@
AUDITS:
* FIXME audit
* const audit
* Replace vset with a call to merge.;
* care in Gmerge,Gextract over vset .
* Const audit
* extract / merge extra implementation removal
FUNCTIONALITY:
* Conditional execution, where etc... -----DONE, simple test
* Integer relational support -----DONE
@@ -22,6 +21,11 @@
* Stencil operator support -----Initial thoughts, trial implementation DONE.
-----some simple tests that Stencil matches Cshift.
-----do all permute in comms phase, so that copy permute
-----cases move into a buffer.
* CovariantShift support -----Use a class to store gauge field? (parallel transport?)
* Subset support, slice sums etc... -----Only need slice sum?
-----Generic cartesian subslicing?