Comms and memory benchmarks added

2025-12-18 03:34:40 +00:00 · 2015-05-03 09:44:47 +01:00
parent 99a1ff423d
commit 193860dbc8
14 changed files with 300 additions and 59 deletions
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -14,6 +14,7 @@

 #include <complex>
 #include <vector>
+#include <valarray>
 #include <iostream>
 #include <cassert>
 #include <random>
--- a/lib/Grid_lattice.h
+++ b/lib/Grid_lattice.h
@@ -26,7 +26,8 @@ class Lattice
 public:
    GridBase *_grid;
    int checkerboard;
-    std::vector<vobj,alignedAllocator<vobj> > _odata;
+    //std::vector<vobj,alignedAllocator<vobj> > _odata;
+    std::valarray<vobj> _odata;
 public:

    typedef typename vobj::scalar_type scalar_type;
@@ -36,9 +37,9 @@ public:
    // Constructor requires "grid" passed.
    // what about a default grid?
    //////////////////////////////////////////////////////////////////
-    Lattice(GridBase *grid) : _grid(grid) {
+ Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) { // _grid is declared before _odata, so it is already set when _odata is sized to oSites() elements
       //        _odata.reserve(_grid->oSites());
-        _odata.resize(_grid->oSites());
+      // NOTE(review): valarray does not take alignedAllocator; confirm the 16-byte alignment assert below still holds for vobj
         assert((((uint64_t)&_odata[0])&0xF) ==0);
         checkerboard=0;
     }
--- a/lib/communicator/Grid_communicator_mpi.cc
+++ b/lib/communicator/Grid_communicator_mpi.cc
@@ -93,7 +93,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
-  ierr=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  
  assert(ierr==0);
--- a/lib/lattice/Grid_lattice_arith.h
+++ b/lib/lattice/Grid_lattice_arith.h
@@ -3,6 +3,9 @@

 namespace Grid {

+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // unary negation
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj>
  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
  {
@@ -13,25 +16,10 @@ namespace Grid {
    }
    return ret;
  }
-  
-  template<class vobj>
-  inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-    conformable(lhs,rhs);
-#pragma omp parallel for
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
-    }
-  }
-  template<class vobj>
-  inline void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-    conformable(lhs,rhs);
-#pragma omp parallel for
-    for(int ss=0;ss<lhs._grid->oSites();ss++){
-      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
-    }
-  }
-  
-  
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  avoid copy back routines for mult, mac, sub, add
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class obj1,class obj2,class obj3>
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
    conformable(lhs,rhs);
@@ -69,7 +57,89 @@ namespace Grid {
    }
  }
  
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  avoid copy back routines for mult, mac, sub, add: Lattice (op) single per-site object
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Lattice (op) object: calls the per-site mult kernel on (ret[ss], lhs[ss], rhs) for every site of lhs's grid.
+  template<class obj1,class obj2,class obj3> void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    assert(ret._grid==lhs._grid); // rhs is not a Lattice, so ret/lhs are the only grids that can be compared
+    const int vec_len = lhs._grid->oSites(); // hoisted bound; signed to match the loop index
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mult(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+  
+  // Lattice (op) object: calls the per-site mac kernel (presumably multiply-accumulate into ret; confirm in Grid_math_arith_mac.h) on (ret[ss], lhs[ss], rhs).
+  template<class obj1,class obj2,class obj3> void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    assert(ret._grid==lhs._grid); // rhs is not a Lattice, so ret/lhs are the only grids that can be compared
+    const int vec_len = lhs._grid->oSites(); // hoisted bound; signed to match the loop index
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mac(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+  
+  // Lattice (op) object: calls the per-site sub kernel on (ret[ss], lhs[ss], rhs) for every site of lhs's grid.
+  template<class obj1,class obj2,class obj3> void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    assert(ret._grid==lhs._grid); // rhs is not a Lattice, so ret/lhs are the only grids that can be compared
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+  // Lattice (op) object: calls the per-site add kernel on (ret[ss], lhs[ss], rhs) for every site of lhs's grid.
+  template<class obj1,class obj2,class obj3> void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    assert(ret._grid==lhs._grid); // rhs is not a Lattice, so ret/lhs are the only grids that can be compared
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  avoid copy back routines for mult, mac, sub, add: single per-site object (op) Lattice
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // object (op) Lattice: calls the per-site mult kernel on (ret[ss], lhs, rhs[ss]); rhs is the only Lattice operand, so it supplies the grid.
+  template<class obj1,class obj2,class obj3> void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    assert(ret._grid==rhs._grid); // lhs is not a Lattice and has no _grid to check or to size the loop with
+    const int vec_len = rhs._grid->oSites(); // hoisted bound; signed to match the loop index
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+  
+  // object (op) Lattice: calls the per-site mac kernel (presumably multiply-accumulate into ret; confirm in Grid_math_arith_mac.h) on (ret[ss], lhs, rhs[ss]).
+  template<class obj1,class obj2,class obj3> void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    assert(ret._grid==rhs._grid); // lhs is not a Lattice and has no _grid to check or to size the loop with
+    const int vec_len = rhs._grid->oSites(); // hoisted bound; signed to match the loop index
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+  
+  // object (op) Lattice: calls the per-site sub kernel on (ret[ss], lhs, rhs[ss]); rhs is the only Lattice operand, so it supplies the grid.
+  template<class obj1,class obj2,class obj3> void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    assert(ret._grid==rhs._grid); // lhs is not a Lattice and has no _grid to check or to size the loop with
+#pragma omp parallel for
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+  // object (op) Lattice: calls the per-site add kernel on (ret[ss], lhs, rhs[ss]); rhs is the only Lattice operand, so it supplies the grid.
+  template<class obj1,class obj2,class obj3> void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    assert(ret._grid==rhs._grid); // lhs is not a Lattice and has no _grid to check or to size the loop with
+#pragma omp parallel for
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+    }
+  }
+  
+  /////////////////////////////////////////////////////////////////////////////////////
  // Lattice BinOp Lattice,
+  /////////////////////////////////////////////////////////////////////////////////////
  template<class left,class right>
    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
  {
@@ -156,5 +226,17 @@ namespace Grid {
      }
      return ret;
    }
+
+  template<class sobj,class vobj>
+  inline void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    conformable(lhs,rhs);
+    // ret[ss] = a*lhs[ss] + rhs[ss] for every site; the temporary lives inside the loop body so each OpenMP thread has its own
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      vobj tmp; tmp = a*lhs._odata[ss]; // declared here, not before the pragma, where it would be shared and raced on
+      ret._odata[ss]= tmp+rhs._odata[ss];
+    }
+  }
+
 }
 #endif
--- a/lib/math/Grid_math_arith_mac.h
+++ b/lib/math/Grid_math_arith_mac.h
@@ -7,6 +7,7 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////// MAC         ///////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////

    ///////////////////////////
    // Legal multiplication table
@@ -74,8 +75,6 @@ inline void mac(iVector<rrtype,N> * __restrict__ ret,const iVector<ltype,N> * __
    }
    return;
 }
-
-
 }

 #endif
--- a/lib/math/Grid_math_arith_mul.h
+++ b/lib/math/Grid_math_arith_mul.h
@@ -7,7 +7,6 @@ namespace Grid {
    ///////////////////////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////////////////// MUL         ///////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////////////////////
-
    
 template<class rtype,class vtype,class mtype>
 inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> * __restrict__ lhs,const iScalar<vtype> * __restrict__ rhs){
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@@ -16,7 +16,7 @@ namespace Grid {
 // However note that doing this eliminates some syntactical sugar such as 
 // calling the constructor explicitly or implicitly
 //
-#define TENSOR_IS_POD
+#undef TENSOR_IS_POD

 template<class vtype> class iScalar
 {
@@ -36,7 +36,7 @@ public:
  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;

 #ifndef TENSOR_IS_POD
-  iScalar(){;};
+  iScalar()=default;
  iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
  iScalar(const Zero &z){ *this = zero; };
 #endif
@@ -126,7 +126,7 @@ public:

 #ifndef TENSOR_IS_POD
  iVector(const Zero &z){ *this = zero; };
-  iVector() {};// Empty constructure
+  iVector() =default;
 #endif

    iVector<vtype,N> & operator= (const Zero &hero){
@@ -189,7 +189,7 @@ public:

 #ifndef TENSOR_IS_POD
  iMatrix(const Zero &z){ *this = zero; };
-  iMatrix() {};
+  iMatrix() =default;
 #endif

  iMatrix<vtype,N> & operator= (const Zero &hero){
--- a/lib/simd/Grid_vComplexD.h
+++ b/lib/simd/Grid_vComplexD.h
@@ -13,7 +13,7 @@ namespace Grid {
            vzero(*this);
            return (*this);
        }
-        vComplexD(){};
+        vComplexD()=default;
        vComplexD(ComplexD a){
 	  vsplat(*this,a);
 	};
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Grid_vComplexF.h
@@ -28,7 +28,7 @@ namespace Grid {
            vzero(*this);
            return (*this);
        }
-        vComplexF(){};
+        vComplexF()=default;
        vComplexF(ComplexF a){
 	  vsplat(*this,a);
 	};
--- a/lib/simd/Grid_vRealD.h
+++ b/lib/simd/Grid_vRealD.h
@@ -10,10 +10,13 @@ namespace Grid {
 	typedef dvec  vector_type;
 	typedef RealD scalar_type;

-        vRealD(){};
+        vRealD()=default;
        vRealD(RealD a){
 	  vsplat(*this,a);
 	};
+        vRealD(const Zero &z){ // const ref and name z, matching the iScalar/iVector/iMatrix Zero constructors; no longer shadows the `zero` object
+	  zeroit(*this); // z is only a tag selecting this overload; it is never read
+	}

        friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
        friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
--- a/lib/simd/Grid_vRealF.h
+++ b/lib/simd/Grid_vRealF.h
@@ -8,14 +8,16 @@ namespace Grid {
        fvec v;

    public:
-
 	typedef fvec  vector_type;
 	typedef RealF scalar_type;

-        vRealF(){};
+        vRealF()=default;
        vRealF(RealF a){
 	  vsplat(*this,a);
 	};
+        vRealF(const Zero &z){ // const ref and name z, matching the iScalar/iVector/iMatrix Zero constructors; no longer shadows the `zero` object
+	  zeroit(*this); // z is only a tag selecting this overload; it is never read
+	}
        ////////////////////////////////////
        // Arithmetic operator overloads +,-,*
        ////////////////////////////////////