Merge branch 'develop' into feature/hadrons

2026-06-24 04:23:30 +01:00 · 2018-05-01 14:07:32 +01:00
parent edc28dcfbf aaf37ee4d7
commit ca639c195f
15 changed files with 694 additions and 104 deletions
@@ -479,15 +479,13 @@ until convergence
 	Field B(grid); B.checkerboard = evec[0].checkerboard;

 	//  power of two search pattern;  not every evalue in eval2 is assessed.
+	int allconv =1;
 	for(int jj = 1; jj<=Nstop; jj*=2){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
-	    if ( j > Nconv ) {
-	      Nconv=j+1;
-	      jj=Nstop; // Terminate the scan
-	    }
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
+	    allconv=0;
 	  }
 	}
 	// Do evec[0] for good measure
@@ -495,8 +493,10 @@ until convergence
 	  int j=0;
 	  RealD e = eval2_copy[0]; 
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
-	  _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox);
+	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
 	}
+	if ( allconv ) Nconv = Nstop;
+
 	// test if we converged, if so, terminate
 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
 	//	if( Nconv>=Nstop || beta_k < betastp){
@@ -48,6 +48,7 @@ struct LanczosParams : Serializable {
 struct LocalCoherenceLanczosParams : Serializable {
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
+				  bool, saveEvecs,
 				  bool, doFine,
 				  bool, doFineRead,
 				  bool, doCoarse,
@@ -277,7 +277,9 @@ public:
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
+#ifdef GRID_OMP
 #pragma omp parallel for schedule(static)
+#endif
      for(size_type n=0;n<bytes;n+=4096){
 	cp[n]=0;
      }
@@ -45,31 +45,33 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
  int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+  int ent = 0;
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);

  int stride=rhs._grid->_slice_stride[dimension];
  if ( cbmask == 0x3 ) { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o  = n*stride;
 	int bo = n*e2;
-	buffer[off+bo+b]=rhs._odata[so+o+b];
+	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
      }
    }
  } else { 
     int bo=0;
-     std::vector<std::pair<int,int> > table;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	 int o  = n*stride;
 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	 if ( ocb &cbmask ) {
-	   table.push_back(std::pair<int,int> (bo++,o+b));
+	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 	 }
       }
     }
-     parallel_for(int i=0;i<table.size();i++){
-       buffer[off+table[i].first]=rhs._odata[so+table[i].second];
-     }
+  }
+  parallel_for(int i=0;i<ent;i++){
+    buffer[table[i].first]=rhs._odata[table[i].second];
  }
 }

@@ -140,31 +142,35 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int stride=rhs._grid->_slice_stride[dimension];
-  
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent    =0;
+
  if ( cbmask ==0x3 ) {
-    parallel_for_nest2(int n=0;n<e1;n++){
+
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
-	rhs._odata[so+o+b]=buffer[bo+b];
+	table[ent++] = std::pair<int,int>(so+o+b,bo+b);
      }
    }
+
  } else { 
-    std::vector<std::pair<int,int> > table;
    int bo=0;
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
-	  table.push_back(std::pair<int,int> (so+o+b,bo++));
+	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
 	}
      }
    }
-    parallel_for(int i=0;i<table.size();i++){
-       //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
-       rhs._odata[table[i].first]=buffer[table[i].second];
-     }
+  }
+
+  parallel_for(int i=0;i<ent;i++){
+    rhs._odata[table[i].first]=buffer[table[i].second];
  }
 }

@@ -228,29 +234,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
  int stride = rhs._grid->_slice_stride[dimension];
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent=0;
+
  if(cbmask == 0x3 ){
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
- 
        int o =n*stride+b;
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
  } else { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
- 
        int o =n*stride+b;
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	}
      }
    }
  }
-  
+
+  parallel_for(int i=0;i<ent;i++){
+    lhs._odata[table[i].first]=rhs._odata[table[i].second];
+  }
+
 }

 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
@@ -269,16 +278,28 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int e2=rhs._grid->_slice_block [dimension];
  int stride = rhs._grid->_slice_stride[dimension];

-  parallel_for_nest2(int n=0;n<e1;n++){
-  for(int b=0;b<e2;b++){
+  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
+  int ent=0;

+  double t_tab,t_perm;
+  if ( cbmask == 0x3 ) {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
+      int o  =n*stride;
+      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  } else {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
      int o  =n*stride;
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
-	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
-      }
+      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  }

-  }}
+  parallel_for(int i=0;i<ent;i++){
+    permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
+  }
 }

 //////////////////////////////////////////////////////
@@ -291,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

+  double t_local;
+  
  if ( sshift[0] == sshift[1] ) {
    Cshift_local(ret,rhs,dimension,shift,0x3);
  } else {
@@ -299,7 +322,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  }
 }

-template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid = rhs._grid;
  int fd = grid->_fdimensions[dimension];
@@ -325,11 +348,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice

    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
    int sx     = (x+sshift)%rd;
-
-    // FIXME : This must change where we have a 
-    // Rotate slice.
    
-    // Document how this works ; why didn't I do this when I first wrote it...
    // wrap is whether sshift > rd.
    //  num is sshift mod rd.
    // 
@@ -365,10 +384,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice

    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
  
  }
-  return ret;
 }
 }
 #endif
@@ -256,13 +256,42 @@ public:
      _odata[ss]=r._odata[ss];
    }  	
  }
-  
+
  Lattice(Lattice&& r){ // move constructor
    _grid = r._grid;
    checkerboard = r.checkerboard;
    _odata=std::move(r._odata);
  }
  
+  inline Lattice<vobj> & operator = (Lattice<vobj> && r)
+  {
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata       =std::move(r._odata);
+    return *this;
+  }
+
+  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata.resize(_grid->oSites());// essential
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
+    }  	
+    return *this;
+  }
+
+  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+    this->checkerboard = r.checkerboard;
+    conformable(*this,r);
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      this->_odata[ss]=r._odata[ss];
+    }
+    return *this;
+  }
+
  virtual ~Lattice(void) = default;
    
  void reset(GridBase* grid) {
@@ -281,33 +310,6 @@ public:
    return *this;
  }
  
-  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
-    this->checkerboard = r.checkerboard;
-    conformable(*this,r);
-    
-    parallel_for(int ss=0;ss<_grid->oSites();ss++){
-      this->_odata[ss]=r._odata[ss];
-    }
-    return *this;
-  }
-
-  strong_inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
-    _grid        = r._grid;
-    checkerboard = r.checkerboard;
-    _odata.resize(_grid->oSites());// essential
-    
-    parallel_for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss]=r._odata[ss];
-    }  	
-    return *this;
-  }
-  strong_inline Lattice<vobj> & operator = (Lattice<vobj> && r)
-  {
-    _grid        = r._grid;
-    checkerboard = r.checkerboard;
-    _odata       =std::move(r._odata);
-    return *this;
-  }
  
  // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
  template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
@@ -179,7 +179,7 @@ namespace Grid {
      return ret;
    }

-#define DECLARE_RELATIONAL(op,functor) \
+#define DECLARE_RELATIONAL_EQ(op,functor) \
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
    {\
@@ -198,11 +198,6 @@ namespace Grid {
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
-  template<class vsimd,IfSimd<vsimd> = 0>\
-    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
-    {									\
-      return lhs._internal op rhs._internal;				\
-    }									\
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
    {									\
@@ -212,14 +207,21 @@ namespace Grid {
    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
    {									\
      return lhs op rhs._internal;					\
-    }									
+    }									\

+#define DECLARE_RELATIONAL(op,functor) \
+  DECLARE_RELATIONAL_EQ(op,functor)    \
+  template<class vsimd>\
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+    {									\
+      return lhs._internal op rhs._internal;				\
+    }									

 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
 DECLARE_RELATIONAL(>,sgt);
 DECLARE_RELATIONAL(>=,sge);
-DECLARE_RELATIONAL(==,seq);
+DECLARE_RELATIONAL_EQ(==,seq);
 DECLARE_RELATIONAL(!=,sne);

 #undef DECLARE_RELATIONAL
@@ -110,11 +110,11 @@ class BinaryIO {
      lsites = 1;
    }

-    #pragma omp parallel
+PARALLEL_REGION
    {
      uint32_t nersc_csum_thr = 0;

-      #pragma omp for
+PARALLEL_FOR_LOOP_INTERN
      for (uint64_t local_site = 0; local_site < lsites; local_site++)
      {
        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
@@ -124,7 +124,7 @@ class BinaryIO {
        }
      }

-      #pragma omp critical
+PARALLEL_CRITICAL
      {
        nersc_csum += nersc_csum_thr;
      }
@@ -146,14 +146,14 @@ class BinaryIO {
    std::vector<int> local_start =grid->LocalStarts();
    std::vector<int> global_vol  =grid->FullDimensions();

-#pragma omp parallel
+PARALLEL_REGION
    { 
      std::vector<int> coor(nd);
      uint32_t scidac_csuma_thr=0;
      uint32_t scidac_csumb_thr=0;
      uint32_t site_crc=0;

-#pragma omp for
+PARALLEL_FOR_LOOP_INTERN
      for(uint64_t local_site=0;local_site<lsites;local_site++){

 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
@@ -183,7 +183,7 @@ class BinaryIO {
 	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
      }

-#pragma omp critical
+PARALLEL_CRITICAL
      {
 	scidac_csuma^= scidac_csuma_thr;
 	scidac_csumb^= scidac_csumb_thr;
@@ -40,7 +40,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)")
 #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
-#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
+#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for schedule(static) collapse(2)")
 #define PARALLEL_REGION       _Pragma("omp parallel")
 #define PARALLEL_CRITICAL     _Pragma("omp critical")
 #else