Merge branch 'master' of https://github.com/paboyle/Grid

2025-09-17 16:51:04 +01:00 · 2015-12-09 12:48:44 +00:00
parent 967be91692 26161addd0
commit a32a59fc43
24 changed files with 406 additions and 221 deletions
--- a/lib/Config.h.in
+++ b/lib/Config.h.in
@@ -137,9 +137,6 @@
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME

-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION

--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -17,7 +17,6 @@

 #define __X86_64

-
 #ifdef HAVE_EXECINFO_H
 #include <execinfo.h>
 #endif
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -5,22 +5,29 @@
 #include <ctime>
 #include <chrono>
 #include <string.h>
-#include <sys/ioctl.h>
-#include <sys/syscall.h>
-#include <linux/perf_event.h>

+#include <sys/ioctl.h>
+
+#ifdef __linux__
+#include <syscall.h>
+#include <linux/perf_event.h>
+#else
+#include <sys/syscall.h>
+#endif
 namespace Grid {


+#ifdef __linux__
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 			    int cpu, int group_fd, unsigned long flags)
 {
-  int ret;
+  int ret=0;

  ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
 		group_fd, flags);
  return ret;
 }
+#endif


 class PerformanceCounter {
@@ -63,7 +70,6 @@ public:
    
  int PCT;

-  struct perf_event_attr pe;
  long long count;
  int fd;
  uint64_t elapsed;
@@ -74,15 +80,19 @@ public:
  }

  PerformanceCounter(int _pct) {
+#ifdef __linux__
    assert(_pct>=0);
    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
    fd=-1;
    count=0;
    PCT =_pct;
    Open();
+#endif
  }
  void Open(void) 
  {
+#ifdef __linux__
+    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(struct perf_event_attr));
    pe.size = sizeof(struct perf_event_attr);

@@ -99,32 +109,48 @@ public:
      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
      perror("Error is");
    }
+#endif
  }

  void Start(void)
  {
+#ifdef __linux__ // Linux: reset and enable the counter when fd is open, then record __rdtsc() in begin
    if ( fd!= -1) {
      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    }
    begin  =__rdtsc();
+#else // non-Linux build: no counter is touched and no timestamp is read
+    begin = 0;
+#endif // __linux__
  }

  void Stop(void) {
    count=0;
+#ifdef __linux__ // Linux: disable the counter and read it into count when fd is open; elapsed is the __rdtsc() delta since begin
    if ( fd!= -1) {
      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
      ::read(fd, &count, sizeof(long long));
    }
    elapsed = __rdtsc() - begin;
+#else // non-Linux build: count stays 0 and elapsed is forced to 0
+    elapsed = 0;
+#endif // __linux__
+
  }
  void Report(void) {
+#ifdef __linux__ // Linux: print elapsed, the configured counter name and the counter value
    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
+#else // non-Linux build: only elapsed is printed
+    printf("%llu cycles \n", elapsed );
+#endif // __linux__
  }

  ~PerformanceCounter()
  {
+#ifdef __linux__ // fd is only assigned inside the __linux__ section of the constructor
    close(fd);
+#endif // __linux__ -- NOTE(review): close() above is called even when fd is -1
  }

 };
--- a/lib/Threads.h
+++ b/lib/Threads.h
@@ -44,7 +44,7 @@ class GridThread {
  };
  static void SetMaxThreads(void) { 
 #ifdef GRID_OMP
-    setenv("KMP_AFFINITY","balanced",1);
+    //    setenv("KMP_AFFINITY","balanced",1);
    _threads = omp_get_max_threads();
    omp_set_num_threads(_threads);
 #else 
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -264,6 +264,9 @@ PARALLEL_FOR_LOOP

      for(int i=0;i<nbasis;i++){
 	phi=Subspace.subspace[i];
+	
+	std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
+
 	for(int p=0;p<geom.npoint;p++){ 

 	  int dir   = geom.directions[p];
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -166,7 +166,6 @@ namespace Grid {
      Field *Tn  = &T1;
      Field *Tnp = &T2;

-      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
      // Tn=T1 = (xscale M + mscale)in
      RealD xscale = 2.0/(hi-lo);
      RealD mscale = -(hi+lo)/(hi-lo);
--- a/lib/algorithms/iterative/DenseMatrix.h
+++ b/lib/algorithms/iterative/DenseMatrix.h
@@ -25,6 +25,9 @@ template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
  assert(N==M);
 }

+template<class T> void Resize(DenseVector<T > & mat, int N) { // vector overload: forwards to mat.resize(N)
+  mat.resize(N);
+}
 template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
  mat.resize(N);
  for(int i=0;i<N;i++){
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -1,4 +1,3 @@
-#if 0
 #ifndef GRID_IRL_H
 #define GRID_IRL_H

@@ -18,8 +17,9 @@ template<class Field>
    const RealD small = 1.0e-16;
 public:       
    int lock;
-    int converged;
+    int get;
    int Niter;
+    int converged;

    int Nk;      // Number of converged sought
    int Np;      // Np -- Number of spare vecs in kryloc space
@@ -59,6 +59,7 @@ public:
    // Sanity checked this routine (step) against Saad.
    /////////////////////////
    void RitzMatrix(DenseVector<Field>& evec,int k){
+
      if(1) return;

      GridBase *grid = evec[0]._grid;
@@ -451,8 +452,9 @@ until convergence
 	std::cout << " -- Nconv       = "<< Nconv  << "\n";
      }

-
+    /////////////////////////////////////////////////
    // Adapted from Rudy's lanczos factor routine
+    /////////////////////////////////////////////////
    int Lanczos_Factor(int start, int end,  int cont,
 		       DenseVector<Field> & bq, 
 		       Field &bf,
@@ -546,10 +548,16 @@ until convergence
 	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';

 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
-#if 0
 	int re = 0;
+	// FIXME: ref and rho were undefined here; how are they set in Rudy's code?
+	int ref =0;
+	Real rho = 1.0e-8;
+
 	while( re == ref || (sqbt < rho * bck && re < 5) ){

+	  Field tmp2(grid);
+	  Field tmp1(grid);
+
 	  //bex = V^dag bf
 	  DenseVector<ComplexD> bex(j+1);
 	  for(int k=0;k<j+1;k++){
@@ -566,14 +574,14 @@ until convergence

 	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
 	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
-	  alpha = alpha + bex[j];	      sqbt = sqrt(real(btc));	      
+	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
+	  // FIXME: is alpha real in Rudy's code? (only real(bex[j]) is accumulated above)
 	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
 	  bck = sqrt( nmbex );
 	  re++;
 	}
 	std::cout << "Iteratively refined orthogonality, changes alpha\n";
 	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
-#endif
 	H[j][j]=alpha;
      }

@@ -641,7 +649,7 @@ until convergence
      int M=Nm;

      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-      Resize(evals,Nm,Nm);
+      Resize(evals,Nm);
      Resize(evecs,Nm);

      int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
@@ -702,7 +710,6 @@ until convergence
 	RealD beta;

 	Householder_vector<RealD>(ck, 0, 2, v, beta);
-
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
 	///Accumulate eigenvector
@@ -758,11 +765,11 @@ until convergence
      RealD resid_nrm=  norm2(bf);

      if(!lock) converged = 0;
-
+#if 0
      for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){

 	RealD diff = 0;
-	diff = abs(tevecs[i][Nm - 1 - lock_num]) * resid_nrm;
+	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;

 	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;

@@ -785,53 +792,29 @@ until convergence
 	  break;
 	}
      }
+#endif
      std::cout << "Got " << converged << " so far " <<std::endl;	
    }
-#if 0
-    ///Check
-    void Check(void) {

-      DenseVector<RealD> goodval(get);
+    ///Check
+    void Check(DenseVector<RealD> &evals,
+	       DenseVector<DenseVector<RealD> > &evecs) {
+
+      DenseVector<RealD> goodval(this->get);
+
      EigenSort(evals,evecs);

      int NM = Nm;
-      int Nget = this->get;
-      S **V;
-      V = new S* [NM];

-      RealD *QZ;
-      QZ = new RealD [NM*NM];
+      DenseVector< DenseVector<RealD> > V; Size(V,NM);
+      DenseVector<RealD> QZ(NM*NM);
+
      for(int i = 0; i < NM; i++){
 	for(int j = 0; j < NM; j++){
-
-	  QZ[i*NM+j] = this->evecs[i][j];
-      
-          int f_size_cb = 24*dop.cbLs*dop.node_cbvol;
-
-	  for(int cb = this->prec; cb < 2; cb++){
-	    for(int i = 0; i < NM; i++){
-	      V[i] = (S*)(this->bq[i][cb]);
-
-	      const int m0 = 4 * 4; // this is new code
-	      assert(m0 % 16 == 0); // see the reason in VtimesQ.C
-
-	      const int row_per_thread = f_size_cb / (bfmarg::threads);
-	      {
-
-		{
-		  DenseVector<RealD> vrow_tmp0(m0*NM);
-		  DenseVector<RealD> vrow_tmp1(m0*NM);
-		  RealD *row_tmp0 = vrow_tmp0.data();
-		  RealD *row_tmp1 = vrow_tmp1.data();
-		  VtimesQ(QZ, NM, V, row_tmp0, row_tmp1, id * row_per_thread, m0, (id + 1) * row_per_thread);
-		}
-	      }
-	    }
-	  }
+	  // evecs[i][j];
 	}
      }
    }
-#endif


 /**
@@ -1020,4 +1003,4 @@ static void Lock(DenseMatrix<T> &H, 	///Hess mtx

 }
 #endif
-#endif
+
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -24,6 +24,17 @@ PARALLEL_FOR_LOOP
    return ret;
  }

+  template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){ // returns a new lattice holding div(site,y) for every site of rhs
+    Lattice<obj> ret(rhs._grid); // result is built on the same grid as rhs
+    ret.checkerboard = rhs.checkerboard; // copy the checkerboard tag before the conformable check
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){ // one iteration per entry counted by oSites()
+      ret._odata[ss]=div(rhs._odata[ss],y); // defers to the div overload for the site object type
+    }
+    return ret;
+  }
+
  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -266,11 +266,8 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
    if( this->HandOptDslash ) {
 #pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
 	for(int s=0;s<Ls;s++){
-	  int sU=ss;
-	  if (    LebesgueOrder::UseLebesgueOrder ) {
-	    sU=lo.Reorder(ss);
-	  }
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
@@ -323,52 +320,42 @@ PARALLEL_FOR_LOOP
      //      Counter.Report();
      //      }
    } else if( this->HandOptDslash ) {
+      /*

-#pragma omp parallel for 
+#pragma omp parallel for schedule(static)
      for(int t=0;t<threads;t++){

 	int hyperthread = t%HT;
 	int core        = t/HT;

-        int sswork, swork,soff, sU,sF;
-
-	sswork = (nwork + cores-1)/cores;
+        int sswork, swork,soff,ssoff,  sU,sF;
+	
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);

 	for(int ss=0;ss<sswork;ss++){
-	  sU=ss+core*sswork; // max locality within an L2 slice
-	  if ( LebesgueOrder::UseLebesgueOrder ) {
-	    sU = lo.Reorder(sU);
+	  sU=ss+ ssoff;
+	  for(int s=soff;s<soff+swork;s++){
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	  }
-	  if ( sU < nwork ) {
-	    for(int s=soff;s<soff+swork;s++){
-	      sF = s+Ls*sU;
-	      Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
-	    }
-	  }
-	}
-      }
-
-      /*
-#pragma omp parallel for schedule(static)
-      for(int ss=0;ss<U._grid->oSites();ss++){
-	for(int s=0;s<Ls;s++){
-	  int sU=ss;
-	  if (    LebesgueOrder::UseLebesgueOrder ) {
-	    sU=lo.Reorder(ss);
-	  }
-	  int sF = s+Ls*sU;
-	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
      }
      */

+#pragma omp parallel for schedule(static)
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
+	}
+      }
    } else { 
 PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
 	for(int s=0;s<Ls;s++){
-	  //	  int sU=lo.Reorder(ss);
-	  int sU=ss;
 	  int sF = s+Ls*sU; 
 	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
--- a/lib/qcd/action/fermion/WilsonKernels.h
+++ b/lib/qcd/action/fermion/WilsonKernels.h
@@ -29,7 +29,7 @@ namespace Grid {
 			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 #if defined(AVX512) || defined(IMCI)
-     void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
+     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 			      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
 #else
--- a/lib/serialisation/BinaryIO.cc
+++ b/lib/serialisation/BinaryIO.cc
@@ -1,15 +1,14 @@
 #include <Grid.h>

-using namespace Grid;
-using namespace std;

+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-BinaryWriter::BinaryWriter(const string &fileName)
-: file_(fileName, ios::binary|ios::out)
+BinaryWriter::BinaryWriter(const std::string &fileName)
+: file_(fileName, std::ios::binary|std::ios::out)
 {}

 template <>
-void BinaryWriter::writeDefault(const string &s, const string &output)
+void BinaryWriter::writeDefault(const std::string &s, const std::string &output)
 {
  uint64_t sz = output.size();
  
@@ -21,12 +20,12 @@ void BinaryWriter::writeDefault(const string &s, const string &output)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-BinaryReader::BinaryReader(const string &fileName)
-: file_(fileName, ios::binary|ios::in)
+BinaryReader::BinaryReader(const std::string &fileName)
+: file_(fileName, std::ios::binary|std::ios::in)
 {}

 template <>
-void BinaryReader::readDefault(const string &s, string &output)
+void BinaryReader::readDefault(const std::string &s, std::string &output)
 {
  uint64_t sz;
  
@@ -34,3 +33,4 @@ void BinaryReader::readDefault(const string &s, string &output)
  output.reserve(sz);
  file_.read((char *)output.data(), sz);
 }
+}
--- a/lib/serialisation/TextIO.cc
+++ b/lib/serialisation/TextIO.cc
@@ -1,14 +1,12 @@
 #include <Grid.h>

-using namespace Grid;
-using namespace std;
-
+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-TextWriter::TextWriter(const string &fileName)
-: file_(fileName, ios::out)
+TextWriter::TextWriter(const std::string &fileName)
+: file_(fileName, std::ios::out)
 {}

-void TextWriter::push(const string &s)
+void TextWriter::push(const std::string &s)
 {
  level_++;
 };
@@ -27,11 +25,11 @@ void TextWriter::indent(void)
 };

 // Reader implementation ///////////////////////////////////////////////////////
-TextReader::TextReader(const string &fileName)
-: file_(fileName, ios::in)
+TextReader::TextReader(const std::string &fileName)
+: file_(fileName, std::ios::in)
 {}

-void TextReader::push(const string &s)
+void TextReader::push(const std::string &s)
 {
  level_++;
 };
@@ -50,17 +48,18 @@ void TextReader::checkIndent(void)
    file_.get(c);
    if (c != '\t')
    {
-      cerr << "mismatch on tab " << c << " level " << level_;
-      cerr << " i "<< i <<endl;
-      abort();
+      std::cerr << "mismatch on tab " << c << " level " << level_;
+      std::cerr << " i "<< i <<std::endl;
+      std::abort();
    }
  }
 }

 template <>
-void TextReader::readDefault(const string &s, string &output)
+void TextReader::readDefault(const std::string &s, std::string &output)
 {
  checkIndent();
  output.clear();
  getline(file_, output);
 }
+}
--- a/lib/serialisation/XmlIO.cc
+++ b/lib/serialisation/XmlIO.cc
@@ -1,10 +1,8 @@
 #include <Grid.h>

-using namespace Grid;
-using namespace std;
-
+namespace Grid {
 // Writer implementation ///////////////////////////////////////////////////////
-XmlWriter::XmlWriter(const string &fileName)
+XmlWriter::XmlWriter(const std::string &fileName)
 : fileName_(fileName)
 {
  node_ = doc_.append_child();
@@ -16,7 +14,7 @@ XmlWriter::~XmlWriter(void)
  doc_.save_file(fileName_.c_str(), "  ");
 }

-void XmlWriter::push(const string &s)
+void XmlWriter::push(const std::string &s)
 {
  node_ = node_.append_child(s.c_str());
 }
@@ -27,22 +25,22 @@ void XmlWriter::pop(void)
 }

 // Reader implementation ///////////////////////////////////////////////////////
-XmlReader::XmlReader(const string &fileName)
+XmlReader::XmlReader(const std::string &fileName)
 : fileName_(fileName)
 {
  pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
  
  if ( !result )
  {
-    cerr << "XML error description: " << result.description() << "\n";
-    cerr << "XML error offset     : " << result.offset        << "\n";
-    abort();
+    std::cerr << "XML error description: " << result.description() << "\n";
+    std::cerr << "XML error offset     : " << result.offset        << "\n";
+    std::abort();
  }
  
  node_ = doc_.child("grid");
 }

-void XmlReader::push(const string &s)
+void XmlReader::push(const std::string &s)
 {
  node_ = node_.child(s.c_str());
 }
@@ -53,7 +51,8 @@ void XmlReader::pop(void)
 }

 template <>
-void XmlReader::readDefault(const string &s, string &output)
+void XmlReader::readDefault(const std::string &s, std::string &output)
 {
  output = node_.child(s.c_str()).first_child().value();
 }
+}
--- a/lib/serialisation/XmlIO.h
+++ b/lib/serialisation/XmlIO.h
@@ -96,6 +96,7 @@ namespace Grid
      node_.child("elem").set_name("elem-done");
      i++;
    }
+    //    assert( is.tellg()==-1);
    pop();
  }
  
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -67,6 +67,14 @@ namespace Grid {
    }
  };

+  template<class scalar> struct DivIntFunctor { // unary functor: divides its argument by a fixed Integer y
+    Integer y; // divisor captured at construction
+  DivIntFunctor(Integer _y) : y(_y) {};
+    scalar operator()(const scalar &a)  const {
+      return Integer(a)/y; // a is converted to Integer before dividing; y==0 is not checked
+    }
+  };
+
  template<class scalar> struct RealFunctor {
    scalar operator()(const scalar &a)  const {
      return real(a);
@@ -131,6 +139,10 @@ namespace Grid {
  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(ModIntFunctor<S>(y),r);
  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> div(const Grid_simd<S,V> &r,Integer y) { // applies DivIntFunctor<S>(y) to r through SimdApply
+    return SimdApply(DivIntFunctor<S>(y),r);
+  }
  ////////////////////////////////////////////////////////////////////////////
  // Allows us to assign into **conformable** real vectors from complex
  ////////////////////////////////////////////////////////////////////////////
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -111,7 +111,7 @@ template<class obj,int N> inline auto toComplex(const iMatrix<obj,N> &z) -> type
  return ret;
 }

-
+BINARY_RSCALAR(div,Integer);
 BINARY_RSCALAR(mod,Integer);
 BINARY_RSCALAR(pow,RealD);