Merge branch 'develop' into sycl

2025-06-17 15:27:06 +01:00 · 2020-06-09 04:00:12 -04:00
parent 616d3dd737 ffbb3fc02c
commit cdf0a04fc5
85 changed files with 2632 additions and 1334 deletions
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -37,218 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>

 NAMESPACE_BEGIN(Grid); 

-  ////////////////////////////////////////////////////////
-  // Move following 100 LOC to lattice/Lattice_basis.h
-  ////////////////////////////////////////////////////////
-template<class Field>
-void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) 
-{
-  // For each of the first k basis vectors in turn, subtract from w the
-  // component innerProduct(basis[n],w) * basis[n].  w is updated after every
-  // step, so later inner products see the already-reduced w.  w is not
-  // normalised here.
-  //
-  // Possible optimisation (not implemented): if the basis[n] are assumed to be
-  // orthonormal already, all inner products could be taken in one pass,
-  // saving roughly 2x bandwidth there and 3x on the update line --
-  // perhaps 2.5x faster here, 2x overall in Multigrid Lanczos.
-  int n = 0;
-  while (n < k) {
-    Field &b = basis[n];
-    auto overlap = innerProduct(b,w);
-    w = w - overlap*b;
-    ++n;
-  }
-}
-
-template<class Field>
-void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) 
-{
-  // In-place rotation of a block of basis vectors:
-  //
-  //   basis[j] <- sum over k in [k0,k1) of Qt(j,k) * basis[k],   for j in [j0,j1)
-  //
-  // For each site (or block of sites) every new value is accumulated in
-  // scratch storage before any basis[j] is overwritten, so the sums are
-  // always formed from the old vectors.
-  //
-  //   basis  : vectors to rotate; must be non-empty (basis[0] supplies the grid)
-  //   Qt     : coefficient matrix, read as Qt(j,k)
-  //   j0,j1  : half-open range of rows that are rewritten
-  //   k0,k1  : half-open range of vectors that are summed
-  //   Nm     : stride of the scratch storage; j1 must not exceed Nm because
-  //            the per-thread scratch buffer B holds Nm entries indexed by j
-  GridBase* grid = basis[0].Grid();
-
-  typedef typename Field::vector_object vobj;
-  typedef decltype(basis[0].View(CpuWrite)) View;
-
-  Vector<View> basis_v; basis_v.reserve(basis.size());
-
-  // One open view per basis vector; each is closed again at the end of the function
-  for(int k=0;k<basis.size();k++) basis_v.push_back(basis[k].View(CpuWrite));
-
-  View *basis_vp = &basis_v[0];
-
-#if 1
-  // Host path: each thread owns a private slice of Nm accumulators
-  std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
-  thread_region
-  {
-    vobj* B = Bt.data() + Nm * thread_num();
-
-    thread_for_in_region(ss, grid->oSites(),{
-      for(int j=j0; j<j1; ++j) B[j]=0.;
-      
-      for(int j=j0; j<j1; ++j){
-        for(int k=k0; k<k1; ++k){
-          B[j] +=Qt(j,k) * basis_v[k][ss];
-        }
-      }
-      // Write back only after every row for this site has been accumulated
-      for(int j=j0; j<j1; ++j){
-        basis_v[j][ss] = B[j];
-      }
-    });
-  }
-#else
-
-  int nrot = j1-j0;
-
-
-  uint64_t oSites   =grid->oSites();
-  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
-
-  //  printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
-
-  Vector <vobj> Bt(siteBlock * nrot); 
-  auto Bp=&Bt[0];
-
-  // GPU readable copy of Eigen matrix
-  Vector<double> Qt_jv(Nm*Nm);
-  double *Qt_p = & Qt_jv[0];
-  for(int k=0;k<Nm;++k){
-    for(int j=0;j<Nm;++j){
-      Qt_p[j*Nm+k]=Qt(j,k);
-    }
-  }
-
-  // Block the loop to keep storage footprint down
-  for(uint64_t s=0;s<oSites;s+=siteBlock){
-
-    // remaining work in this block
-    int ssites=MIN(siteBlock,oSites-s);
-
-    // zero out the accumulators
-    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
-        auto z=coalescedRead(Bp[ss]);
-        z=Zero();
-        coalescedWrite(Bp[ss],z);
-    });
-
-    // accumulate: one work item per (site in block, output row) pair
-    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
-        
-      int j =sj%nrot;
-      int jj  =j0+j;
-      int ss =sj/nrot;
-      int sss=ss+s;
-
-      for(int k=k0; k<k1; ++k){
-        auto tmp = coalescedRead(Bp[ss*nrot+j]);
-        coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
-      }
-    });
-
-    // copy the accumulated block back into the basis vectors
-    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
-      int j =sj%nrot;
-      int jj  =j0+j;
-      int ss =sj/nrot;
-      int sss=ss+s;
-      coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
-    });
-  }
-#endif
-  // Close every view opened above.  This must sit after the #endif: when it
-  // lived inside the #else branch the active (#if 1) path returned with all
-  // of its views still open.
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
-}
-
-// Extract a single rotated vector:
-//
-//   result <- sum over k in [k0,k1) of Qt(j,k) * basis[k]
-//
-// The basis vectors are only read.  result takes its checkerboard from
-// basis[0], so basis must be non-empty.  Row j of Qt is copied into a
-// length-Nm array first, so k1 must not exceed Nm.
-template<class Field>
-void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) 
-{
-  GridBase* grid = basis[0].Grid();
-  typedef typename Field::vector_object vobj;
-  // Name the view type with the same mode that is used to open the views below
-  typedef decltype(basis[0].View(AcceleratorRead)) View;
-
-  result.Checkerboard() = basis[0].Checkerboard();
-
-  autoView(result_v,result, AcceleratorWrite);
-  Vector<View> basis_v; basis_v.reserve(basis.size());
-
-  for(int k=0;k<basis.size();k++) basis_v.push_back(basis[k].View(AcceleratorRead));
-
-  // Take the element pointer only once the views exist; forming &basis_v[0]
-  // while the vector was still empty indexed an empty container.
-  View * basis_vp = &basis_v[0];
-
-  // Plain array copy of row j of Qt for use inside the kernel
-  Vector<double> Qt_jv(Nm);  double * Qt_j = & Qt_jv[0];
-
-  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
-
-  accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
-    // The read only fixes the accumulator's type; its value is discarded
-    auto B=coalescedRead(basis_vp[k0][ss]);
-    B=Zero();
-    for(int k=k0; k<k1; ++k){
-      B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
-    }
-    coalescedWrite(result_v[ss], B);
-  });
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
-}
-
-template<class Field>
-void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx) 
-{
-  // Apply the permutation described by idx to _v and sort_vals using swaps
-  // only.  idx[n] names the element that should end up in slot n; idx is
-  // consumed in the process and holds idx[n]==n for every visited slot on exit.
-  int vlen = idx.size();
-
-  assert(vlen>=1);
-  assert(vlen<=sort_vals.size());
-  assert(vlen<=_v.size());
-
-  for (size_t slot=0;slot<vlen;slot++) {
-
-    // Nothing to do when the slot already holds its own element
-    if (idx[slot] == slot) continue;
-
-    //////////////////////////////////////
-    // Slot `slot` wants the element currently at idx[slot]; swap the two.
-    // The element evicted from `slot` is wanted by some later position
-    // `wanter` (the one with idx[wanter]==slot); after the swap it lives at
-    // the old idx[slot], so redirect idx[wanter] there and mark `slot` done.
-    //////////////////////////////////////
-    size_t wanter = slot;
-    while (wanter<idx.size() && !(idx[wanter]==slot)) {
-      wanter++;
-    }
-
-    assert(idx[slot] > slot);
-    assert(wanter!=idx.size());
-    assert(idx[wanter]==slot);
-
-    const int source = idx[slot];
-
-    swap(_v[slot],_v[source]); // should use vector move constructor, no data copy
-    std::swap(sort_vals[slot],sort_vals[source]);
-
-    idx[wanter] = source;
-    idx[slot] = slot;
-  }
-}
-
-inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals) 
-{
-  // Returns the indices 0..sort_vals.size()-1 ordered so that the values they
-  // select have ascending absolute value.  sort_vals itself is not modified.
-  auto smallerMagnitude = [&sort_vals](int lhs, int rhs) {
-    return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
-  };
-
-  // Start from the identity ordering, then sort the indices, not the values
-  std::vector<int> order(sort_vals.size());
-  std::iota(order.begin(), order.end(), 0);
-  std::sort(order.begin(), order.end(), smallerMagnitude);
-
-  return order;
-}
-
-template<class Field>
-void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse) 
-{
-  // Reorder _v and sort_vals together, using the index order produced by
-  // basisSortGetIndex(sort_vals); with reverse set, that order is flipped
-  // before it is applied.
-  std::vector<int> order = basisSortGetIndex(sort_vals);
-
-  if (reverse) {
-    std::reverse(order.begin(), order.end());
-  }
-
-  basisReorderInPlace(_v,sort_vals,order);
-}
-
-// Accumulate into result, starting from zero:
-//
-//   result = sum over i of ( innerProduct(_v[i],src_orig) / eval[i] ) * _v[i]
-//
-// _v and eval must have the same length.  eval[i] is used as a divisor and is
-// not checked against zero.
-//
-// PAB: faster to compute the inner products first then fuse loops.
-// If performance critical can improve.
-template<class Field>
-void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
-  result = Zero();
-  assert(_v.size()==eval.size());
-  int N = (int)_v.size();
-  for (int i=0;i<N;i++) {
-    // _v is a const vector, so its elements can only bind to a const
-    // reference; a plain Field& here does not compile once instantiated.
-    const Field& tmp = _v[i];
-    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
-  }
-}
-
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////