Block solver complete for staggered. Now stable on mass 0.003 and

gives 8x (!) speed up on Haswell laptop vs. standard CG for 8 RHS solves. 166 iterations vs. 537 iterations so algorithmic gain + 2x in flop rate gain. Better than a slap in the face with a wet kipper.
2025-11-03 13:34:33 +00:00 · 2017-06-20 12:37:41 +01:00
parent 0a8faac271
commit e9cc21900f
3 changed files with 321 additions and 222 deletions
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -369,71 +369,6 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };

-
-/*
-template<class vobj>
-static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
-			     int Orthog,RealD scale=1.0) 
-{    
-  // FIXME: Implementation is slow
-  // Best base the linear combination by constructing a 
-  // set of vectors of size grid->_rdimensions[Orthog].
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  
-  int Nblock = X._grid->GlobalDimensions()[Orthog];
-  
-  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
-  // If we based this on Cshift it would work for spread out
-  // but it would be even slower
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    ExtractSlice(Xslice,X,i,Orthog);
-    Rslice = Rslice + Xslice*(scale*a[i]);
-    InsertSlice(Rslice,R,i,Orthog);
-  }
-};
-template<class vobj>
-static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
-  {
-    // FIXME: Implementation is slow
-    // Look at localInnerProduct implementation,
-    // and do inside a site loop with block strided iterators
-    typedef typename vobj::scalar_object sobj;
-    typedef typename vobj::scalar_type scalar_type;
-    typedef typename vobj::vector_type vector_type;
-    typedef typename vobj::tensor_reduced scalar;
-    typedef typename scalar::scalar_object  scomplex;
-  
-    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
-    vec.resize(Nblock);
-    std::vector<scomplex> sip(Nblock);
-    Lattice<scalar> IP(lhs._grid); 
-    IP=localInnerProduct(lhs,rhs);
-    sliceSum(IP,sip,Orthog);
-  
-    for(int ss=0;ss<Nblock;ss++){
-      vec[ss] = TensorRemove(sip[ss]);
-    }
-  }
-*/
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// FIXME: Implementation is slow
-// If we based this on Cshift it would work for spread out
-// but it would be even slower
-//
-// Repeated extract slice is inefficient
-//
-// Best base the linear combination by constructing a 
-// set of vectors of size grid->_rdimensions[Orthog].
-//////////////////////////////////////////////////////////////////////////////////////////
-
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
@@ -453,7 +388,6 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }

-
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
@@ -469,64 +403,10 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  Lattice<vobj> Xslice(SliceGrid);
  Lattice<vobj> Rslice(SliceGrid);

-#if 0
-  // R[i] = Y[i] + X[j] a(j,i) 
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Xslice,X,j,Orthog);
-      Rslice = Rslice + Xslice*(scale*aa(j,i));
-    }
-    InsertSlice(Rslice,R,i,Orthog);
-  }
-#endif
-#if 0
-  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
-
-#pragma omp parallel 
-{ 
-
-  std::vector<int> lcoor(nl); // sliced coor
-  std::vector<int> hcoor(nh); // unsliced coor
-  std::vector<sobj> s_x(Nblock);
-
-#pragma omp for
-  for(int idx=0;idx<SliceGrid->lSites();idx++){
-
-    SliceGrid->LocalIndexToLocalCoor(idx,lcoor); 
-
-    int ddl=0;
-    for(int d=0;d<nh;d++){
-      if ( d!=Orthog ) { 
-	hcoor[d]=lcoor[ddl++];
-      }
-    }
-
-    sobj dot;
-    for(int i=0;i<Nblock;i++){
-      hcoor[Orthog] = i;
-      peekLocalSite(s_x[i],X,hcoor);
-    }
-
-    for(int i=0;i<Nblock;i++){
-      hcoor[Orthog] = i;
-      peekLocalSite(dot,Y,hcoor);
-      for(int j=0;j<Nblock;j++){
-	dot = dot + s_x[j]*(scale*aa(j,i));
-      }
-      pokeLocalSite(dot,R,hcoor);
-    }
-  }
-}
-#endif
-
-#if 1
  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
  int nl = SliceGrid->_ndimension;

-
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
@@ -535,7 +415,6 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  int ostride=FullGrid->_ostride[Orthog];
 #pragma omp parallel 
  {
-
    std::vector<vobj> s_x(Nblock);

 #pragma omp for collapse(2)
@@ -543,13 +422,11 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
    for(int b=0;b<block;b++){
      int o  = n*stride + b;

-
      for(int i=0;i<Nblock;i++){
 	s_x[i] = X[o+i*ostride];
      }

      vobj dot;
-
      for(int i=0;i<Nblock;i++){
 	dot = Y[o+i*ostride];
 	for(int j=0;j<Nblock;j++){
@@ -559,15 +436,63 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
      }
    }}
  }
-#endif
+};
+
+template<class vobj>
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  int Nblock = X._grid->GlobalDimensions()[Orthog];
+
+  GridBase *FullGrid  = X._grid;
+  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+
+  Lattice<vobj> Xslice(SliceGrid);
+  Lattice<vobj> Rslice(SliceGrid);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  int nh =  FullGrid->_ndimension;
+  int nl = SliceGrid->_ndimension;
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    std::vector<vobj> s_x(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b;
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride];
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	dot = s_x[0]*(scale*aa(0,i));
+	for(int j=1;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }}
+  }
+
 };


 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
-  // FIXME: Implementation is slow
-  // Not sure of best solution.. think about it
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -582,63 +507,6 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
  
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);

-#if 0  
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Lslice,lhs,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Rslice,rhs,j,Orthog);
-      mat(i,j) = innerProduct(Lslice,Rslice);
-    }
-  }
-#endif
-
-#if 0
-  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
-
-#pragma omp parallel 
-{ 
-  std::vector<int> lcoor(nl); // sliced coor
-  std::vector<int> hcoor(nh); // unsliced coor
-  std::vector<sobj> Left(Nblock);
-  std::vector<sobj> Right(Nblock);
-  Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-#pragma omp for
-  for(int idx=0;idx<SliceGrid->lSites();idx++){
-
-    SliceGrid->LocalIndexToLocalCoor(idx,lcoor); 
-
-    int ddl=0;
-    for(int d=0;d<nh;d++){
-      if ( d!=Orthog ) { 
-	hcoor[d]=lcoor[ddl++];
-      }
-    }
-
-    // Get the scalar objects
-    for(int i=0;i<Nblock;i++){
-      hcoor[Orthog] = i;
-      peekLocalSite(Left[i] ,lhs,hcoor);
-      peekLocalSite(Right[i],rhs,hcoor);
-    }
-
-    for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
-      std::complex<double> ip = innerProduct(Left[i],Right[j]);
-      mat_thread(i,j) += ip;
-    }}
-  }
-
-#pragma omp critical
-  {
-    mat += mat_thread;
-  }  
-
-}
-#endif
-
-#if 1
  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
  int nl = SliceGrid->_ndimension;
@@ -681,7 +549,6 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
      mat += mat_thread;
    }  
  }
-#endif
  return;
 }