Don't need the numerical rounding tolerance in multigrid

Synchronise changes
Speed up the coarsened matrix matrix evaluation.
2025-07-17 21:27:07 +01:00 · 2023-12-22 18:10:23 -05:00 · 2023-12-22 18:09:11 -05:00 · 2023-12-22 18:07:03 -05:00 · 2023-12-22 18:06:13 -05:00 · 2023-12-22 18:05:41 -05:00
5 changed files with 112 additions and 60 deletions
--- a/Grid/algorithms/multigrid/BatchedBlas.h
+++ b/Grid/algorithms/multigrid/BatchedBlas.h
@@ -204,13 +204,12 @@ public:
      }
    }
 #endif
-     synchronise();
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
-     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }

  void gemmBatched(int m,int n, int k,
@@ -279,13 +278,12 @@ public:
      }
    }
 #endif
-     synchronise();
     RealD t1=usecond();
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
-     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  
  ///////////////////////////////////////////////////////////////////////////
@@ -357,13 +355,12 @@ public:
      }
    }
 #endif
-     synchronise();
     RealD t1=usecond();
-     RealD flops = 8.0*m*n*k*batchCount;
+     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
-     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  
  
@@ -452,13 +449,12 @@ public:
      }
    }
 #endif
-     synchronise();
     RealD t1=usecond();
-     RealD flops = 8.0*m*n*k*batchCount;
+     RealD flops = 2.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
-     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
-     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
+     //     std::cout <<GridLogPerformance<< " batched Blas call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
  }
  

--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -50,6 +50,7 @@ public:
  typedef iVector<CComplex,nbasis >  Cvec;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;
+  typedef Lattice<CComplex >    FineComplexField;
  typedef CoarseVector Field;
  ////////////////////
  // Data members
@@ -308,6 +309,7 @@ public:
    RealD teigen=0.0;
    RealD tmat=0.0;
    RealD tphase=0.0;
+    RealD tphaseBZ=0.0;
    RealD tinv=0.0;

    /////////////////////////////////////////////////////////////
@@ -362,28 +364,41 @@ public:
    ///////////////////////////////////////////////////////////////////////
    FineField phaV(grid); // Phased block basis vector
    FineField MphaV(grid);// Matrix applied
+    std::vector<FineComplexField> phaF(npoint,grid);
+    std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
+    
    CoarseVector coarseInner(CoarseGrid());
+    
+    typedef typename CComplex::scalar_type SComplex;
+    FineComplexField one(grid); one=SComplex(1.0);
+    FineComplexField zz(grid); zz = Zero();
+    tphase=-usecond();
+    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+      /////////////////////////////////////////////////////
+      // Stick a phase on every block
+      /////////////////////////////////////////////////////
+      CoarseComplexField coor(CoarseGrid());
+      pha[p]=Zero();
+      for(int mu=0;mu<Nd;mu++){
+	LatticeCoordinate(coor,mu);
+	RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
+      }
+      pha[p]  =exp(pha[p]*ci);

+      blockZAXPY(phaF[p],pha[p],one,zz);
+      
+    }
+    tphase+=usecond();
+    
    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
    for(int i=0;i<nbasis;i++){// Loop over basis vectors
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
-	/////////////////////////////////////////////////////
-	// Stick a phase on every block
-	/////////////////////////////////////////////////////
-	tphase-=usecond();
-	CoarseComplexField coor(CoarseGrid());
-	CoarseComplexField pha(CoarseGrid());	pha=Zero();
-	for(int mu=0;mu<Nd;mu++){
-	  LatticeCoordinate(coor,mu);
-	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
-	  pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
-	}
-	pha  =exp(pha*ci);
-	phaV=Zero();
-	blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
-	tphase+=usecond();
+	tphaseBZ-=usecond();
+	phaV = phaF[p]*Subspace.subspace[i];
+	tphaseBZ+=usecond();

 	/////////////////////////////////////////////////////////////////////
 	// Multiple phased subspace vector by matrix and project to subspace
@@ -394,8 +409,8 @@ public:
 	tmat+=usecond();

 	tproj-=usecond();
-	blockProject(coarseInner,MphaV,Subspace.subspace);
-	coarseInner = conjugate(pha) * coarseInner;
+	blockProjectFast(coarseInner,MphaV,Subspace.subspace);
+	coarseInner = conjugate(pha[p]) * coarseInner;

 	ComputeProj[p] = coarseInner;
 	tproj+=usecond();
@@ -431,6 +446,7 @@ public:
    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator mat    "<<tmat <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator proj   "<<tproj<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -368,6 +368,7 @@ public:
 		       ComplexD(c), 
 		       BLAS_CP);
    }
+    BLAS.synchronise();
    t_mult+=usecond();
    //    std::cout << GridLogMessage << "New Mrhs coarse BLAStoGrid "<<std::endl;
    t_BtoG=-usecond();
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -301,6 +301,37 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
  //  std::cout << GridLogPerformance << " blockProject : blockZaxpy        :  "<<t_za<<" us"<<std::endl;
 }

+
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
+			     const             Lattice<vobj>   &fineData,
+			     const VLattice &Basis)
+{
+  GridBase * fine  = fineData.Grid();
+  GridBase * coarse= coarseData.Grid();
+
+  Lattice<iScalar<CComplex>> ip(coarse);
+  Lattice<vobj>     fineDataRed = fineData;
+
+  autoView( coarseData_ , coarseData, AcceleratorWrite);
+  autoView( ip_         , ip,         AcceleratorWrite);
+  RealD t_IP=0;
+  RealD t_co=0;
+  for(int v=0;v<nbasis;v++) {
+    t_IP-=usecond();
+    blockInnerProductD(ip,Basis[v],fineData); // ip = <basis|fine>
+    t_IP+=usecond();
+    t_co-=usecond();
+    accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+	convertType(coarseData_[sc](v),ip_[sc]);
+    });
+    t_co+=usecond();
+  }
+  //  std::cout << GridLogPerformance << " blockProjectFast : blockInnerProduct :  "<<t_IP<<" us"<<std::endl;
+  //  std::cout << GridLogPerformance << " blockProjectFast : conv              :  "<<t_co<<" us"<<std::endl;
+}
+
+
 // This only minimises data motion from CPU to GPU
 // there is chance of better implementation that does a vxk loop of inner products to data share
 // at the GPU thread level
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -62,6 +62,8 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
 {
  const int Nsimd=vobj::Nsimd();
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;

  GridBase *grid = lat.Grid();
  Coordinate simd = grid->_simd_layout;
@@ -124,8 +126,16 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
 	///////////////////////////////////////////
 	// Transfer into lattice - will coalesce
 	///////////////////////////////////////////
-	sobj obj = extractLane(blane,buf_p[ss+offset]);
-	insertLane(lane,lat_v[osite],obj);
+	//	sobj obj = extractLane(blane,buf_p[ss+offset]);
+	//	insertLane(lane,lat_v[osite],obj);
+	const int words=sizeof(vobj)/sizeof(vector_type);
+	vector_type * from = (vector_type *)&buf_p[ss+offset];
+	vector_type * to   = (vector_type *)&lat_v[osite];
+	scalar_type stmp;
+	for(int w=0;w<words;w++){
+	  stmp = getlane(from[w], blane);
+	  putlane(to[w], stmp, lane);
+	}
      }
  });
 }
@@ -138,6 +148,8 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
 {
  const int Nsimd=vobj::Nsimd();
  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;

  autoView(lat_v, lat, AcceleratorRead);

@@ -200,23 +212,18 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
 	///////////////////////////////////////////
 	// Take out of lattice
 	///////////////////////////////////////////
-
-	sobj obj = extractLane(lane,lat_v[osite]);
-	insertLane(blane,buf_p[ss+offset],obj);
+	//	sobj obj = extractLane(lane,lat_v[osite]);
+	//	insertLane(blane,buf_p[ss+offset],obj);
+	const int words=sizeof(vobj)/sizeof(vector_type);
+	vector_type * to    = (vector_type *)&buf_p[ss+offset];
+	vector_type * from  = (vector_type *)&lat_v[osite];
+	scalar_type stmp;
+	for(int w=0;w<words;w++){
+	  stmp = getlane(from[w], lane);
+	  putlane(to[w], stmp, blane);
+	}
      }
  });
-  /*
-  int words =block*nblock/simd[dim];
-  std::vector<vobj> tbuf(words);
-  acceleratorCopyFromDevice((void *)&buf[offset],(void *)&tbuf[0],words*sizeof(vobj));
-  typedef typename vobj::scalar_type scalar;
-  scalar *sbuf = (scalar *)&tbuf[0];
-  scalar tmp=0.0;
-  for(int w=0;w<words*sizeof(vobj)/sizeof(scalar);w++){
-    tmp=tmp+conjugate(sbuf[w])*sbuf[w];
-  }
-  std::cout << " Gathered buffer norm "<<tmp<<std::endl;
-  */
 }


@@ -545,14 +552,15 @@ public:
    t_scatter+= usecond() - t;
    t_tot+=usecond();

-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000   << "ms"<<std::endl;
-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: total  :" << t_tot/1000     << "ms"<<std::endl;
-    std::cout << GridLogDebug << "PaddedCell::Expand new timings: comms  :" << (RealD)4.0*bytes/t_comms   << "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000   << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total  :" << t_tot/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << (RealD)4.0*bytes/t_comms   << "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes  :" << depth*bytes/1e6 << "MB"<<std::endl;
  }
  
 };
Author	SHA1	Message	Date
Peter Boyle	b7c7000d0d	Don't need the numerical rounding tolerance in multigrid	2023-12-22 18:10:23 -05:00
Peter Boyle	551f6c4edd	Synchronise changes	2023-12-22 18:09:11 -05:00
Peter Boyle	defd814750	Speed up the coarsened matrix matrix evaluation. It is block project limited. Could be sped up with calls to Batched GEMM and a data layout change.	2023-12-22 18:07:03 -05:00
Peter Boyle	3d517bbd2a	Synchronise decouple from the launch. Speeds up multileg stencils	2023-12-22 18:06:13 -05:00
Peter Boyle	78ab955fec	Better padded cell exchange	2023-12-22 18:05:41 -05:00
Peter Boyle	dd13937bb6	Better opt face gather scatter	2023-12-22 18:03:38 -05:00