MultiRHS work

2026-05-25 19:44:17 +01:00 · 2023-11-28 07:43:37 -05:00
parent 59abaeb5cd
commit 0a3682ad0b
7 changed files with 136 additions and 25 deletions
@@ -62,6 +62,7 @@ public:
  
  std::vector<deviceVector<calcMatrix> > _A;
  std::vector<CoarseVector> MultTemporaries;
+  deviceVector<GeneralStencilEntryReordered> StencilMasked;

  ///////////////////////
  // Interface
@@ -78,9 +79,40 @@ public:
    Stencil(Cell.grids.back(),geom.shifts)
  {
    _A.resize(geom.npoint);
+    int32_t padded_sites = _Op._A[0].Grid()->lSites();
    for(int p=0;p<geom.npoint;p++){
-      _A[p].resize(_CoarseGrid->lSites());
+      _A[p].resize(padded_sites);
    }
+    std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <<std::endl;
+
+    StencilMasked.resize(_CoarseGridMulti->oSites());
+    std::vector<GeneralStencilEntryReordered> StencilTmp;
+
+    int32_t j=0;
+    int32_t sites = Stencil._entries.size()/geom.npoint;
+    for(int32_t s=0;s<sites;s++){
+      int ghost_zone=0;
+      for(int32_t point = 0 ; point < geom.npoint; point++){
+	int i=s*geom.npoint+point;
+	if( Stencil._entries[i]._permute ) {
+	  ghost_zone=1;
+	}
+      }
+      GeneralStencilEntryReordered tmp;
+      if( ghost_zone==0) {
+	for(int32_t point = 0 ; point < geom.npoint; point++){
+	  int i=s*geom.npoint+point;
+	  tmp._offset = Stencil._entries[i]._offset;
+	  tmp._permute= Stencil._entries[i]._permute;
+	  tmp._output = j;
+	  StencilTmp.push_back(tmp);
+	}
+	j++;
+      }
+    }
+    std::cout << "coarse osites x npoint "<<_CoarseGridMulti->oSites()*geom.npoint<< " stencil interior size "<< StencilTmp.size()<<std::endl;
+    assert(_CoarseGridMulti->lSites()*geom.npoint==StencilTmp.size());
+    acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
    CopyMatrix();
  }
  void CopyMatrix (void)
@@ -100,12 +132,18 @@ public:
  }
  void M (const CoarseVector &in, CoarseVector &out)
  {
+    RealD tviews=0;    RealD ttot=0;    RealD tmult=0;   RealD texch=0;    RealD text=0; RealD ttemps=0; RealD tcopy=0;
+    RealD tmult2=0;
+
+    ttot=-usecond();
    conformable(CoarseGrid(),in.Grid());
    conformable(in.Grid(),out.Grid());
    out.Checkerboard() = in.Checkerboard();
    CoarseVector tin=in;

+    texch-=usecond();
    CoarseVector pin = Cell.ExchangePeriodic(tin);
+    texch+=usecond();
    CoarseVector pout(pin.Grid());

    int npoint = geom.npoint;
@@ -116,22 +154,33 @@ public:
    
    int64_t osites=pin.Grid()->oSites();
    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0]/Nsimd;
+    assert(nrhs>=1);

+    RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
+    RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+                + 2.0*osites*sizeof(siteVector)*npoint;
+    
    {
+      tviews-=usecond();
      autoView( in_v , pin, AcceleratorRead);
      autoView( out_v , pout, AcceleratorWriteDiscard);
      autoView( Stencil_v  , Stencil, AcceleratorRead);
+      tviews+=usecond();

      // Static and prereserve to keep UVM region live and not resized across multiple calls
+      ttemps-=usecond();
      MultTemporaries.resize(npoint,pin.Grid());       
+      ttemps+=usecond();

      std::vector<Aview> AcceleratorViewContainer_h;
      std::vector<Vview> AcceleratorVecViewContainer_h; 

+      tviews-=usecond();
      for(int p=0;p<npoint;p++) {
 	AcceleratorViewContainer_h.push_back( &_A[p][0]);
 	AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
      }
+      tviews+=usecond();

      static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); 
@@ -139,15 +188,23 @@ public:
      auto Aview_p = &AcceleratorViewContainer[0];
      auto Vview_p = &AcceleratorVecViewContainer[0];

+      tcopy-=usecond();
      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
+      tcopy+=usecond();

+      int32_t bound = _A[0].size();
+      std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
+      std::cout << " padded local dims   "<<pin.Grid()->LocalDimensions()<<std::endl;
+      std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
+      tmult-=usecond();
      accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
 	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
 	  int32_t ss   = rspb/(nbasis*npoint);
 	  int32_t bp   = rspb%(nbasis*npoint);
 	  int32_t point= bp/nbasis;
 	  int32_t b    = bp%nbasis;
+	  assert(ss<bound);
 	  auto SE  = Stencil_v.GetEntry(point,ss);
 	  if ( SE->_permute == 0 ) { 
 	    int32_t snbr= SE->_offset;
@@ -159,6 +216,7 @@ public:
 	    coalescedWrite(Vview_p[point][ss](b),res);
 	  }
      });
+      tmult2-=usecond();
      accelerator_for(sb, osites*nbasis, Nsimd, {
 	  int ss = sb/nbasis;
 	  int b  = sb%nbasis;
@@ -168,13 +226,32 @@ public:
 	  }
 	  coalescedWrite(out_v[ss](b),res);
      });
+      tmult2+=usecond();
+      tmult+=usecond();
      for(int p=0;p<npoint;p++) {
 	AcceleratorVecViewContainer_h[p].ViewClose();
      }
    }

+    text-=usecond();
    out = Cell.Extract(pout);
+    text+=usecond();
+    ttot+=usecond();

+    std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogMessage<<" of which mult2  "<<tmult2<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
+    //    std::cout << GridLogMessage<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+    
  };
  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);}; // Not provided by this operator: the body is only assert(0), so a call trips the assertion when assertions are enabled.
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Not provided by this operator: the body is only assert(0); in, out, dir and disp are ignored.
@@ -745,8 +745,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

-  static const int words=sizeof(vobj)/sizeof(vector_type);
-
  GridBase *Fg = From.Grid();
  GridBase *Tg = To.Grid();
  assert(!Fg->_isCheckerBoarded);
@@ -763,13 +761,14 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  // the above should guarantee that the operations are local
  
 #if 1
-
  size_t nsite = 1;
  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
  
  size_t tbytes = 4*nsite*sizeof(int);
  int *table = (int*)malloc(tbytes);
- 
+
+  RealD t_cpu=-usecond();
+#if 0
  thread_for(idx, nsite, {
      Coordinate from_coor, to_coor;
      size_t rem = idx;
@@ -792,15 +791,44 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  
  int* table_d = (int*)acceleratorAllocDevice(tbytes);
  acceleratorCopyToDevice(table,table_d,tbytes);
+#else
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  Coordinate f_ostride = Fg->_ostride;
+  Coordinate f_istride = Fg->_istride;
+  Coordinate f_rdimensions = Fg->_rdimensions;
+  Coordinate t_ostride = Tg->_ostride;
+  Coordinate t_istride = Tg->_istride;
+  Coordinate t_rdimensions = Tg->_rdimensions;
+
+  accelerator_for(idx, nsite, 1, {
+      Coordinate from_coor, to_coor;
+      size_t rem = idx;
+      for(int i=0;i<nd;i++){
+	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
+	from_coor[i] = base_i + FromLowerLeft[i];
+	to_coor[i] = base_i + ToLowerLeft[i];
+      }
+      int foidx = 0; for(int d=0;d<nd;d++) foidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
+      int fiidx = 0; for(int d=0;d<nd;d++) fiidx+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
+      int toidx = 0; for(int d=0;d<nd;d++) toidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
+      int tiidx = 0; for(int d=0;d<nd;d++) tiidx+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
+      int* tt = table_d + 4*idx;
+      tt[0] = foidx;
+      tt[1] = fiidx;
+      tt[2] = toidx;
+      tt[3] = tiidx;
+    });
+#endif
+  t_cpu+=usecond();

  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);
-  
-  accelerator_for(idx,nsite,1,{
-      static const int words=sizeof(vobj)/sizeof(vector_type);
+  RealD t_acc=-usecond();
+  const int words=sizeof(vobj)/sizeof(vector_type);
+  accelerator_for(idx,nsite,words,{
      int* tt = table_d + 4*idx;
      int from_oidx = *tt++;
      int from_lane = *tt++;
@@ -811,12 +839,20 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
      vector_type* to = (vector_type *)&to_v[to_oidx];
      
      scalar_type stmp;
+#ifdef GRID_SIMT
+      int w = acceleratorSIMTlane(words);
+      stmp = getlane(from[w], from_lane);
+      putlane(to[w], stmp, to_lane);
+#else
      for(int w=0;w<words;w++){
 	stmp = getlane(from[w], from_lane);
 	putlane(to[w], stmp, to_lane);
      }
+#endif
    });
-  
+  t_acc+=usecond();
+  std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
+  std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
  acceleratorFreeDevice(table_d);    
  free(table);
  
@@ -403,18 +403,8 @@ public:
      double t = usecond();
      padded = in;
      tins += usecond() - t;
-      
+      // return in; ?
    } else {
-
-      //////////////////////////////////////////////
-      // Replace sequence with
-      // ---------------------
-      // (i) Gather high face(s); start comms
-      // (ii) Gather low  face(s); start comms
-      // (iii) Copy middle bit with localCopyRegion
-      // (iv) Complete high face(s), insert slice(s)
-      // (iv) Complete low  face(s), insert slice(s)
-      //////////////////////////////////////////////
      Face_exchange(in,padded,dim,depth);
    }
    return padded;
@@ -482,6 +472,7 @@ public:
    // Gather all surface terms up to depth "d"
    ////////////////////////////////////////////////////////////////////////////
    RealD t;
+    RealD t_tot=-usecond();
    int plane=0;
    for ( int d=0;d < depth ; d ++ ) {
      int tag = d*1024 + dimension*2+0;
@@ -549,6 +540,7 @@ public:
    }
    t_scatter+= usecond() - t;
    //    DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
+    t_tot+=usecond();

    //DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
@@ -557,6 +549,7 @@ public:
    //    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total  :" << t_tot/1000     << "ms"<<std::endl;
    //    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << (RealD)4.0*bytes/t_comms   << "MB/s"<<std::endl;
  }
  
@@ -90,7 +90,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {

  for (int i = 0; i < logstreams.size(); i++) {
    if (logstreams[i] == std::string("Tracing"))     GridLogTracing.Active(1);
-    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(1);
+    if (logstreams[i] == std::string("Memory"))      GridLogMemory.Active(0);
    if (logstreams[i] == std::string("Warning"))     GridLogWarning.Active(1);
    if (logstreams[i] == std::string("NoMessage"))   GridLogMessage.Active(0);
    if (logstreams[i] == std::string("Iterative"))   GridLogIterative.Active(1);
@@ -33,6 +33,10 @@ struct GeneralStencilEntry {
  uint64_t _offset;            // 8 bytes 
  uint8_t _permute;            // 1 byte  // Horrible alignment properties
 };
+struct GeneralStencilEntryReordered : public GeneralStencilEntry { // A GeneralStencilEntry (inherits _offset and _permute) carrying one extra index field.
+  uint64_t _output; // Extra 64-bit index stored alongside the inherited fields; NOTE(review): presumably the destination site this entry contributes to -- confirm against the code that fills it.
+};
+
 // Could pack to 8 + 4 + 4 = 128 bit and use 

 class GeneralLocalStencilView {
@@ -6,6 +6,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --enable-tracing=timer \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
+--enable-tracing=roctx \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
@@ -78,7 +78,7 @@ int main (int argc, char ** argv)
  // Construct a coarsened grid
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
-    clatt[d] = clatt[d]/2;
+    clatt[d] = clatt[d]/4;
  }

  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
@@ -107,7 +107,7 @@ int main (int argc, char ** argv)

  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

-  const int nbasis = 16;
+  const int nbasis = 32;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);

@@ -265,8 +265,8 @@ int main (int argc, char ** argv)
 	LittleDiracOp.M(phi,Aphi);
      }
      t1+=usecond();
-      std::cout << r << " mrhs " << norm2(chi)<<std::endl;
-      std::cout << r << " srhs " << norm2(Aphi)<<std::endl;
+      std::cout << " mrhs [" <<r <<"] "<< norm2(chi)<<std::endl;
+      std::cout << " srhs [" <<r <<"] "<< norm2(Aphi)<<std::endl;
      chi=chi-Aphi;
      std::cout << r << " diff " << norm2(chi)<<std::endl;
    }