Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding

Fixed const correctness on PaddedCell methods Fixed compile issues on Crusher Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance Optimized kernel for StaplePadded Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings
2024-11-09 23:45:36 +00:00 · 2023-06-27 14:58:10 -04:00 · 2023-06-27 14:58:10 -04:00 · f44dce390f
commit f44dce390f
parent bb71e9a96a
4 changed files with 245 additions and 62 deletions
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -697,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
-
  // the above should guarantee that the operations are local
+  
+#if 1
+
+  size_t nsite = 1;
+  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
+  
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+ 
+  thread_for(idx, nsite, {
+      Coordinate from_coor, to_coor;
+      size_t rem = idx;
+      for(int i=0;i<nd;i++){
+	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
+	from_coor[i] = base_i + FromLowerLeft[i];
+	to_coor[i] = base_i + ToLowerLeft[i];
+      }
+      
+      int foidx = Fg->oIndex(from_coor);
+      int fiidx = Fg->iIndex(from_coor);
+      int toidx = Tg->oIndex(to_coor);
+      int tiidx = Tg->iIndex(to_coor);
+      int* tt = table + 4*idx;
+      tt[0] = foidx;
+      tt[1] = fiidx;
+      tt[2] = toidx;
+      tt[3] = tiidx;
+    });
+  
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(from_v,From,AcceleratorRead);
+  autoView(to_v,To,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&from_v[from_oidx];
+      vector_type* to = (vector_type *)&to_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+
+#else  
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
@ -738,6 +798,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 #endif
    }
  });
+
+#endif
 }


@ -830,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 }


+//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
+//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@ -851,6 +915,65 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    }
  }

+#if 1
+  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
+  size_t tbytes = 4*nsite*sizeof(int);
+  int *table = (int*)malloc(tbytes);
+  
+  thread_for(idx,nsite,{
+    Coordinate lcoor(nl);
+    Coordinate hcoor(nh);
+    lcoor[orthog] = slice_lo;
+    hcoor[orthog] = slice_hi;
+    size_t rem = idx;
+    for(int mu=0;mu<nl;mu++){
+      if(mu != orthog){
+	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
+	lcoor[mu] = hcoor[mu] = xmu;
+      }
+    }
+    int loidx = lg->oIndex(lcoor);
+    int liidx = lg->iIndex(lcoor);
+    int hoidx = hg->oIndex(hcoor);
+    int hiidx = hg->iIndex(hcoor);
+    int* tt = table + 4*idx;
+    tt[0] = loidx;
+    tt[1] = liidx;
+    tt[2] = hoidx;
+    tt[3] = hiidx;
+    });
+   
+  int* table_d = (int*)acceleratorAllocDevice(tbytes);
+  acceleratorCopyToDevice(table,table_d,tbytes);
+
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  autoView(lowDim_v,lowDim,AcceleratorRead);
+  autoView(higherDim_v,higherDim,AcceleratorWrite);
+  
+  accelerator_for(idx,nsite,1,{
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+      int* tt = table_d + 4*idx;
+      int from_oidx = *tt++;
+      int from_lane = *tt++;
+      int to_oidx = *tt++;
+      int to_lane = *tt;
+
+      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
+      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
+      
+      scalar_type stmp;
+      for(int w=0;w<words;w++){
+	stmp = getlane(from[w], from_lane);
+	putlane(to[w], stmp, to_lane);
+      }
+    });
+  
+  acceleratorFreeDevice(table_d);    
+  free(table);
+  
+#else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@ -866,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
+#endif
 }


--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -95,7 +95,7 @@ public:
    }
  };
  template<class vobj>
-  inline Lattice<vobj> Extract(const Lattice<vobj> &in)
+  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
  {
    Lattice<vobj> out(unpadded_grid);

@ -106,7 +106,7 @@ public:
    return out;
  }
  template<class vobj>
-  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>())
+  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
@ -118,7 +118,7 @@ public:
  }
  // expand up one dim at a time
  template<class vobj>
-  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>())
+  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
@ -130,20 +130,40 @@ public:
    else       conformable(old_grid,grids[dim-1]);

    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
+
+    double tins=0, tshift=0;
+    
    // Middle bit
+    double t = usecond();
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
+    tins += usecond() - t;
+    
    // High bit
+    t = usecond();
    shifted = cshift.Cshift(in,dim,depth);
+    tshift += usecond() - t;
+
+    t=usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
+    tins += usecond() - t;
+    
    // Low bit
+    t = usecond();
    shifted = cshift.Cshift(in,dim,-depth);
+    tshift += usecond() - t;
+    
+    t = usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
+    tins += usecond() - t;
+
+    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
+    
    return padded;
  }

--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -359,6 +359,7 @@ public:
    assert(Cell.depth >= 1);
    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid

+    double t0 = usecond();
    //Generate shift arrays
    std::vector<Coordinate> shifts;
    for(int mu=0;mu<Nd;mu++){
@ -382,54 +383,73 @@ public:
 	}
      }
    }
+    int shift_mu_off = shifts.size()/Nd;
+    
+    double t1 = usecond();
    //Generate local stencil
    GeneralLocalStencil gStencil(ggrid,shifts);
+
+    double t2 = usecond();
+    
+    //Open views to padded gauge links and keep open over mu loop
+    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
+    size_t vsize = Nd*sizeof(GaugeViewType);
+    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
+    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
+    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
+    
    GaugeMat gStaple(ggrid);

-    int off = 0;
+    int outer_off = 0;
    for(int mu=0;mu<Nd;mu++){
-      gStaple = Zero();
-
-      for(int nu=0;nu<Nd;nu++){
-	if(nu != mu){	  
-	  {
-	    autoView( rgStaple_v , gStaple, AcceleratorRead);
-	    autoView( gStaple_v , gStaple, AcceleratorWrite);
-	    auto gStencil_v = gStencil.View();
-	    autoView( Ug_mu_v , U_padded[mu], AcceleratorRead);
-	    autoView( Ug_nu_v , U_padded[nu], AcceleratorRead);
-  
-	    accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
-		auto stencil_ss = coalescedRead(rgStaple_v[ss]);
-
-		GeneralStencilEntry const* e = gStencil_v.GetEntry(off,ss);
-		auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(off+1,ss);
-		auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(off+2,ss);
-		auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
+      { //view scope
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
+	auto gStencil_v = gStencil.View();
+	
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
+	    stencil_ss = Zero();
+	    int off = outer_off;
+	    
+	    for(int nu=0;nu<Nd;nu++){
+	      if(nu != mu){	  
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
      
-		stencil_ss = stencil_ss + U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;
+		stencil_ss = stencil_ss + U2 * U1 * U0;

-		e = gStencil_v.GetEntry(off+3,ss);
-		auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(off+4,ss);
-		auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(off+5,ss);
-		auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
+		e = gStencil_v.GetEntry(off++,ss);
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));

-		stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;
-      
-		coalescedWrite(gStaple_v[ss],stencil_ss);
+		stencil_ss = stencil_ss + U2 * U1 * U0;
 	      }
-	      );
-	  } //ensure views are all closed!
-	  off += 6;
-	}//nu!=mu
-      }//nu loop
+	    }
+		
+	    coalescedWrite(gStaple_v[ss],stencil_ss);
+	  }
+	  );
+      } //ensure views are all closed!
+      
      staple[mu] = Cell.Extract(gStaple);
+      outer_off += shift_mu_off;
    }//mu loop

+    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
+    free(Ug_dirs_v_host);
+    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t3=usecond();
+    
+    std::cout << GridLogPerformance << "StaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl;   
  }

   
@ -805,7 +825,7 @@ public:
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * CshiftLink(U, mu, 1);
+    U2 = U * Gimpl::CshiftLink(U, mu, 1);
  }

  ////////////////////////////////////////////////////////////////////////////
@ -826,9 +846,9 @@ public:

        // Up staple    ___ ___
        //             |       |
-        tmp = CshiftLink(adj(U[nu]), nu, -1);
+        tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
-        tmp = CshiftLink(tmp, mu, -2);
+        tmp = Gimpl::CshiftLink(tmp, mu, -2);

        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);

@ -836,14 +856,14 @@ public:
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
-        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, CshiftLink(tmp, mu, -2));
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));

        //              ___ ___
        //             |    ___|
        //             |___ ___|
        //

-        Stap += CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
+        Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);

        //              ___ ___
        //             |___    |
@ -852,7 +872,7 @@ public:

        //  tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //  Stap+= Cshift(tmp,mu,1) ;
-        Stap += CshiftLink(Staple2x1, mu, 1) * CshiftLink(U[mu], mu, -1);
+        Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
        ;

        //       --
@ -860,10 +880,10 @@ public:
        //
        //      |  |

-        tmp = CshiftLink(adj(U2[nu]), nu, -2);
+        tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-        tmp = U2[nu] * CshiftLink(tmp, nu, 2);
-        Stap += CshiftLink(tmp, mu, 1);
+        tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);

        //      |  |
        //
@ -872,8 +892,8 @@ public:

        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
        tmp = adj(U2[nu]) * tmp;
-        tmp = CshiftLink(tmp, nu, -2);
-        Stap += CshiftLink(tmp, mu, 1);
+        tmp = Gimpl::CshiftLink(tmp, nu, -2);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);
      }
    }
  }
@ -1006,6 +1026,7 @@ public:
    assert(Cell.depth >= 2);
    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid

+    double t0 = usecond();
    std::vector<Coordinate> shifts;
    for (int mu = 0; mu < Nd; mu++){
      for (int nu = 0; nu < Nd; nu++) {
@ -1060,8 +1081,11 @@ public:
    }
    size_t nshift = shifts.size();
    int mu_off_delta = nshift / Nd;
+    double t1 = usecond();
+    
    GeneralLocalStencil gStencil(ggrid,shifts);
-
+    double t2 = usecond();
+    
    //Open views to padded gauge links and keep open over mu loop
    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
    size_t vsize = Nd*sizeof(GaugeViewType);
@ -1183,6 +1207,10 @@ public:
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
    free(Ug_dirs_v_host);
    acceleratorFreeDevice(Ug_dirs_v);
+    
+    double t3 = usecond();
+    
+    std::cout << GridLogPerformance << "RectStaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl;   
  }


@ -1198,13 +1226,18 @@ public:
    StapleAll(Stap, U);
    RectStapleAll(RectStap, U);
 #else
+    double t0 = usecond();
    //Use the padded cell with maximal reuse
    PaddedCell Ghost(2, dynamic_cast<GridCartesian*>(U[0].Grid()));
    CshiftImplGauge<Gimpl> cshift_impl;
    std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
    for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
+    double t1 = usecond();
    StaplePaddedAll(Stap, U_pad, Ghost);
+    double t2 = usecond();
    RectStaplePaddedAll(RectStap, U_pad, Ghost);
+    double t3 = usecond();
+    std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
 #endif
  }

--- a/tests/debug/Test_iwasaki_action_newstaple.cc
+++ b/tests/debug/Test_iwasaki_action_newstaple.cc
@ -165,18 +165,24 @@ int main (int argc, char ** argv)
  IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
  IwasakiGaugeAction<Gimpl> action_new(beta);

-  double t0 = usecond();
-  action_orig.deriv(U, derivOrig);
-  double t1 = usecond();
-  action_new.deriv(U, derivNew);
-  double t2 = usecond();
+  double torig=0, tnew=0;
+  int ntest = 10;
+  for(int i=0;i<ntest;i++){
+    double t0 = usecond();
+    action_orig.deriv(U, derivOrig);
+    double t1 = usecond();
+    action_new.deriv(U, derivNew);
+    double t2 = usecond();

-  GaugeLorentz diff = derivOrig - derivNew;
-  double n = norm2(diff);
-  std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
-  assert(n<1e-10);
+    GaugeLorentz diff = derivOrig - derivNew;
+    double n = norm2(diff);
+    std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
+    assert(n<1e-10);

-  std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
+    std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
+    torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
+  }
+  std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms,   new:" << tnew/ntest << "ms" << std::endl;
  
  Grid_finalize();
 }