Implemented accelerator-optimized versions of localCopyRegion and InsertSliceLocal to speed up padding

Fixed const correctness on PaddedCell methods. Fixed compile issues on Crusher. Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance. Optimized kernel for StaplePadded. Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings.
2026-02-02 13:23:29 +00:00 · 2023-06-27 14:58:10 -04:00
parent bb71e9a96a
commit f44dce390f
4 changed files with 245 additions and 62 deletions
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -697,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
  for(int d=0;d<nd;d++){
    assert(Fg->_processors[d]  == Tg->_processors[d]);
  }
  // the above should guarantee that the operations are local
 #if 1
  size_t nsite = 1;
  for(int i=0;i<nd;i++) nsite *= RegionSize[i];
  size_t tbytes = 4*nsite*sizeof(int);
  int *table = (int*)malloc(tbytes);
  thread_for(idx, nsite, {
      Coordinate from_coor, to_coor;
      size_t rem = idx;
      for(int i=0;i<nd;i++){
 	size_t base_i  = rem % RegionSize[i]; rem /= RegionSize[i];
 	from_coor[i] = base_i + FromLowerLeft[i];
 	to_coor[i] = base_i + ToLowerLeft[i];
      }
      int foidx = Fg->oIndex(from_coor);
      int fiidx = Fg->iIndex(from_coor);
      int toidx = Tg->oIndex(to_coor);
      int tiidx = Tg->iIndex(to_coor);
      int* tt = table + 4*idx;
      tt[0] = foidx;
      tt[1] = fiidx;
      tt[2] = toidx;
      tt[3] = tiidx;
    });
  int* table_d = (int*)acceleratorAllocDevice(tbytes);
  acceleratorCopyToDevice(table,table_d,tbytes);
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
  autoView(from_v,From,AcceleratorRead);
  autoView(to_v,To,AcceleratorWrite);
  accelerator_for(idx,nsite,1,{
      static const int words=sizeof(vobj)/sizeof(vector_type);
      int* tt = table_d + 4*idx;
      int from_oidx = *tt++;
      int from_lane = *tt++;
      int to_oidx = *tt++;
      int to_lane = *tt;
      const vector_type* from = (const vector_type *)&from_v[from_oidx];
      vector_type* to = (vector_type *)&to_v[to_oidx];
      scalar_type stmp;
      for(int w=0;w<words;w++){
 	stmp = getlane(from[w], from_lane);
 	putlane(to[w], stmp, to_lane);
      }
    });
  acceleratorFreeDevice(table_d);    
  free(table);
 #else  
  Coordinate ldf = Fg->_ldimensions;
  Coordinate rdf = Fg->_rdimensions;
  Coordinate isf = Fg->_istride;
@@ -738,6 +798,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
 #endif
    }
  });
 #endif
 }
@@ -830,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 }
 //Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
 //The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
 template<class vobj>
 void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
 {
@@ -851,6 +915,65 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    }
  }
 #if 1
  size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
  size_t tbytes = 4*nsite*sizeof(int);
  int *table = (int*)malloc(tbytes);
  thread_for(idx,nsite,{
    Coordinate lcoor(nl);
    Coordinate hcoor(nh);
    lcoor[orthog] = slice_lo;
    hcoor[orthog] = slice_hi;
    size_t rem = idx;
    for(int mu=0;mu<nl;mu++){
      if(mu != orthog){
 	int xmu = rem % lg->LocalDimensions()[mu];  rem /= lg->LocalDimensions()[mu];
 	lcoor[mu] = hcoor[mu] = xmu;
      }
    }
    int loidx = lg->oIndex(lcoor);
    int liidx = lg->iIndex(lcoor);
    int hoidx = hg->oIndex(hcoor);
    int hiidx = hg->iIndex(hcoor);
    int* tt = table + 4*idx;
    tt[0] = loidx;
    tt[1] = liidx;
    tt[2] = hoidx;
    tt[3] = hiidx;
    });
  int* table_d = (int*)acceleratorAllocDevice(tbytes);
  acceleratorCopyToDevice(table,table_d,tbytes);
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
  autoView(lowDim_v,lowDim,AcceleratorRead);
  autoView(higherDim_v,higherDim,AcceleratorWrite);
  accelerator_for(idx,nsite,1,{
      static const int words=sizeof(vobj)/sizeof(vector_type);
      int* tt = table_d + 4*idx;
      int from_oidx = *tt++;
      int from_lane = *tt++;
      int to_oidx = *tt++;
      int to_lane = *tt;
      const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
      vector_type* to = (vector_type *)&higherDim_v[to_oidx];
      scalar_type stmp;
      for(int w=0;w<words;w++){
 	stmp = getlane(from[w], from_lane);
 	putlane(to[w], stmp, to_lane);
      }
    });
  acceleratorFreeDevice(table_d);    
  free(table);
 #else
  // the above should guarantee that the operations are local
  autoView(lowDimv,lowDim,CpuRead);
  autoView(higherDimv,higherDim,CpuWrite);
@@ -866,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
      pokeLocalSite(s,higherDimv,hcoor);
    }
  });
 #endif
 }
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -95,7 +95,7 @@ public:
    }
  };
  template<class vobj>
-  inline Lattice<vobj> Extract(const Lattice<vobj> &in)
+  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
  {
    Lattice<vobj> out(unpadded_grid);
@@ -106,7 +106,7 @@ public:
    return out;
  }
  template<class vobj>
-  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>())
+  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
@@ -118,7 +118,7 @@ public:
  }
  // expand up one dim at a time
  template<class vobj>
-  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>())
+  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
@@ -130,20 +130,40 @@ public:
    else       conformable(old_grid,grids[dim-1]);
    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
    double tins=0, tshift=0;
    // Middle bit
    double t = usecond();
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
    tins += usecond() - t;
    // High bit
    t = usecond();
    shifted = cshift.Cshift(in,dim,depth);
    tshift += usecond() - t;
    t=usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
    tins += usecond() - t;
    // Low bit
    t = usecond();
    shifted = cshift.Cshift(in,dim,-depth);
    tshift += usecond() - t;
    t = usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
    tins += usecond() - t;
    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
    return padded;
  }
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -359,6 +359,7 @@ public:
    assert(Cell.depth >= 1);
    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
    double t0 = usecond();
    //Generate shift arrays
    std::vector<Coordinate> shifts;
    for(int mu=0;mu<Nd;mu++){
@@ -382,54 +383,73 @@ public:
 	}
      }
    }
    int shift_mu_off = shifts.size()/Nd;
    double t1 = usecond();
    //Generate local stencil
    GeneralLocalStencil gStencil(ggrid,shifts);
    double t2 = usecond();
    //Open views to padded gauge links and keep open over mu loop
    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
    size_t vsize = Nd*sizeof(GaugeViewType);
    GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
    GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
    acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
    GaugeMat gStaple(ggrid);
-    int off = 0;
+    int outer_off = 0;
    for(int mu=0;mu<Nd;mu++){
-      gStaple = Zero();
+      { //view scope
-
+	autoView( gStaple_v , gStaple, AcceleratorWrite);
-      for(int nu=0;nu<Nd;nu++){
+	auto gStencil_v = gStencil.View();
-	if(nu != mu){	  
+	
-	  {
+	accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
-	    autoView( rgStaple_v , gStaple, AcceleratorRead);
+	    decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
-	    autoView( gStaple_v , gStaple, AcceleratorWrite);
+	    stencil_ss = Zero();
-	    auto gStencil_v = gStencil.View();
+	    int off = outer_off;
-	    autoView( Ug_mu_v , U_padded[mu], AcceleratorRead);
+	    
-	    autoView( Ug_nu_v , U_padded[nu], AcceleratorRead);
+	    for(int nu=0;nu<Nd;nu++){
-  
+	      if(nu != mu){	  
-	    accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
+		GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
-		auto stencil_ss = coalescedRead(rgStaple_v[ss]);
+		auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-
+		e = gStencil_v.GetEntry(off++,ss);
-		GeneralStencilEntry const* e = gStencil_v.GetEntry(off,ss);
+		auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
+		e = gStencil_v.GetEntry(off++,ss);
-		e = gStencil_v.GetEntry(off+1,ss);
+		auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
 		auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
 		e = gStencil_v.GetEntry(off+2,ss);
 		auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
-		stencil_ss = stencil_ss + U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;
+		stencil_ss = stencil_ss + U2 * U1 * U0;
-		e = gStencil_v.GetEntry(off+3,ss);
+		e = gStencil_v.GetEntry(off++,ss);
-		auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
+		U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
-		e = gStencil_v.GetEntry(off+4,ss);
+		e = gStencil_v.GetEntry(off++,ss);
-		auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
+		U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
-		e = gStencil_v.GetEntry(off+5,ss);
+		e = gStencil_v.GetEntry(off++,ss);
-		auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
+		U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
-		stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;
+		stencil_ss = stencil_ss + U2 * U1 * U0;
 		coalescedWrite(gStaple_v[ss],stencil_ss);
 	      }
-	      );
+	    }
-	  } //ensure views are all closed!
+		
-	  off += 6;
+	    coalescedWrite(gStaple_v[ss],stencil_ss);
-	}//nu!=mu
+	  }
-      }//nu loop
+	  );
      } //ensure views are all closed!
      staple[mu] = Cell.Extract(gStaple);
      outer_off += shift_mu_off;
    }//mu loop
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
    free(Ug_dirs_v_host);
    acceleratorFreeDevice(Ug_dirs_v);
    double t3=usecond();
    std::cout << GridLogPerformance << "StaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl;   
  }
@@ -805,7 +825,7 @@ public:
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
-    U2 = U * CshiftLink(U, mu, 1);
+    U2 = U * Gimpl::CshiftLink(U, mu, 1);
  }
  ////////////////////////////////////////////////////////////////////////////
@@ -826,9 +846,9 @@ public:
        // Up staple    ___ ___
        //             |       |
-        tmp = CshiftLink(adj(U[nu]), nu, -1);
+        tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
        tmp = adj(U2[mu]) * tmp;
-        tmp = CshiftLink(tmp, mu, -2);
+        tmp = Gimpl::CshiftLink(tmp, mu, -2);
        Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);
@@ -836,14 +856,14 @@ public:
        //             |___ ___|
        //
        tmp = adj(U2[mu]) * U[nu];
-        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, CshiftLink(tmp, mu, -2));
+        Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));
        //              ___ ___
        //             |    ___|
        //             |___ ___|
        //
-        Stap += CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
+        Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
        //              ___ ___
        //             |___    |
@@ -852,7 +872,7 @@ public:
        //  tmp= Staple2x1* Cshift(U[mu],mu,-2);
        //  Stap+= Cshift(tmp,mu,1) ;
-        Stap += CshiftLink(Staple2x1, mu, 1) * CshiftLink(U[mu], mu, -1);
+        Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
        ;
        //       --
@@ -860,10 +880,10 @@ public:
        //
        //      |  |
-        tmp = CshiftLink(adj(U2[nu]), nu, -2);
+        tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
        tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
-        tmp = U2[nu] * CshiftLink(tmp, nu, 2);
+        tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
-        Stap += CshiftLink(tmp, mu, 1);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);
        //      |  |
        //
@@ -872,8 +892,8 @@ public:
        tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
        tmp = adj(U2[nu]) * tmp;
-        tmp = CshiftLink(tmp, nu, -2);
+        tmp = Gimpl::CshiftLink(tmp, nu, -2);
-        Stap += CshiftLink(tmp, mu, 1);
+        Stap += Gimpl::CshiftLink(tmp, mu, 1);
      }
    }
  }
@@ -1006,6 +1026,7 @@ public:
    assert(Cell.depth >= 2);
    GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
    double t0 = usecond();
    std::vector<Coordinate> shifts;
    for (int mu = 0; mu < Nd; mu++){
      for (int nu = 0; nu < Nd; nu++) {
@@ -1060,8 +1081,11 @@ public:
    }
    size_t nshift = shifts.size();
    int mu_off_delta = nshift / Nd;
    double t1 = usecond();
    GeneralLocalStencil gStencil(ggrid,shifts);
-
+    double t2 = usecond();
    //Open views to padded gauge links and keep open over mu loop
    typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
    size_t vsize = Nd*sizeof(GaugeViewType);
@@ -1183,6 +1207,10 @@ public:
    for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
    free(Ug_dirs_v_host);
    acceleratorFreeDevice(Ug_dirs_v);
    double t3 = usecond();
    std::cout << GridLogPerformance << "RectStaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl;   
  }
@@ -1198,13 +1226,18 @@ public:
    StapleAll(Stap, U);
    RectStapleAll(RectStap, U);
 #else
    double t0 = usecond();
    //Use the padded cell with maximal reuse
    PaddedCell Ghost(2, dynamic_cast<GridCartesian*>(U[0].Grid()));
    CshiftImplGauge<Gimpl> cshift_impl;
    std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
    for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
    double t1 = usecond();
    StaplePaddedAll(Stap, U_pad, Ghost);
    double t2 = usecond();
    RectStaplePaddedAll(RectStap, U_pad, Ghost);
    double t3 = usecond();
    std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
 #endif
  }
--- a/tests/debug/Test_iwasaki_action_newstaple.cc
+++ b/tests/debug/Test_iwasaki_action_newstaple.cc
@@ -165,18 +165,24 @@ int main (int argc, char ** argv)
  IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
  IwasakiGaugeAction<Gimpl> action_new(beta);
-  double t0 = usecond();
+  double torig=0, tnew=0;
-  action_orig.deriv(U, derivOrig);
+  int ntest = 10;
-  double t1 = usecond();
+  for(int i=0;i<ntest;i++){
-  action_new.deriv(U, derivNew);
+    double t0 = usecond();
-  double t2 = usecond();
+    action_orig.deriv(U, derivOrig);
    double t1 = usecond();
    action_new.deriv(U, derivNew);
    double t2 = usecond();
-  GaugeLorentz diff = derivOrig - derivNew;
+    GaugeLorentz diff = derivOrig - derivNew;
-  double n = norm2(diff);
+    double n = norm2(diff);
-  std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
+    std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
-  assert(n<1e-10);
+    assert(n<1e-10);
-  std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
+    std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms,  new: " << (t2-t1)/1000 << "ms" << std::endl;
    torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
  }
  std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms,   new:" << tnew/ntest << "ms" << std::endl;
  Grid_finalize();
 }