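Offload more loops to the accelerator (thread_for -> accelerator_for, Cpu views -> Accelerator views); stub out ContractConservedCurrent and SeqConservedCurrent with assert(0)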

2026-07-04 01:13:29 +01:00 · 2020-06-10 12:57:55 -04:00
parent e97f3688db
commit 8720aecb80
3 changed files with 29 additions and 108 deletions
@@ -258,15 +258,16 @@ private:
  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
 public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
-    autoView(T_v,T,CpuWrite);
+    autoView(T_v,T,AcceleratorWrite);
-    autoView(F_v,F,CpuRead);
+    autoView(F_v,F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@@ -282,9 +283,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    autoView(T_v, T,CpuWrite);
+    autoView(T_v, T,AcceleratorWrite);
-    autoView(F_v, F,CpuRead);
+    autoView(F_v, F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
@@ -300,9 +301,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    autoView(T_v,T,CpuWrite);
+    autoView(T_v,T,AcceleratorWrite);
-    autoView(F_v,F,CpuRead);
+    autoView(F_v,F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
@@ -318,9 +319,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    autoView( T_v , T, CpuWrite);
+    autoView( T_v , T, AcceleratorWrite);
-    autoView( F_v , F, CpuRead);
+    autoView( F_v , F, AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
@@ -336,9 +337,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
-    autoView( T_v ,T,CpuWrite);
+    autoView( T_v ,T,AcceleratorWrite);
-    autoView( F_v ,F,CpuRead);
+    autoView( F_v ,F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
@@ -355,9 +356,9 @@ private:
    T = Zero();
-    autoView( T_v , T,CpuWrite);
+    autoView( T_v , T,AcceleratorWrite);
-    autoView( F_v , F,CpuRead);
+    autoView( F_v , F,AcceleratorRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
@@ -106,10 +106,10 @@ public:
 			    const _SpinorField & phi,
 			    int mu)
  {
-    autoView( out_v, out, CpuWrite);
+    autoView( out_v, out, AcceleratorWrite);
-    autoView( phi_v, phi, CpuRead);
+    autoView( phi_v, phi, AcceleratorRead);
-    autoView( Umu_v, Umu, CpuRead);
+    autoView( Umu_v, Umu, AcceleratorRead);
-    thread_for(sss,out.Grid()->oSites(),{
+    accelerator_for(sss,out.Grid()->oSites(),1,{
 	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
    });
  }
@@ -192,10 +192,10 @@ public:
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
+      autoView( tmp_v , tmp, AcceleratorWrite);
-      autoView( Btilde_v , Btilde, CpuRead);
+      autoView( Btilde_v , Btilde, AcceleratorRead);
-      autoView( Atilde_v , Atilde, CpuRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
-      thread_for(sss,tmp.Grid()->oSites(),{
+      accelerator_for(sss,tmp.Grid()->oSites(),1,{
 	  int sU=sss;
 	  for(int s=0;s<Ls;s++){
 	    int sF = s+Ls*sU;
@@ -467,32 +467,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-#if 0
+  assert(0);
  PropagatorField tmp1(_grid), tmp2(_grid);
  q_out = Zero();
  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
  // Inefficient comms method but not performance critical.
  tmp1 = Cshift(q_in_1, mu, 1);
  tmp2 = Cshift(q_in_2, mu, 1);
  autoView( tmp1_v  ,  tmp1, CpuWrite);
  autoView( tmp2_v  ,  tmp2, CpuWrite);
  autoView( q_in_1_v,q_in_1, CpuRead);
  autoView( q_in_2_v,q_in_2, CpuRead);
  autoView( q_out_v , q_out, CpuRead);
  autoView( Umu_v   ,   Umu, CpuRead);
  thread_for(sU, Umu.Grid()->oSites(),{
      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
 					       q_in_2_v[sU],
 					       q_out_v[sU],
 					       Umu_v, sU, mu);
      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
 					       tmp2_v[sU],
 					       q_out_v[sU],
 					       Umu_v, sU, mu);
  });
 #else
 #endif
 }
@@ -508,62 +483,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 {
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-#if 0
+  assert(0);
  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
  Complex i(0.0,1.0);
  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
  unsigned int tshift = (mu == Tp) ? 1 : 0;
  unsigned int LLt    = GridDefaultLatt()[Tp];
  q_out = Zero();
  LatticeInteger coords(_grid);
  LatticeCoordinate(coords, Tp);
  // Need q(x + mu) and q(x - mu).
  tmp    = Cshift(q_in, mu, 1);
  tmpFwd = tmp*lattice_cmplx;
  tmp    = lattice_cmplx*q_in;
  tmpBwd = Cshift(tmp, mu, -1);
  autoView( coords_v , coords, CpuRead);
  autoView( tmpFwd_v , tmpFwd, CpuRead);
  autoView( tmpBwd_v , tmpBwd, CpuRead);
  autoView( Umu_v    , Umu, CpuRead);
  autoView( q_out_v  , q_out, CpuWrite);
  thread_for(sU, Umu.Grid()->oSites(), {
    // Compute the sequential conserved current insertion only if our simd
    // object contains a timeslice we need.
    vPredicate t_mask;
    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
    Integer timeSlices = Reduce(t_mask());
    if (timeSlices > 0) {
      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
 					  q_out_v[sU], 
 					  Umu_v, sU, mu, t_mask);
    }
    // Repeat for backward direction.
    t_mask()     = ((coords_v[sU] >= (tmin + tshift)) && 
 		    (coords_v[sU] <= (tmax + tshift)));
    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
    unsigned int t0 = 0;
    if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
    timeSlices = Reduce(t_mask());
    if (timeSlices > 0) {
      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
 					  q_out_v[sU], 
 					  Umu_v, sU, mu, t_mask);
    }
  });
 #else
 #endif
 }
 NAMESPACE_END(Grid);