Offload more loops

2025-08-09 07:57:05 +01:00 · 2020-06-10 12:57:55 -04:00
parent e97f3688db
commit 8720aecb80
3 changed files with 29 additions and 108 deletions
--- a/Grid/qcd/action/fermion/WilsonCloverFermion.h
+++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h
@@ -258,15 +258,16 @@ private:
  CloverFieldType CloverTermDagEven, CloverTermDagOdd;       // Clover term Dag EO
  CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO

+ public:
  // eventually these can be compressed into 6x6 blocks instead of the 12x12
  // using the DeGrand-Rossi basis for the gamma matrices
  CloverFieldType fillCloverYZ(const GaugeLinkField &F)
  {
    CloverFieldType T(F.Grid());
    T = Zero();
-    autoView(T_v,T,CpuWrite);
-    autoView(F_v,F,CpuRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    autoView(T_v,T,AcceleratorWrite);
+    autoView(F_v,F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@@ -282,9 +283,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
    
-    autoView(T_v, T,CpuWrite);
-    autoView(F_v, F,CpuRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    autoView(T_v, T,AcceleratorWrite);
+    autoView(F_v, F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -F_v[i]()();
      T_v[i]()(1, 0) = F_v[i]()();
@@ -300,9 +301,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();

-    autoView(T_v,T,CpuWrite);
-    autoView(F_v,F,CpuRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    autoView(T_v,T,AcceleratorWrite);
+    autoView(F_v,F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
      T_v[i]()(1, 1) = timesI(F_v[i]()());
@@ -318,9 +319,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();

-    autoView( T_v , T, CpuWrite);
-    autoView( F_v , F, CpuRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    autoView( T_v , T, AcceleratorWrite);
+    autoView( F_v , F, AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = timesI(F_v[i]()());
      T_v[i]()(1, 0) = timesI(F_v[i]()());
@@ -336,9 +337,9 @@ private:
    CloverFieldType T(F.Grid());
    T = Zero();
    
-    autoView( T_v ,T,CpuWrite);
-    autoView( F_v ,F,CpuRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    autoView( T_v ,T,AcceleratorWrite);
+    autoView( F_v ,F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 1) = -(F_v[i]()());
      T_v[i]()(1, 0) = (F_v[i]()());
@@ -355,9 +356,9 @@ private:

    T = Zero();

-    autoView( T_v , T,CpuWrite);
-    autoView( F_v , F,CpuRead);
-    thread_for(i, CloverTerm.Grid()->oSites(),
+    autoView( T_v , T,AcceleratorWrite);
+    autoView( F_v , F,AcceleratorRead);
+    accelerator_for(i, CloverTerm.Grid()->oSites(),1,
    {
      T_v[i]()(0, 0) = timesI(F_v[i]()());
      T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -106,10 +106,10 @@ public:
 			    const _SpinorField & phi,
 			    int mu)
  {
-    autoView( out_v, out, CpuWrite);
-    autoView( phi_v, phi, CpuRead);
-    autoView( Umu_v, Umu, CpuRead);
-    thread_for(sss,out.Grid()->oSites(),{
+    autoView( out_v, out, AcceleratorWrite);
+    autoView( phi_v, phi, AcceleratorRead);
+    autoView( Umu_v, Umu, AcceleratorRead);
+    accelerator_for(sss,out.Grid()->oSites(),1,{
 	multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
    });
  }
@@ -192,10 +192,10 @@ public:
    GaugeLinkField tmp(mat.Grid());
    tmp = Zero();
    {
-      autoView( tmp_v , tmp, CpuWrite);
-      autoView( Btilde_v , Btilde, CpuRead);
-      autoView( Atilde_v , Atilde, CpuRead);
-      thread_for(sss,tmp.Grid()->oSites(),{
+      autoView( tmp_v , tmp, AcceleratorWrite);
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,tmp.Grid()->oSites(),1,{
 	  int sU=sss;
 	  for(int s=0;s<Ls;s++){
 	    int sF = s+Ls*sU;
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -467,32 +467,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
  conformable(_grid, q_in_1.Grid());
  conformable(_grid, q_in_2.Grid());
  conformable(_grid, q_out.Grid());
-#if 0
-  PropagatorField tmp1(_grid), tmp2(_grid);
-  q_out = Zero();
-
-  // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
-  // Inefficient comms method but not performance critical.
-  tmp1 = Cshift(q_in_1, mu, 1);
-  tmp2 = Cshift(q_in_2, mu, 1);
-  autoView( tmp1_v  ,  tmp1, CpuWrite);
-  autoView( tmp2_v  ,  tmp2, CpuWrite);
-  autoView( q_in_1_v,q_in_1, CpuRead);
-  autoView( q_in_2_v,q_in_2, CpuRead);
-  autoView( q_out_v , q_out, CpuRead);
-  autoView( Umu_v   ,   Umu, CpuRead);
-  thread_for(sU, Umu.Grid()->oSites(),{
-      Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
-					       q_in_2_v[sU],
-					       q_out_v[sU],
-					       Umu_v, sU, mu);
-      Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
-					       tmp2_v[sU],
-					       q_out_v[sU],
-					       Umu_v, sU, mu);
-  });
-#else
-#endif
+  assert(0);
 }


@@ -508,62 +483,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 {
  conformable(_grid, q_in.Grid());
  conformable(_grid, q_out.Grid());
-#if 0
-
-  //  Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
-  Complex i(0.0,1.0);
-  PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
-  unsigned int tshift = (mu == Tp) ? 1 : 0;
-  unsigned int LLt    = GridDefaultLatt()[Tp];
-
-  q_out = Zero();
-  LatticeInteger coords(_grid);
-  LatticeCoordinate(coords, Tp);
-
-  // Need q(x + mu) and q(x - mu).
-  tmp    = Cshift(q_in, mu, 1);
-  tmpFwd = tmp*lattice_cmplx;
-  tmp    = lattice_cmplx*q_in;
-  tmpBwd = Cshift(tmp, mu, -1);
-
-  autoView( coords_v , coords, CpuRead);
-  autoView( tmpFwd_v , tmpFwd, CpuRead);
-  autoView( tmpBwd_v , tmpBwd, CpuRead);
-  autoView( Umu_v    , Umu, CpuRead);
-  autoView( q_out_v  , q_out, CpuWrite);
-
-  thread_for(sU, Umu.Grid()->oSites(), {
-
-    // Compute the sequential conserved current insertion only if our simd
-    // object contains a timeslice we need.
-    vPredicate t_mask;
-    t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
-    Integer timeSlices = Reduce(t_mask());
-
-    if (timeSlices > 0) {
-      Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], 
-					  q_out_v[sU], 
-					  Umu_v, sU, mu, t_mask);
-    }
-
-    // Repeat for backward direction.
-    t_mask()     = ((coords_v[sU] >= (tmin + tshift)) && 
-		    (coords_v[sU] <= (tmax + tshift)));
-    
-    //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)	
-    unsigned int t0 = 0;
-    if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
-    
-    timeSlices = Reduce(t_mask());
-
-    if (timeSlices > 0) {
-      Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], 
-					  q_out_v[sU], 
-					  Umu_v, sU, mu, t_mask);
-    }
-  });
-#else
-#endif
+  assert(0);
 }

 NAMESPACE_END(Grid);