Improved performance of G-parity kernel for GPUs by simplifying multLink implementation

2026-05-17 07:34:31 +01:00 · 2020-12-07 11:53:35 -05:00
parent 888eacd3b8
commit 2ef1fa66a8
1 changed files with 15 additions and 27 deletions
@@ -97,42 +97,30 @@ public:
    Coordinate icoor;
 #ifdef GRID_SIMT
    _Spinor tmp;
    const int Nsimd =SiteDoubledGaugeField::Nsimd();
    int s = acceleratorSIMTlane(Nsimd);
    St.iCoorFromIindex(icoor,s);
    int mmu = mu % Nd;
    if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
      int permute_lane = (sl==1) 
    	|| ((distance== 1)&&(icoor[direction]==1))
 	|| ((distance==-1)&&(icoor[direction]==0));
-      if ( permute_lane ) { 
+    auto UU0=coalescedRead(U(0)(mu));
-	tmp(0) = chi(1);
+    auto UU1=coalescedRead(U(1)(mu));
-	tmp(1) = chi(0);
+    
-      } else {
+    //Decide whether we do a G-parity flavor twist
-	tmp(0) = chi(0);
+    //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
-	tmp(1) = chi(1);
+    //It also assumes (but does not check) that abs(distance) == 1
-      }
+    int permute_lane = (sl==1) 
    || ((distance== 1)&&(icoor[direction]==1))
    || ((distance==-1)&&(icoor[direction]==0));
-      auto UU0=coalescedRead(U(0)(mu));
+    permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
      auto UU1=coalescedRead(U(1)(mu));
-      mult(&phi(0),&UU0,&tmp(0));
+    //Apply the links
-      mult(&phi(1),&UU1,&tmp(1));
+    int f_upper = permute_lane ? 1 : 0;
    int f_lower = !f_upper;
-    } else {
+    mult(&phi(0),&UU0,&chi(f_upper));
-
+    mult(&phi(1),&UU1,&chi(f_lower));
      auto UU0=coalescedRead(U(0)(mu));
      auto UU1=coalescedRead(U(1)(mu));
      mult(&phi(0),&UU0,&chi(0));
      mult(&phi(1),&UU1,&chi(1));
    }
 #else
    typedef _Spinor vobj;