1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-10 22:20:45 +01:00

Improved performance of G-parity kernel for GPUs by simplifying multLink implementation

This commit is contained in:
Christopher Kelly 2020-12-07 11:53:35 -05:00
parent 888eacd3b8
commit 2ef1fa66a8

View File

@@ -97,42 +97,30 @@ public:
Coordinate icoor; Coordinate icoor;
#ifdef GRID_SIMT #ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd(); const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = acceleratorSIMTlane(Nsimd); int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s); St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd; int mmu = mu % Nd;
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
int permute_lane = (sl==1)
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
if ( permute_lane ) { auto UU0=coalescedRead(U(0)(mu));
tmp(0) = chi(1); auto UU1=coalescedRead(U(1)(mu));
tmp(1) = chi(0);
} else { //Decide whether we do a G-parity flavor twist
tmp(0) = chi(0); //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
tmp(1) = chi(1); //It also assumes (but does not check) that abs(distance) == 1
} int permute_lane = (sl==1)
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
auto UU0=coalescedRead(U(0)(mu)); permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
auto UU1=coalescedRead(U(1)(mu));
mult(&phi(0),&UU0,&tmp(0)); //Apply the links
mult(&phi(1),&UU1,&tmp(1)); int f_upper = permute_lane ? 1 : 0;
int f_lower = !f_upper;
} else { mult(&phi(0),&UU0,&chi(f_upper));
mult(&phi(1),&UU1,&chi(f_lower));
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
mult(&phi(0),&UU0,&chi(0));
mult(&phi(1),&UU1,&chi(1));
}
#else #else
typedef _Spinor vobj; typedef _Spinor vobj;