diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index 6b532c7a..d184b70e 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -99,35 +99,37 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i, auto psi = psi_i.View(); auto phi = phi_i.View(); auto chi = chi_i.View(); + Coeff_t *lower_v = &lower[0]; + Coeff_t *diag_v = &diag[0]; + Coeff_t *upper_v = &upper[0]; int Ls =this->Ls; assert(phi.Checkerboard() == psi.Checkerboard()); - // Flops = 6.0*(Nc*Ns) *Ls*vol + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites4d = nsimd * grid->oSites() / Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) M5Dcalls++; M5Dtime-=usecond(); - thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls - auto tmp = psi[0]; + accelerator_loopN( sss, sites4d ,{ + uint64_t lane = sss % nsimd; + uint64_t ss = Ls * (sss / nsimd); + for(int s=0;s<Ls;s++){ - if ( s==0 ) { - spProj5p(tmp,psi[ss+s+1]); - chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp; - - spProj5m(tmp,psi[ss+Ls-1]); - chi[ss+s]=chi[ss+s]+lower[s]*tmp; - } else if ( s==(Ls-1)) { - spProj5p(tmp,psi[ss+0]); - chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp; - - spProj5m(tmp,psi[ss+s-1]); - chi[ss+s]=chi[ss+s]+lower[s]*tmp; - } else { - spProj5p(tmp,psi[ss+s+1]); - chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp; - - spProj5m(tmp,psi[ss+s-1]); - chi[ss+s]=chi[ss+s]+lower[s]*tmp; - } + auto res = extractLane(lane,phi[ss+s]); + res = diag_v[s]*res; + + auto tmp = extractLane(lane,psi[ss+(s+1)%Ls]); + spProj5p(tmp,tmp); + res += upper_v[s]*tmp; + + tmp = extractLane(lane,psi[ss+(s+Ls-1)%Ls]); + spProj5m(tmp,tmp); + res += lower_v[s]*tmp; + + insertLane(lane,chi[ss+s],res); } }); M5Dtime+=usecond();