diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index 00f83c21..367c5ff1 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -205,48 +205,64 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - int Ls=this->Ls; - + auto psi = psi_i.View(); auto chi = chi_i.View(); - - assert(psi.Checkerboard() == psi.Checkerboard()); - + Coeff_t *lee_v = &lee[0]; + Coeff_t *leem_v = &leem[0]; + Coeff_t *uee_v = &uee[0]; + Coeff_t *ueem_v = &ueem[0]; + Coeff_t *dee_v = &dee[0]; + + int Ls=this->Ls; + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites4d = nsimd * grid->oSites() / Ls; + + typedef typename SiteSpinor::scalar_object ScalarSiteSpinor; + MooeeInvCalls++; MooeeInvTime-=usecond(); - - thread_loop((int ss=0;ssoSites();ss+=Ls),{ // adds Ls - - auto tmp = psi[0]; - - // Apply (U^{\prime})^{-dagger} - chi[ss]=psi[ss]; - for (int s=1;s=0;s--){ - spProj5p(tmp,chi[ss+s+1]); - chi[ss+s] = chi[ss+s] - conjugate(lee[s])*tmp; + res = extractLane(lane,chi[ss+s]); + res = conjugate(1.0/dee_v[s])*res - conjugate(lee_v[s])*tmp - conjugate(leem_v[s])*acc; + spProj5p(tmp,res); + insertLane(lane,chi[ss+s],res); } }); - + MooeeInvTime+=usecond(); - + } #ifdef CAYLEY_DPERP_GPU