diff --git a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc index f99804a5..83f119ca 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc +++ b/Grid/qcd/action/fermion/CayleyFermion5Dgpu.cc @@ -50,35 +50,39 @@ void CayleyFermion5D::M5D(const FermionField &psi_i, auto psi = psi_i.View(); auto phi = phi_i.View(); auto chi = chi_i.View(); + Coeff_t *lower_v = &lower[0]; + Coeff_t *diag_v = &diag[0]; + Coeff_t *upper_v = &upper[0]; int Ls =this->Ls; assert(phi.Checkerboard() == psi.Checkerboard()); + + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites4d = nsimd * grid->oSites() / Ls; + // 10 = 3 complex mult + 2 complex add // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) M5Dcalls++; M5Dtime-=usecond(); - - thread_loop( (int ss=0;ssoSites();ss+=Ls),{ // adds Ls + + typedef typename SiteSpinor::scalar_object ScalarSiteSpinor; + + accelerator_loopN( sss, sites4d ,{ + uint64_t lane = sss % nsimd; + uint64_t ss = Ls * (sss / nsimd); + for(int s=0;s