diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 9fe5f9f8..2f58a027 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -148,7 +148,7 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp, acc, res;; + spinor tmp, acc, res; // X = Nc*Ns // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 4078267d..ddf852de 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -242,36 +242,38 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ - - uint64_t ss = sss*Ls; - + uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp; + spinor tmp, acc, res, tmp2_spProj; - // Apply (L^{\prime})^{-1} - coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] - for(int s=1; s=0; s--){ - spProj5m(tmp, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); + res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc; + + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + coalescedWrite(chi[ss+Ls-1],res); + spProj5p(acc,res); + spProj5m(tmp,res); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; + spProj5m(tmp,res); + coalescedWrite(chi[ss+s],res); } }); @@ -300,45 +302,45 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp, acc, res, tmp_spProj; - uint64_t ss = sss*Ls; + // Apply (L^{\prime})^{-1} L_m^{-1} + res = psi(ss); + spProj5m(tmp,res); + acc = pleem[0]*tmp; + spProj5p(tmp,res); + coalescedWrite(chi[ss],res); + tmp_spProj = pMooeeInv_shift_lc[0]*res; - typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp1,tmp2,tmp2_spProj; + for(int s=1;s=0; s--){ - coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1); - spProj5m(tmp1, chi(ss+s)); - coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj); - } + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + spProj5p(acc,res); + spProj5m(tmp,res); + coalescedWrite(chi[ss+Ls-1], res + pMooeeInv_shift_norm[Ls-1]*tmp_spProj); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; + spProj5m(tmp,res); + coalescedWrite(chi[ss+s], res + pMooeeInv_shift_norm[s]*tmp_spProj); + } }); this->MooeeInvTime += usecond(); @@ -366,36 +368,38 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ - - uint64_t ss = sss*Ls; - +uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp; + spinor tmp, acc, res; - // Apply (U^{\prime})^{-dag} - coalescedWrite(chi[ss], psi(ss)); - for(int s=1; s=0; s--){ - spProj5p(tmp, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp); + res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc; + + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + coalescedWrite(chi[ss+Ls-1],res); + spProj5m(acc,res); + spProj5p(tmp,res); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc; + spProj5p(tmp,res); + coalescedWrite(chi[ss+s],res); } }); @@ -425,45 +429,45 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp, acc, res, tmp_spProj; - uint64_t ss = sss*Ls; + // Apply (L^{\prime})^{-1} L_m^{-1} + res = psi(ss); + spProj5p(tmp,res); + acc = pueem[0]*tmp; + spProj5m(tmp,res); + coalescedWrite(chi[ss],res); + tmp_spProj = pMooeeInvDag_shift_lc[0]*res; - typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp1,tmp2,tmp2_spProj; + for(int s=1;s=0; s--){ - coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1); - spProj5p(tmp1, chi(ss+s)); - coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj); - } + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + spProj5m(acc,res); + spProj5p(tmp,res); + coalescedWrite(chi[ss+Ls-1], res + pMooeeInvDag_shift_norm[Ls-1]*tmp_spProj); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc; + spProj5p(tmp,res); + coalescedWrite(chi[ss+s], res + pMooeeInvDag_shift_norm[s]*tmp_spProj); + } }); this->MooeeInvTime += usecond();