diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index ca88afd8..8af3e7c0 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -127,6 +127,12 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi int Ls=this->Ls; + auto plee = & lee [0]; + auto pdee = & dee [0]; + auto puee = & uee [0]; + auto pleem = & leem[0]; + auto pueem = & ueem[0]; + MooeeInvCalls++; MooeeInvTime-=usecond(); uint64_t nloop = grid->oSites()/Ls; @@ -140,27 +146,27 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] for(int s=1;s=0;s--){ spProj5m(tmp,chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - uee[s]*tmp); + coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); } }); @@ -179,6 +185,12 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi auto psi = psi_i.View(); auto chi = chi_i.View(); + auto plee = & lee [0]; + auto pdee = & dee [0]; + auto puee = & uee [0]; + auto pleem = & leem[0]; + auto pueem = & ueem[0]; + assert(psi.Checkerboard() == psi.Checkerboard()); MooeeInvCalls++; @@ -195,25 +207,25 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi coalescedWrite(chi[ss],psi(ss)); for (int s=1;s=0;s--){ spProj5p(tmp,chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(lee[s])*tmp); + coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp); } }); MooeeInvTime+=usecond(); diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 468bddbd..a3eca650 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -113,6 +113,13 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie auto chi=chi_i.View(); int Ls = this->Ls; + auto plee = & this->lee[0]; + auto pdee = & this->dee[0]; + auto puee = & this->uee[0]; + + auto pleem = & this->leem[0]; + auto pueem = & this->ueem[0]; + this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); uint64_t nloop=grid->oSites()/Ls; @@ -126,27 +133,27 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] for(int s=1; slee[s-1]*tmp1); + coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1); } // L_m^{-1} for(int s=0; sleem[s]*tmp1); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1); } // U_m^{-1} D^{-1} for(int s=0; sdee[s])*chi(ss+s) - (this->ueem[s]/this->dee[Ls])*tmp1); + coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls])*tmp1); } spProj5m(tmp2, chi(ss+Ls-1)); - coalescedWrite(chi[ss+Ls-1],(1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2); + coalescedWrite(chi[ss+Ls-1],(1.0/pdee[Ls])*tmp1 + (1.0/pdee[Ls-1])*tmp2); // Apply U^{-1} for(int s=Ls-2; s>=0; s--){ spProj5m(tmp1, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - this->uee[s]*tmp1); + coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp1); } }); this->MooeeInvTime += usecond(); diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 5825c06a..650435fc 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -83,7 +83,8 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion auto phi = phi_i.View(); auto chi = chi_i.View(); - int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator + auto pm = this->pm; + int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator assert(phi.Checkerboard() == psi.Checkerboard()); @@ -104,8 +105,8 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion spProj5m(tmp1, psi(idx_u)); spProj5p(tmp2, psi(idx_l)); - if(this->pm == 1){ spProj5p(tmp, psi(ss+shift_s)); } - else { spProj5m(tmp, psi(ss+shift_s)); } + if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); } + else { spProj5m(tmp, psi(ss+shift_s)); } coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp); } @@ -169,6 +170,8 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm this->M5Dcalls++; this->M5Dtime -= usecond(); + auto pm = this->pm; + int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss = sss*Ls; @@ -188,8 +191,8 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); else coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); - if(this->pm == 1){ spProj5p(tmp, psi(ss+s)); } - else { spProj5m(tmp, psi(ss+s)); } + if(pm == 1){ spProj5p(tmp, psi(ss+s)); } + else { spProj5m(tmp, psi(ss+s)); } coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp); } @@ -207,6 +210,12 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & auto psi = psi_i.View(); auto chi = chi_i.View(); + auto plee = & this->lee [0]; + auto pdee = & this->dee [0]; + auto puee = & this->uee [0]; + auto pleem= & this->leem[0]; + auto pueem= & this->ueem[0]; + if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } this->MooeeInvCalls++; @@ -224,26 +233,26 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] for(int s=1; slee[s-1]*tmp); + coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp); } // L_m^{-1} for(int s=0; sleem[s]*tmp); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp); } // U_m^{-1} D^{-1} for(int s=0; sdee[s])*chi(ss+s) - (this->ueem[s]/this->dee[Ls-1])*tmp); + coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp); } - coalescedWrite(chi[ss+Ls-1], (1.0/this->dee[Ls-1])*chi(ss+Ls-1)); + coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); // Apply U^{-1} for(int s=Ls-2; s>=0; s--){ spProj5m(tmp, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - this->uee[s]*tmp); + coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); } }); @@ -259,6 +268,14 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF auto psi = psi_i.View(); auto chi = chi_i.View(); + auto pm = this->pm; + auto plee = & this->lee [0]; + auto pdee = & this->dee [0]; + auto puee = & this->uee [0]; + auto pleem= & this->leem[0]; + auto pueem= & this->ueem[0]; + auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0]; + auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0]; this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); @@ -272,36 +289,36 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2 coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] - tmp2 = MooeeInv_shift_lc[0]*psi(ss); + tmp2 = pMooeeInv_shift_lc[0]*psi(ss); for(int s=1; slee[s-1]*tmp1); - tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi(ss+s); + coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1); + tmp2 = tmp2 + pMooeeInv_shift_lc[s]*psi(ss+s); } - if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);} - else { spProj5m(tmp2_spProj, tmp2); } + if(pm == 1){ spProj5p(tmp2_spProj, tmp2);} + else { spProj5m(tmp2_spProj, tmp2); } // L_m^{-1} for(int s=0; sleem[s]*tmp1); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1); } // U_m^{-1} D^{-1} for(int s=0; sdee[s])*chi(ss+s) - (this->ueem[s]/this->dee[Ls-1])*tmp1); + coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp1); } - // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj; - coalescedWrite(chi[ss+Ls-1], (1.0/this->dee[Ls-1])*chi(ss+Ls-1)); + // chi[ss+Ls-1] = (1.0/pdee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj; + coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); spProj5m(tmp1, chi(ss+Ls-1)); - coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + MooeeInv_shift_norm[Ls-1]*tmp2_spProj); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInv_shift_norm[Ls-1]*tmp2_spProj); // Apply U^{-1} and add shift term for(int s=Ls-2; s>=0; s--){ - coalescedWrite(chi[ss+s] , chi(ss+s) - this->uee[s]*tmp1); + coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1); spProj5m(tmp1, chi(ss+s)); - coalescedWrite(chi[ss+s], chi(ss+s) + MooeeInv_shift_norm[s]*tmp2_spProj); + coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj); } }); @@ -319,6 +336,12 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel auto psi = psi_i.View(); auto chi = chi_i.View(); + auto plee = & this->lee [0]; + auto pdee = & this->dee [0]; + auto puee = & this->uee [0]; + auto pleem= & this->leem[0]; + auto pueem= & this->ueem[0]; + this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); @@ -334,26 +357,26 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel coalescedWrite(chi[ss], psi(ss)); for(int s=1; suee[s-1]*tmp); + coalescedWrite(chi[ss+s], psi(ss+s) - puee[s-1]*tmp); } // U_m^{-\dag} for(int s=0; sueem[s]*tmp); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp); } // L_m^{-\dag} D^{-dag} for(int s=0; sdee[s])*chi(ss+s) - (this->leem[s]/this->dee[Ls-1])*tmp); + coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp); } - coalescedWrite(chi[ss+Ls-1], (1.0/this->dee[Ls-1])*chi(ss+Ls-1)); + coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); // Apply L^{-dag} for(int s=Ls-2; s>=0; s--){ spProj5p(tmp, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - this->lee[s]*tmp); + coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp); } }); @@ -369,6 +392,14 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi auto chi = chi_i.View(); int Ls = this->Ls; + auto pm = this->pm; + auto plee = & this->lee [0]; + auto pdee = & this->dee [0]; + auto puee = & this->uee [0]; + auto pleem= & this->leem[0]; + auto pueem= & this->ueem[0]; + auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; + auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); @@ -383,36 +414,36 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2 coalescedWrite(chi[ss], psi(ss)); - tmp2 = MooeeInvDag_shift_lc[0]*psi(ss); + tmp2 = pMooeeInvDag_shift_lc[0]*psi(ss); for(int s=1; suee[s-1]*tmp1); - tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi(ss+s); + coalescedWrite(chi[ss+s],psi(ss+s) - puee[s-1]*tmp1); + tmp2 = tmp2 + pMooeeInvDag_shift_lc[s]*psi(ss+s); } - if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);} - else { spProj5m(tmp2_spProj, tmp2);} + if(pm == 1){ spProj5p(tmp2_spProj, tmp2);} + else { spProj5m(tmp2_spProj, tmp2);} // U_m^{-\dag} for(int s=0; sueem[s]*tmp1); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp1); } // L_m^{-\dag} D^{-dag} for(int s=0; sdee[s])*chi(ss+s) - (this->leem[s]/this->dee[Ls-1])*tmp1); + coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp1); } - coalescedWrite(chi[ss+Ls-1], (1.0/this->dee[Ls-1])*chi(ss+Ls-1)); + coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); spProj5p(tmp1, chi(ss+Ls-1)); - coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInvDag_shift_norm[Ls-1]*tmp2_spProj); // Apply L^{-dag} for(int s=Ls-2; s>=0; s--){ - coalescedWrite(chi[ss+s], chi(ss+s) - this->lee[s]*tmp1); + coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1); spProj5p(tmp1, chi(ss+s)); - coalescedWrite(chi[ss+s], chi(ss+s) + MooeeInvDag_shift_norm[s]*tmp2_spProj); + coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj); } }); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 605a3aec..5eea31f5 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -382,20 +382,20 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} #ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} #ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} - if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} #ifndef GRID_NVCC + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif }