From 464cd6593157f11fc402e59a03736764ef3313f0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 15 Jun 2019 12:35:14 +0100 Subject: [PATCH] Still to test this fully --- .../DomainWallEOFAFermionCache.h | 120 ++++++++---------- 1 file changed, 52 insertions(+), 68 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 788321d2..468bddbd 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -53,25 +53,17 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi this->M5Dcalls++; this->M5Dtime -= usecond(); - thread_loop( (int ss=0; ssoSites(); ss+=Ls),{ // adds Ls + auto nloop=grid->oSites()/Ls; + accelerator_for(sss,nloop,Simd::Nsimd(),{ + auto ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; for(int s=0; s::M5Ddag(const FermionField& psi_i, const Fermio this->M5Dcalls++; this->M5Dtime -= usecond(); - thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls - auto tmp = psi[0]; + auto nloop=grid->oSites()/Ls; + accelerator_for(sss,nloop,Simd::Nsimd(),{ + typedef decltype(coalescedRead(psi[0])) spinor; + auto ss=sss*Ls; for(int s=0; s::MooeeInv(const FermionField& psi_i, FermionFie this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls - - auto tmp1 = psi[0]; - auto tmp2 = psi[0]; + uint64_t nloop=grid->oSites()/Ls; + accelerator_for(sss,nloop,Simd::Nsimd(),{ + auto ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops // Apply (L^{\prime})^{-1} - chi[ss] = psi[ss]; // chi[0]=psi[0] + coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] for(int s=1; slee[s-1]*tmp1; + spProj5p(tmp1, chi(ss+s-1)); + coalescedWrite(chi[ss+s], psi(ss+s) - this->lee[s-1]*tmp1); } // L_m^{-1} for(int s=0; sleem[s]*tmp1; + spProj5m(tmp1, chi(ss+s)); + coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - this->leem[s]*tmp1); } // U_m^{-1} D^{-1} for(int s=0; sdee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1; + spProj5p(tmp1, chi(ss+Ls-1)); + coalescedWrite(chi[ss+s], (1.0/this->dee[s])*chi(ss+s) - (this->ueem[s]/this->dee[Ls])*tmp1); } - spProj5m(tmp2, chi[ss+Ls-1]); - chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2; + spProj5m(tmp2, chi(ss+Ls-1)); + coalescedWrite(chi[ss+Ls-1],(1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2); // Apply U^{-1} for(int s=Ls-2; s>=0; s--){ - spProj5m(tmp1, chi[ss+s+1]); - chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1; + spProj5m(tmp1, chi(ss+s+1)); + coalescedWrite(chi[ss+s], chi(ss+s) - this->uee[s]*tmp1); } }); - this->MooeeInvTime += usecond(); } @@ -196,37 +180,37 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); - - thread_loop((int ss=0; ssoSites(); ss+=Ls),{ // adds Ls - - auto tmp1 = psi[0]; - auto tmp2 = psi[0]; + auto nloop = grid->oSites()/Ls; + accelerator_for(sss,nloop,Simd::Nsimd(),{ + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + auto ss=sss*Ls; // Apply (U^{\prime})^{-dagger} - chi[ss] = psi[ss]; + coalescedWrite(chi[ss], psi(ss)); for(int s=1; s=0; s--){ - spProj5p(tmp1, chi[ss+s+1]); - chi[ss+s] = chi[ss+s] - leec[s]*tmp1; + spProj5p(tmp1, chi(ss+s+1)); + coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1); } });