1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Merge pull request #235 from grid-test-organisation/feature/5d-improvement

MooeeInv and M5D optimisations + enable threading with nvcc
This commit is contained in:
Peter Boyle 2019-12-10 21:45:03 -05:00 committed by GitHub
commit 848079e8ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 291 additions and 248 deletions

View File

@ -10,6 +10,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Gianluca Filaci <g.filaci@ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -54,6 +55,10 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
auto chi = chi_i.View(); auto chi = chi_i.View();
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
int Ls =this->Ls; int Ls =this->Ls;
// 10 = 3 complex mult + 2 complex add // 10 = 3 complex mult + 2 complex add
@ -71,7 +76,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp1,psi(idx_u)); spProj5m(tmp1,psi(idx_u));
spProj5p(tmp2,psi(idx_l)); spProj5p(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2); coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
} }
}); });
M5Dtime+=usecond(); M5Dtime+=usecond();
@ -93,6 +98,10 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
auto chi = chi_i.View(); auto chi = chi_i.View();
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
int Ls=this->Ls; int Ls=this->Ls;
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
@ -109,7 +118,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp1,psi(idx_u)); spProj5p(tmp1,psi(idx_u));
spProj5m(tmp2,psi(idx_l)); spProj5m(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2); coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
} }
}); });
M5Dtime+=usecond(); M5Dtime+=usecond();
@ -139,39 +148,41 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp; spinor tmp, acc, res;
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops // X = Nc*Ns
// Apply (L^{\prime})^{-1} // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] // Apply (L^{\prime})^{-1} L_m^{-1}
for(int s=1;s<Ls;s++){ res = psi(ss);
spProj5p(tmp,chi(ss+s-1)); spProj5m(tmp,res);
coalescedWrite(chi[ss+s] , psi(ss+s)-plee[s-1]*tmp); acc = pleem[0]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss],res);
for(int s=1;s<Ls-1;s++){
res = psi(ss+s);
res -= plee[s-1]*tmp;
spProj5m(tmp,res);
acc += pleem[s]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
// L_m^{-1}
for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -pleem[s]P_- chi // Apply U_m^{-1} D^{-1} U^{-1}
spProj5m(tmp,chi(ss+s)); res = (1.0/pdee[Ls-1])*res;
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp); coalescedWrite(chi[ss+Ls-1],res);
} spProj5p(acc,res);
spProj5m(tmp,res);
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// Chi[s] + 1/d chi[s]
spProj5p(tmp,chi(ss+Ls-1));
coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s)-(pueem[s]/pdee[Ls-1])*tmp);
}
coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
// Apply U^{-1}
for (int s=Ls-2;s>=0;s--){ for (int s=Ls-2;s>=0;s--){
spProj5m(tmp,chi(ss+s+1)); res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); spProj5m(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
}); });
MooeeInvTime+=usecond(); MooeeInvTime+=usecond();
} }
template<class Impl> template<class Impl>
@ -201,31 +212,36 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp; spinor tmp, acc, res;
// Apply (U^{\prime})^{-dagger} // X = Nc*Ns
coalescedWrite(chi[ss],psi(ss)); // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
for (int s=1;s<Ls;s++){ // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
spProj5m(tmp,chi(ss+s-1)); res = psi(ss);
coalescedWrite(chi[ss+s], psi(ss+s)-conjugate(puee[s-1])*tmp); spProj5p(tmp,res);
acc = conjugate(pueem[0])*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss],res);
for(int s=1;s<Ls-1;s++){
res = psi(ss+s);
res -= conjugate(puee[s-1])*tmp;
spProj5p(tmp,res);
acc += conjugate(pueem[s])*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
// U_m^{-\dagger} res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
for (int s=0;s<Ls-1;s++){
spProj5p(tmp,chi(ss+s)); // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp); res = (1.0/pdee[Ls-1])*res;
} coalescedWrite(chi[ss+Ls-1],res);
spProj5m(acc,res);
// L_m^{-\dagger} D^{-dagger} spProj5p(tmp,res);
for (int s=0;s<Ls-1;s++){
spProj5m(tmp,chi(ss+Ls-1));
coalescedWrite(chi[ss+s], conjugate(1.0/pdee[s])*chi(ss+s)-conjugate(pleem[s]/pdee[Ls-1])*tmp);
}
coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*chi(ss+Ls-1));
// Apply L^{-dagger}
for (int s=Ls-2;s>=0;s--){ for (int s=Ls-2;s>=0;s--){
spProj5p(tmp,chi(ss+s+1)); res = (1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc;
coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp); spProj5p(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
}); });
MooeeInvTime+=usecond(); MooeeInvTime+=usecond();

View File

@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu> Author: David Murphy <dmurphy@phys.columbia.edu>
Author: Gianluca Filaci <g.filaci@ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -49,6 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
auto psi = psi_i.View(); auto psi = psi_i.View();
auto chi = chi_i.View(); auto chi = chi_i.View();
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++; this->M5Dcalls++;
this->M5Dtime -= usecond(); this->M5Dtime -= usecond();
@ -63,7 +67,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp1, psi(idx_u)); spProj5m(tmp1, psi(idx_u));
spProj5p(tmp2, psi(idx_l)); spProj5p(tmp2, psi(idx_l));
coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
} }
}); });
@ -82,6 +86,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
auto phi = phi_i.View(); auto phi = phi_i.View();
auto chi = chi_i.View(); auto chi = chi_i.View();
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++; this->M5Dcalls++;
@ -97,7 +104,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp1, psi(idx_u)); spProj5p(tmp1, psi(idx_u));
spProj5m(tmp2, psi(idx_l)); spProj5m(tmp2, psi(idx_l));
coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
} }
}); });
@ -124,36 +131,37 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
this->MooeeInvTime -= usecond(); this->MooeeInvTime -= usecond();
uint64_t nloop=grid->oSites()/Ls; uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
auto ss=sss*Ls; uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1,tmp2; spinor tmp, acc, res;
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops // Apply (L^{\prime})^{-1} L_m^{-1}
// Apply (L^{\prime})^{-1} res = psi(ss);
coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] spProj5m(tmp,res);
for(int s=1; s<Ls; s++){ acc = pleem[0]*tmp;
spProj5p(tmp1, chi(ss+s-1)); spProj5p(tmp,res);
coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1); coalescedWrite(chi[ss],res);
for(int s=1;s<Ls-1;s++){
res = psi(ss+s);
res -= plee[s-1]*tmp;
spProj5m(tmp,res);
acc += pleem[s]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi // Apply U_m^{-1} D^{-1} U^{-1}
spProj5m(tmp1, chi(ss+s)); acc = (1.0/pdee[Ls ])*res;
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1); tmp = (1.0/pdee[Ls-1])*res;
} spProj5p(acc,acc);
spProj5m(tmp,tmp);
// U_m^{-1} D^{-1} coalescedWrite(chi[ss+Ls-1], acc + tmp);
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s] for (int s=Ls-2;s>=0;s--){
spProj5p(tmp1, chi(ss+Ls-1)); res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls])*tmp1); spProj5m(tmp,res);
} coalescedWrite(chi[ss+s],res);
spProj5m(tmp2, chi(ss+Ls-1));
coalescedWrite(chi[ss+Ls-1],(1.0/pdee[Ls])*tmp1 + (1.0/pdee[Ls-1])*tmp2);
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
spProj5m(tmp1, chi(ss+s+1));
coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp1);
} }
}); });
this->MooeeInvTime += usecond(); this->MooeeInvTime += usecond();
@ -168,56 +176,50 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
auto chi = chi_i.View(); auto chi = chi_i.View();
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0];
auto pdee = & this->dee[0];
auto puee = & this->uee[0];
auto pleem = & this->leem[0];
auto pueem = & this->ueem[0];
assert(psi.Checkerboard() == psi.Checkerboard()); assert(psi.Checkerboard() == psi.Checkerboard());
Vector<Coeff_t> ueec(Ls);
Vector<Coeff_t> deec(Ls+1);
Vector<Coeff_t> leec(Ls);
Vector<Coeff_t> ueemc(Ls);
Vector<Coeff_t> leemc(Ls);
for(int s=0; s<ueec.size(); s++){
ueec[s] = conjugate(this->uee[s]);
deec[s] = conjugate(this->dee[s]);
leec[s] = conjugate(this->lee[s]);
ueemc[s] = conjugate(this->ueem[s]);
leemc[s] = conjugate(this->leem[s]);
}
deec[Ls] = conjugate(this->dee[Ls]);
this->MooeeInvCalls++; this->MooeeInvCalls++;
this->MooeeInvTime -= usecond(); this->MooeeInvTime -= usecond();
auto nloop = grid->oSites()/Ls; auto nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1,tmp2; spinor tmp, acc, res;
auto ss=sss*Ls;
// Apply (U^{\prime})^{-dagger} // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
coalescedWrite(chi[ss], psi(ss)); res = psi(ss);
for(int s=1; s<Ls; s++){ spProj5p(tmp,res);
spProj5m(tmp1, chi(ss+s-1)); acc = conjugate(pueem[0])*tmp;
coalescedWrite(chi[ss+s], psi(ss+s) - ueec[s-1]*tmp1); spProj5m(tmp,res);
coalescedWrite(chi[ss],res);
for(int s=1;s<Ls-1;s++){
res = psi(ss+s);
res -= conjugate(puee[s-1])*tmp;
spProj5p(tmp,res);
acc += conjugate(pueem[s])*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){ // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
spProj5p(tmp1, chi(ss+s)); acc = conjugate(1.0/pdee[Ls-1])*res;
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - ueemc[s]*tmp1); tmp = conjugate(1.0/pdee[Ls ])*res;
} spProj5m(acc,acc);
spProj5p(tmp,tmp);
// L_m^{-\dagger} D^{-dagger} coalescedWrite(chi[ss+Ls-1], acc + tmp);
for(int s=0; s<Ls-1; s++){ for (int s=Ls-2;s>=0;s--){
spProj5m(tmp1, chi(ss+Ls-1)); res = conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc;
coalescedWrite(chi[ss+s] ,(1.0/deec[s])*chi(ss+s) - (leemc[s]/deec[Ls-1])*tmp1); spProj5p(tmp,res);
} coalescedWrite(chi[ss+s],res);
spProj5p(tmp2, chi(ss+Ls-1));
coalescedWrite(chi[ss+Ls-1], (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2);
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
spProj5p(tmp1, chi(ss+s+1));
coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1);
} }
}); });

View File

@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu> Author: David Murphy <dmurphy@phys.columbia.edu>
Author: Gianluca Filaci <g.filaci@ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -49,6 +50,10 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++; this->M5Dcalls++;
this->M5Dtime -= usecond(); this->M5Dtime -= usecond();
@ -64,7 +69,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp1, psi(idx_u)); spProj5m(tmp1, psi(idx_u));
spProj5p(tmp2, psi(idx_l)); spProj5p(tmp2, psi(idx_l));
coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
} }
}); });
@ -88,6 +93,11 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
auto pshift_coeffs = &shift_coeffs[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++; this->M5Dcalls++;
this->M5Dtime -= usecond(); this->M5Dtime -= usecond();
@ -108,7 +118,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); } if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); }
else { spProj5m(tmp, psi(ss+shift_s)); } else { spProj5m(tmp, psi(ss+shift_s)); }
coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp); coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 +plower[s]*tmp2 + pshift_coeffs[s]*tmp);
} }
}); });
@ -128,6 +138,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++; this->M5Dcalls++;
this->M5Dtime -= usecond(); this->M5Dtime -= usecond();
@ -144,7 +158,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp1, psi(idx_u)); spProj5p(tmp1, psi(idx_u));
spProj5m(tmp2, psi(idx_l)); spProj5m(tmp2, psi(idx_l));
coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
} }
}); });
@ -166,6 +180,11 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
auto pshift_coeffs = &shift_coeffs[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++; this->M5Dcalls++;
this->M5Dtime -= usecond(); this->M5Dtime -= usecond();
@ -189,12 +208,12 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
spProj5p(tmp1, psi(idx_u)); spProj5p(tmp1, psi(idx_u));
spProj5m(tmp2, psi(idx_l)); spProj5m(tmp2, psi(idx_l));
if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
else coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); else coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
if(pm == 1){ spProj5p(tmp, psi(ss+s)); } if(pm == 1){ spProj5p(tmp, psi(ss+s)); }
else { spProj5m(tmp, psi(ss+s)); } else { spProj5m(tmp, psi(ss+s)); }
coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp); coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+pshift_coeffs[s]*tmp);
} }
}); });
@ -223,36 +242,38 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
uint64_t ss = sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp; spinor tmp, acc, res;
// Apply (L^{\prime})^{-1} // X = Nc*Ns
coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
for(int s=1; s<Ls; s++){ // Apply (L^{\prime})^{-1} L_m^{-1}
spProj5p(tmp, chi(ss+s-1)); res = psi(ss);
coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp); spProj5m(tmp,res);
acc = pleem[0]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss],res);
for(int s=1;s<Ls-1;s++){
res = psi(ss+s);
res -= plee[s-1]*tmp;
spProj5m(tmp,res);
acc += pleem[s]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi // Apply U_m^{-1} D^{-1} U^{-1}
spProj5m(tmp, chi(ss+s)); res = (1.0/pdee[Ls-1])*res;
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp); coalescedWrite(chi[ss+Ls-1],res);
} spProj5p(acc,res);
spProj5m(tmp,res);
// U_m^{-1} D^{-1} for (int s=Ls-2;s>=0;s--){
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s] res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
spProj5p(tmp, chi(ss+Ls-1)); spProj5m(tmp,res);
coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp); coalescedWrite(chi[ss+s],res);
}
coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
spProj5m(tmp, chi(ss+s+1));
coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp);
} }
}); });
@ -281,45 +302,45 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp, acc, res, tmp_spProj;
uint64_t ss = sss*Ls; // Apply (L^{\prime})^{-1} L_m^{-1}
res = psi(ss);
spProj5m(tmp,res);
acc = pleem[0]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss],res);
tmp_spProj = pMooeeInv_shift_lc[0]*res;
typedef decltype(coalescedRead(psi[0])) spinor; for(int s=1;s<Ls-1;s++){
spinor tmp1,tmp2,tmp2_spProj; res = psi(ss+s);
tmp_spProj += pMooeeInv_shift_lc[s]*res;
res -= plee[s-1]*tmp;
spProj5m(tmp,res);
acc += pleem[s]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss+s],res);
}
res = psi(ss+Ls-1);
// Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2 tmp_spProj += pMooeeInv_shift_lc[Ls-1]*res;
coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj);}
tmp2 = pMooeeInv_shift_lc[0]*psi(ss); else { spProj5m(tmp_spProj, tmp_spProj); }
for(int s=1; s<Ls; s++){
spProj5p(tmp1, chi(ss+s-1));
coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
tmp2 = tmp2 + pMooeeInv_shift_lc[s]*psi(ss+s);
}
if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
else { spProj5m(tmp2_spProj, tmp2); }
// L_m^{-1} res = res - plee[Ls-2]*tmp - acc;
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
spProj5m(tmp1, chi(ss+s));
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
}
// U_m^{-1} D^{-1} // Apply U_m^{-1} D^{-1} U^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s] res = (1.0/pdee[Ls-1])*res;
spProj5p(tmp1, chi(ss+Ls-1)); spProj5p(acc,res);
coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp1); spProj5m(tmp,res);
} coalescedWrite(chi[ss+Ls-1], res + pMooeeInv_shift_norm[Ls-1]*tmp_spProj);
// chi[ss+Ls-1] = (1.0/pdee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj; for (int s=Ls-2;s>=0;s--){
coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
spProj5m(tmp1, chi(ss+Ls-1)); spProj5m(tmp,res);
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInv_shift_norm[Ls-1]*tmp2_spProj); coalescedWrite(chi[ss+s], res + pMooeeInv_shift_norm[s]*tmp_spProj);
}
// Apply U^{-1} and add shift term
for(int s=Ls-2; s>=0; s--){
coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1);
spProj5m(tmp1, chi(ss+s));
coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj);
}
}); });
this->MooeeInvTime += usecond(); this->MooeeInvTime += usecond();
@ -347,39 +368,40 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
uint64_t ss = sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp; spinor tmp, acc, res;
// Apply (U^{\prime})^{-dag} // X = Nc*Ns
coalescedWrite(chi[ss], psi(ss)); // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
for(int s=1; s<Ls; s++){ // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
spProj5m(tmp, chi(ss+s-1)); res = psi(ss);
coalescedWrite(chi[ss+s], psi(ss+s) - puee[s-1]*tmp); spProj5p(tmp,res);
} acc = pueem[0]*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss],res);
// U_m^{-\dag} for(int s=1;s<Ls-1;s++){
for(int s=0; s<Ls-1; s++){ res = psi(ss+s);
spProj5p(tmp, chi(ss+s)); res -= puee[s-1]*tmp;
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp); spProj5p(tmp,res);
acc += pueem[s]*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc;
// L_m^{-\dag} D^{-dag}
for(int s=0; s<Ls-1; s++){ // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
spProj5m(tmp, chi(ss+Ls-1)); res = (1.0/pdee[Ls-1])*res;
coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp); coalescedWrite(chi[ss+Ls-1],res);
} spProj5m(acc,res);
coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); spProj5p(tmp,res);
for (int s=Ls-2;s>=0;s--){
// Apply L^{-dag} res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc;
for(int s=Ls-2; s>=0; s--){ spProj5p(tmp,res);
spProj5p(tmp, chi(ss+s+1)); coalescedWrite(chi[ss+s],res);
coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp);
} }
}); });
this->MooeeInvTime += usecond(); this->MooeeInvTime += usecond();
} }
@ -406,45 +428,45 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp, acc, res, tmp_spProj;
uint64_t ss = sss*Ls; // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
res = psi(ss);
spProj5p(tmp,res);
acc = pueem[0]*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss],res);
tmp_spProj = pMooeeInvDag_shift_lc[0]*res;
typedef decltype(coalescedRead(psi[0])) spinor; for(int s=1;s<Ls-1;s++){
spinor tmp1,tmp2,tmp2_spProj; res = psi(ss+s);
tmp_spProj += pMooeeInvDag_shift_lc[s]*res;
res -= puee[s-1]*tmp;
spProj5p(tmp,res);
acc += pueem[s]*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss+s],res);
}
res = psi(ss+Ls-1);
// Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2 tmp_spProj += pMooeeInvDag_shift_lc[Ls-1]*res;
coalescedWrite(chi[ss], psi(ss)); if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj); }
tmp2 = pMooeeInvDag_shift_lc[0]*psi(ss); else { spProj5m(tmp_spProj, tmp_spProj); }
for(int s=1; s<Ls; s++){
spProj5m(tmp1, chi(ss+s-1));
coalescedWrite(chi[ss+s],psi(ss+s) - puee[s-1]*tmp1);
tmp2 = tmp2 + pMooeeInvDag_shift_lc[s]*psi(ss+s);
}
if(pm == 1){ spProj5p(tmp2_spProj, tmp2);} res = res - puee[Ls-2]*tmp - acc;
else { spProj5m(tmp2_spProj, tmp2);}
// U_m^{-\dag} // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
for(int s=0; s<Ls-1; s++){ res = (1.0/pdee[Ls-1])*res;
spProj5p(tmp1, chi(ss+s)); spProj5m(acc,res);
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp1); spProj5p(tmp,res);
} coalescedWrite(chi[ss+Ls-1], res + pMooeeInvDag_shift_norm[Ls-1]*tmp_spProj);
for (int s=Ls-2;s>=0;s--){
// L_m^{-\dag} D^{-dag} res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc;
for(int s=0; s<Ls-1; s++){ spProj5p(tmp,res);
spProj5m(tmp1, chi(ss+Ls-1)); coalescedWrite(chi[ss+s], res + pMooeeInvDag_shift_norm[s]*tmp_spProj);
coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp1); }
}
coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
spProj5p(tmp1, chi(ss+Ls-1));
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInvDag_shift_norm[Ls-1]*tmp2_spProj);
// Apply L^{-dag}
for(int s=Ls-2; s>=0; s--){
coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1);
spProj5p(tmp1, chi(ss+s));
coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj);
}
}); });
this->MooeeInvTime += usecond(); this->MooeeInvTime += usecond();

View File

@ -281,6 +281,9 @@ case ${CXX} in
CXX="nvcc -x cu " CXX="nvcc -x cu "
CXXLD="nvcc -link" CXXLD="nvcc -link"
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
if test $ac_openmp = yes; then
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
fi
;; ;;
*) *)
CXXLD=${CXX} CXXLD=${CXX}