1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-27 22:25:56 +01:00

fast MooeeInv

This commit is contained in:
gfilaci 2019-08-07 16:34:01 +01:00
parent b473405652
commit 3ef519aaa4

View File

@ -10,6 +10,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Gianluca Filaci <g.filaci@ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -139,34 +140,36 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp; spinor tmp, acc, res;;
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops // X = Nc*Ns
// Apply (L^{\prime})^{-1} // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] // Apply (L^{\prime})^{-1} L_m^{-1}
for(int s=1;s<Ls;s++){ res = psi(ss);
spProj5p(tmp,chi(ss+s-1)); spProj5m(tmp,res);
coalescedWrite(chi[ss+s] , psi(ss+s)-plee[s-1]*tmp); acc = pleem[0]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss],res);
for(int s=1;s<Ls-1;s++){
res = psi(ss+s);
res -= plee[s-1]*tmp;
spProj5m(tmp,res);
acc += pleem[s]*tmp;
spProj5p(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
// L_m^{-1} // Apply U_m^{-1} D^{-1} U^{-1}
for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -pleem[s]P_- chi res = (1.0/pdee[Ls-1])*res;
spProj5m(tmp,chi(ss+s)); coalescedWrite(chi[ss+Ls-1],res);
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp); spProj5p(acc,res);
} spProj5m(tmp,res);
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// Chi[s] + 1/d chi[s]
spProj5p(tmp,chi(ss+Ls-1));
coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s)-(pueem[s]/pdee[Ls-1])*tmp);
}
coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
// Apply U^{-1}
for (int s=Ls-2;s>=0;s--){ for (int s=Ls-2;s>=0;s--){
spProj5m(tmp,chi(ss+s+1)); res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); spProj5m(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
}); });
@ -201,31 +204,36 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp; spinor tmp, acc, res;
// Apply (U^{\prime})^{-dagger} // X = Nc*Ns
coalescedWrite(chi[ss],psi(ss)); // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
for (int s=1;s<Ls;s++){ // Apply (U^{\prime})^{-\dagger} U_m^{-\dagger}
spProj5m(tmp,chi(ss+s-1)); res = psi(ss);
coalescedWrite(chi[ss+s], psi(ss+s)-conjugate(puee[s-1])*tmp); spProj5p(tmp,res);
} acc = conjugate(pueem[0])*tmp;
// U_m^{-\dagger} spProj5m(tmp,res);
for (int s=0;s<Ls-1;s++){ coalescedWrite(chi[ss],res);
spProj5p(tmp,chi(ss+s));
coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp);
}
// L_m^{-\dagger} D^{-dagger} for(int s=1;s<Ls-1;s++){
for (int s=0;s<Ls-1;s++){ res = psi(ss+s);
spProj5m(tmp,chi(ss+Ls-1)); res -= conjugate(puee[s-1])*tmp;
coalescedWrite(chi[ss+s], conjugate(1.0/pdee[s])*chi(ss+s)-conjugate(pleem[s]/pdee[Ls-1])*tmp); spProj5p(tmp,res);
acc += conjugate(pueem[s])*tmp;
spProj5m(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*chi(ss+Ls-1)); res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
// Apply L^{-dagger} // Apply L_m^{-\dagger} D^{-\dagger} L^{-\dagger}
res = (1.0/pdee[Ls-1])*res;
coalescedWrite(chi[ss+Ls-1],res);
spProj5m(acc,res);
spProj5p(tmp,res);
for (int s=Ls-2;s>=0;s--){ for (int s=Ls-2;s>=0;s--){
spProj5p(tmp,chi(ss+s+1)); res = (1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc;
coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp); spProj5p(tmp,res);
coalescedWrite(chi[ss+s],res);
} }
}); });
MooeeInvTime+=usecond(); MooeeInvTime+=usecond();