From 3ef519aaa4291b83708209cfe6f271ed2de20be5 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Wed, 7 Aug 2019 16:34:01 +0100 Subject: [PATCH 1/6] fast MooeeInv --- .../implementation/CayleyFermion5Dcache.h | 104 ++++++++++-------- 1 file changed, 56 insertions(+), 48 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 8af3e7c0..9dc9ba02 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -10,6 +10,7 @@ Author: Peter Boyle Author: Peter Boyle Author: Peter Boyle Author: paboyle +Author: Gianluca Filaci This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -139,39 +140,41 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp; + spinor tmp, acc, res;; - // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops - // Apply (L^{\prime})^{-1} - coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] - for(int s=1;s=0;s--){ - spProj5m(tmp,chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); + res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; + spProj5m(tmp,res); + coalescedWrite(chi[ss+s],res); } }); MooeeInvTime+=usecond(); - + } template @@ -201,31 +204,36 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp; + spinor tmp, acc, res; - // Apply (U^{\prime})^{-dagger} - coalescedWrite(chi[ss],psi(ss)); - for (int s=1;s=0;s--){ - spProj5p(tmp,chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp); + res = (1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc; + spProj5p(tmp,res); + coalescedWrite(chi[ss+s],res); } }); MooeeInvTime+=usecond(); From 0efaf3c4fa76b125596e9ea1d0cfeeb2bff1acf2 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Mon, 2 Sep 2019 11:33:00 +0100 Subject: [PATCH 2/6] access M5D coeffs through pointers --- .../implementation/CayleyFermion5Dcache.h | 12 +++++-- .../DomainWallEOFAFermionCache.h | 11 +++++-- .../implementation/MobiusEOFAFermionCache.h | 31 +++++++++++++++---- 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 9dc9ba02..9fe5f9f8 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -55,6 +55,10 @@ CayleyFermion5D::M5D(const FermionField &psi_i, auto chi = chi_i.View(); assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + int Ls =this->Ls; // 10 = 3 complex mult + 2 complex add @@ -72,7 +76,7 @@ CayleyFermion5D::M5D(const FermionField &psi_i, uint64_t idx_l = ss+((s+Ls-1)%Ls); spProj5m(tmp1,psi(idx_u)); spProj5p(tmp2,psi(idx_l)); - coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2); + coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); } }); M5Dtime+=usecond(); @@ -94,6 +98,10 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, auto chi = chi_i.View(); assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + int Ls=this->Ls; // Flops = 6.0*(Nc*Ns) *Ls*vol @@ -110,7 +118,7 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, uint64_t idx_l = ss+((s+Ls-1)%Ls); spProj5p(tmp1,psi(idx_u)); spProj5m(tmp2,psi(idx_l)); - coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2); + coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); } }); M5Dtime+=usecond(); diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index a3eca650..2ffb89b8 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -11,6 +11,7 @@ Author: Peter Boyle Author: Peter Boyle Author: paboyle Author: David Murphy +Author: Gianluca Filaci This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,6 +50,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi auto psi = psi_i.View(); auto chi = chi_i.View(); assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol this->M5Dcalls++; this->M5Dtime -= usecond(); @@ -63,7 +67,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi uint64_t idx_l = ss+((s+Ls-1)%Ls); spProj5m(tmp1, psi(idx_u)); spProj5p(tmp2, psi(idx_l)); - coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); + coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); } }); @@ -82,6 +86,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio auto phi = phi_i.View(); auto chi = chi_i.View(); assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol this->M5Dcalls++; @@ -97,7 +104,7 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio uint64_t idx_l = ss+((s+Ls-1)%Ls); spProj5p(tmp1, psi(idx_u)); spProj5m(tmp2, psi(idx_l)); - coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); + coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); } }); diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 650435fc..4078267d 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -11,6 +11,7 @@ Author: Peter Boyle Author: Peter Boyle Author: paboyle Author: David Murphy +Author: Gianluca Filaci This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,6 +50,10 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol this->M5Dcalls++; this->M5Dtime -= usecond(); @@ -64,7 +69,7 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField uint64_t idx_l = ss+((s+Ls-1)%Ls); spProj5m(tmp1, psi(idx_u)); spProj5p(tmp2, psi(idx_l)); - coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); + coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); } }); @@ -88,6 +93,11 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + auto pshift_coeffs = &shift_coeffs[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol this->M5Dcalls++; this->M5Dtime -= usecond(); @@ -108,7 +118,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); } else { spProj5m(tmp, psi(ss+shift_s)); } - coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp); + coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 +plower[s]*tmp2 + pshift_coeffs[s]*tmp); } }); @@ -128,6 +138,10 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol this->M5Dcalls++; this->M5Dtime -= usecond(); @@ -144,7 +158,7 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie uint64_t idx_l = ss+((s+Ls-1)%Ls); spProj5p(tmp1, psi(idx_u)); spProj5m(tmp2, psi(idx_l)); - coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); + coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); } }); @@ -166,6 +180,11 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm assert(phi.Checkerboard() == psi.Checkerboard()); + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + auto pshift_coeffs = &shift_coeffs[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol this->M5Dcalls++; this->M5Dtime -= usecond(); @@ -189,12 +208,12 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm spProj5p(tmp1, psi(idx_u)); spProj5m(tmp2, psi(idx_l)); - if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); - else coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); + if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); + else coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); if(pm == 1){ spProj5p(tmp, psi(ss+s)); } else { spProj5m(tmp, psi(ss+s)); } - coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp); + coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+pshift_coeffs[s]*tmp); } }); From e66669d30015e502278d65aaa14f2c1d19f4a05d Mon Sep 17 00:00:00 2001 From: gfilaci Date: Mon, 2 Sep 2019 14:26:13 +0100 Subject: [PATCH 3/6] fast MooeeInv for EOFA --- .../implementation/CayleyFermion5Dcache.h | 2 +- .../implementation/MobiusEOFAFermionCache.h | 246 +++++++++--------- 2 files changed, 126 insertions(+), 122 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 9fe5f9f8..2f58a027 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -148,7 +148,7 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp, acc, res;; + spinor tmp, acc, res; // X = Nc*Ns // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 4078267d..ddf852de 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -242,36 +242,38 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ - - uint64_t ss = sss*Ls; - + uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp; + spinor tmp, acc, res, tmp2_spProj; - // Apply (L^{\prime})^{-1} - coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] - for(int s=1; s=0; s--){ - spProj5m(tmp, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); + res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc; + + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + coalescedWrite(chi[ss+Ls-1],res); + spProj5p(acc,res); + spProj5m(tmp,res); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; + spProj5m(tmp,res); + coalescedWrite(chi[ss+s],res); } }); @@ -300,45 +302,45 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp, acc, res, tmp_spProj; - uint64_t ss = sss*Ls; + // Apply (L^{\prime})^{-1} L_m^{-1} + res = psi(ss); + spProj5m(tmp,res); + acc = pleem[0]*tmp; + spProj5p(tmp,res); + coalescedWrite(chi[ss],res); + tmp_spProj = pMooeeInv_shift_lc[0]*res; - typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp1,tmp2,tmp2_spProj; + for(int s=1;s=0; s--){ - coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1); - spProj5m(tmp1, chi(ss+s)); - coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj); - } + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + spProj5p(acc,res); + spProj5m(tmp,res); + coalescedWrite(chi[ss+Ls-1], res + pMooeeInv_shift_norm[Ls-1]*tmp_spProj); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; + spProj5m(tmp,res); + coalescedWrite(chi[ss+s], res + pMooeeInv_shift_norm[s]*tmp_spProj); + } }); this->MooeeInvTime += usecond(); @@ -366,36 +368,38 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ - - uint64_t ss = sss*Ls; - +uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp; + spinor tmp, acc, res; - // Apply (U^{\prime})^{-dag} - coalescedWrite(chi[ss], psi(ss)); - for(int s=1; s=0; s--){ - spProj5p(tmp, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp); + res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc; + + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + coalescedWrite(chi[ss+Ls-1],res); + spProj5m(acc,res); + spProj5p(tmp,res); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc; + spProj5p(tmp,res); + coalescedWrite(chi[ss+s],res); } }); @@ -425,45 +429,45 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp, acc, res, tmp_spProj; - uint64_t ss = sss*Ls; + // Apply (L^{\prime})^{-1} L_m^{-1} + res = psi(ss); + spProj5p(tmp,res); + acc = pueem[0]*tmp; + spProj5m(tmp,res); + coalescedWrite(chi[ss],res); + tmp_spProj = pMooeeInvDag_shift_lc[0]*res; - typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp1,tmp2,tmp2_spProj; + for(int s=1;s=0; s--){ - coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1); - spProj5p(tmp1, chi(ss+s)); - coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj); - } + // Apply U_m^{-1} D^{-1} U^{-1} + res = (1.0/pdee[Ls-1])*res; + spProj5m(acc,res); + spProj5p(tmp,res); + coalescedWrite(chi[ss+Ls-1], res + pMooeeInvDag_shift_norm[Ls-1]*tmp_spProj); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc; + spProj5p(tmp,res); + coalescedWrite(chi[ss+s], res + pMooeeInvDag_shift_norm[s]*tmp_spProj); + } }); this->MooeeInvTime += usecond(); From fdd9b14e82dc25045cfb8db7b86f98154a26e8e1 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Mon, 2 Sep 2019 14:49:51 +0100 Subject: [PATCH 4/6] speed up MooeeInvDag for DWF EOFA --- .../DomainWallEOFAFermionCache.h | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 2ffb89b8..8bdab03f 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -175,23 +175,15 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion auto chi = chi_i.View(); int Ls = this->Ls; + auto plee = & this->lee[0]; + auto pdee = & this->dee[0]; + auto puee = & this->uee[0]; + + auto pleem = & this->leem[0]; + auto pueem = & this->ueem[0]; + assert(psi.Checkerboard() == psi.Checkerboard()); - Vector ueec(Ls); - Vector deec(Ls+1); - Vector leec(Ls); - Vector ueemc(Ls); - Vector leemc(Ls); - - for(int s=0; suee[s]); - deec[s] = conjugate(this->dee[s]); - leec[s] = conjugate(this->lee[s]); - ueemc[s] = conjugate(this->ueem[s]); - leemc[s] = conjugate(this->leem[s]); - } - deec[Ls] = conjugate(this->dee[Ls]); - this->MooeeInvCalls++; this->MooeeInvTime -= usecond(); auto nloop = grid->oSites()/Ls; @@ -204,27 +196,27 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion coalescedWrite(chi[ss], psi(ss)); for(int s=1; s=0; s--){ spProj5p(tmp1, chi(ss+s+1)); - coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1); + coalescedWrite(chi[ss+s],chi(ss+s) - conjugate(plee[s])*tmp1); } }); From 0c1efa523582a2b9f6121b2da036016ada623894 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Wed, 7 Aug 2019 12:11:18 +0100 Subject: [PATCH 5/6] pass OpenMP flag to host compiler --- configure.ac | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configure.ac b/configure.ac index ed7b9a17..b8f94fce 100644 --- a/configure.ac +++ b/configure.ac @@ -287,6 +287,9 @@ case ${CXX} in CXX="nvcc -x cu " CXXLD="nvcc -link" CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" + if test $ac_openmp = yes; then + CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" + fi ;; *) CXXLD=${CXX} From a7fa86dc29e154ac3339dc8b15be831c80ecc0b9 Mon Sep 17 00:00:00 2001 From: gfilaci Date: Thu, 5 Sep 2019 12:05:21 +0100 Subject: [PATCH 6/6] MooeeInv improvement for DW EOFA + comments --- .../implementation/CayleyFermion5Dcache.h | 4 +- .../DomainWallEOFAFermionCache.h | 109 +++++++++--------- .../implementation/MobiusEOFAFermionCache.h | 13 +-- 3 files changed, 64 insertions(+), 62 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 2f58a027..35402994 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -216,7 +216,7 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi // X = Nc*Ns // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops - // Apply (L^{\prime})^{-1} L_m^{-1} + // Apply (U^{\prime})^{-dagger} U_m^{-\dagger} res = psi(ss); spProj5p(tmp,res); acc = conjugate(pueem[0])*tmp; @@ -233,7 +233,7 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi } res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc; - // Apply U_m^{-1} D^{-1} U^{-1} + // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} res = (1.0/pdee[Ls-1])*res; coalescedWrite(chi[ss+Ls-1],res); spProj5m(acc,res); diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 8bdab03f..46d3fa1f 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -131,36 +131,37 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie this->MooeeInvTime -= usecond(); uint64_t nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ - auto ss=sss*Ls; + uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp1,tmp2; + spinor tmp, acc, res; - // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops - // Apply (L^{\prime})^{-1} - coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] - for(int s=1; s=0; s--){ - spProj5m(tmp1, chi(ss+s+1)); - coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp1); + res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc; + + // Apply U_m^{-1} D^{-1} U^{-1} + acc = (1.0/pdee[Ls ])*res; + tmp = (1.0/pdee[Ls-1])*res; + spProj5p(acc,acc); + spProj5m(tmp,tmp); + coalescedWrite(chi[ss+Ls-1], acc + tmp); + for (int s=Ls-2;s>=0;s--){ + res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; + spProj5m(tmp,res); + coalescedWrite(chi[ss+s],res); } }); this->MooeeInvTime += usecond(); @@ -188,35 +189,37 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion this->MooeeInvTime -= usecond(); auto nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ + uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp1,tmp2; - auto ss=sss*Ls; + spinor tmp, acc, res; - // Apply (U^{\prime})^{-dagger} - coalescedWrite(chi[ss], psi(ss)); - for(int s=1; s=0; s--){ - spProj5p(tmp1, chi(ss+s+1)); - coalescedWrite(chi[ss+s],chi(ss+s) - conjugate(plee[s])*tmp1); + res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc; + + // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} + acc = conjugate(1.0/pdee[Ls-1])*res; + tmp = conjugate(1.0/pdee[Ls ])*res; + spProj5m(acc,acc); + spProj5p(tmp,tmp); + coalescedWrite(chi[ss+Ls-1], acc + tmp); + for (int s=Ls-2;s>=0;s--){ + res = conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc; + spProj5p(tmp,res); + coalescedWrite(chi[ss+s],res); } }); diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index ddf852de..f74c7a51 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -244,7 +244,7 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; - spinor tmp, acc, res, tmp2_spProj; + spinor tmp, acc, res; // X = Nc*Ns // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops @@ -368,13 +368,13 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ -uint64_t ss=sss*Ls; + uint64_t ss=sss*Ls; typedef decltype(coalescedRead(psi[0])) spinor; spinor tmp, acc, res; // X = Nc*Ns // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops - // Apply (L^{\prime})^{-1} L_m^{-1} + // Apply (U^{\prime})^{-dagger} U_m^{-\dagger} res = psi(ss); spProj5p(tmp,res); acc = pueem[0]*tmp; @@ -391,7 +391,7 @@ uint64_t ss=sss*Ls; } res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc; - // Apply U_m^{-1} D^{-1} U^{-1} + // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} res = (1.0/pdee[Ls-1])*res; coalescedWrite(chi[ss+Ls-1],res); spProj5m(acc,res); @@ -402,7 +402,6 @@ uint64_t ss=sss*Ls; coalescedWrite(chi[ss+s],res); } }); - this->MooeeInvTime += usecond(); } @@ -433,7 +432,7 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi typedef decltype(coalescedRead(psi[0])) spinor; spinor tmp, acc, res, tmp_spProj; - // Apply (L^{\prime})^{-1} L_m^{-1} + // Apply (U^{\prime})^{-dagger} U_m^{-\dagger} res = psi(ss); spProj5p(tmp,res); acc = pueem[0]*tmp; @@ -458,7 +457,7 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi res = res - puee[Ls-2]*tmp - acc; - // Apply U_m^{-1} D^{-1} U^{-1} + // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} res = (1.0/pdee[Ls-1])*res; spProj5m(acc,res); spProj5p(tmp,res);