mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-26 17:49:33 +00:00 
			
		
		
		
	Merge pull request #235 from grid-test-organisation/feature/5d-improvement
MooeeInv and M5D optimisations + enable threading with nvcc
This commit is contained in:
		| @@ -10,6 +10,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Gianluca Filaci <g.filaci@ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
| @@ -54,6 +55,10 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i, | ||||
|   auto chi = chi_i.View(); | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|  | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|  | ||||
|   int Ls =this->Ls; | ||||
|  | ||||
|   // 10 = 3 complex mult + 2 complex add | ||||
| @@ -71,7 +76,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i, | ||||
|       uint64_t idx_l = ss+((s+Ls-1)%Ls); | ||||
|       spProj5m(tmp1,psi(idx_u)); | ||||
|       spProj5p(tmp2,psi(idx_l)); | ||||
|       coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2); | ||||
|       coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); | ||||
|     } | ||||
|   }); | ||||
|   M5Dtime+=usecond(); | ||||
| @@ -93,6 +98,10 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i, | ||||
|   auto chi = chi_i.View(); | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|  | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|  | ||||
|   int Ls=this->Ls; | ||||
|  | ||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||
| @@ -109,7 +118,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i, | ||||
|       uint64_t idx_l = ss+((s+Ls-1)%Ls); | ||||
|       spProj5p(tmp1,psi(idx_u)); | ||||
|       spProj5m(tmp2,psi(idx_l)); | ||||
|       coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2); | ||||
|       coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); | ||||
|     } | ||||
|   }); | ||||
|   M5Dtime+=usecond(); | ||||
| @@ -139,39 +148,41 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|     uint64_t ss=sss*Ls; | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp; | ||||
|     spinor tmp, acc, res; | ||||
|  | ||||
|     // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops | ||||
|     // Apply (L^{\prime})^{-1} | ||||
|     coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] | ||||
|     for(int s=1;s<Ls;s++){ | ||||
|       spProj5p(tmp,chi(ss+s-1));   | ||||
|       coalescedWrite(chi[ss+s] , psi(ss+s)-plee[s-1]*tmp); | ||||
|     // X = Nc*Ns | ||||
|     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops | ||||
|     // Apply (L^{\prime})^{-1} L_m^{-1} | ||||
|     res = psi(ss); | ||||
|     spProj5m(tmp,res); | ||||
|     acc = pleem[0]*tmp; | ||||
|     spProj5p(tmp,res); | ||||
|     coalescedWrite(chi[ss],res); | ||||
|      | ||||
|     for(int s=1;s<Ls-1;s++){ | ||||
|       res = psi(ss+s); | ||||
|       res -= plee[s-1]*tmp; | ||||
|       spProj5m(tmp,res); | ||||
|       acc += pleem[s]*tmp; | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|  | ||||
|     // L_m^{-1}  | ||||
|     for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -pleem[s]P_- chi | ||||
|       spProj5m(tmp,chi(ss+s));     | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp); | ||||
|     } | ||||
|  | ||||
|     // U_m^{-1} D^{-1} | ||||
|     for (int s=0;s<Ls-1;s++){ | ||||
|       // Chi[s] + 1/d chi[s]  | ||||
|       spProj5p(tmp,chi(ss+Ls-1));  | ||||
|       coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s)-(pueem[s]/pdee[Ls-1])*tmp); | ||||
|     }	 | ||||
|     coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); | ||||
|        | ||||
|     // Apply U^{-1} | ||||
|     res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc; | ||||
|      | ||||
|     // Apply U_m^{-1} D^{-1} U^{-1} | ||||
|     res = (1.0/pdee[Ls-1])*res; | ||||
|     coalescedWrite(chi[ss+Ls-1],res); | ||||
|     spProj5p(acc,res); | ||||
|     spProj5m(tmp,res); | ||||
|     for (int s=Ls-2;s>=0;s--){ | ||||
|       spProj5m(tmp,chi(ss+s+1));   | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); | ||||
|       res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
|   MooeeInvTime+=usecond(); | ||||
|  | ||||
|    | ||||
| } | ||||
|  | ||||
| template<class Impl> | ||||
| @@ -201,31 +212,36 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|     uint64_t ss=sss*Ls; | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp; | ||||
|     spinor tmp, acc, res; | ||||
|  | ||||
|     // Apply (U^{\prime})^{-dagger} | ||||
|     coalescedWrite(chi[ss],psi(ss)); | ||||
|     for (int s=1;s<Ls;s++){ | ||||
|       spProj5m(tmp,chi(ss+s-1)); | ||||
|       coalescedWrite(chi[ss+s], psi(ss+s)-conjugate(puee[s-1])*tmp); | ||||
|     // X = Nc*Ns | ||||
|     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops | ||||
|     // Apply (U^{\prime})^{-dagger} U_m^{-\dagger} | ||||
|     res = psi(ss); | ||||
|     spProj5p(tmp,res); | ||||
|     acc = conjugate(pueem[0])*tmp; | ||||
|     spProj5m(tmp,res); | ||||
|     coalescedWrite(chi[ss],res); | ||||
|      | ||||
|     for(int s=1;s<Ls-1;s++){ | ||||
|       res = psi(ss+s); | ||||
|       res -= conjugate(puee[s-1])*tmp; | ||||
|       spProj5p(tmp,res); | ||||
|       acc += conjugate(pueem[s])*tmp; | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|     // U_m^{-\dagger}  | ||||
|     for (int s=0;s<Ls-1;s++){ | ||||
|       spProj5p(tmp,chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp); | ||||
|     } | ||||
|  | ||||
|     // L_m^{-\dagger} D^{-dagger} | ||||
|     for (int s=0;s<Ls-1;s++){ | ||||
|       spProj5m(tmp,chi(ss+Ls-1)); | ||||
|       coalescedWrite(chi[ss+s], conjugate(1.0/pdee[s])*chi(ss+s)-conjugate(pleem[s]/pdee[Ls-1])*tmp); | ||||
|     }	 | ||||
|     coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*chi(ss+Ls-1)); | ||||
|    | ||||
|     // Apply L^{-dagger} | ||||
|     res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc; | ||||
|      | ||||
|     // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} | ||||
|     res = (1.0/pdee[Ls-1])*res; | ||||
|     coalescedWrite(chi[ss+Ls-1],res); | ||||
|     spProj5m(acc,res); | ||||
|     spProj5p(tmp,res); | ||||
|     for (int s=Ls-2;s>=0;s--){ | ||||
|       spProj5p(tmp,chi(ss+s+1)); | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp); | ||||
|       res = (1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc; | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|   }); | ||||
|   MooeeInvTime+=usecond(); | ||||
|   | ||||
| @@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: David Murphy <dmurphy@phys.columbia.edu> | ||||
| Author: Gianluca Filaci <g.filaci@ed.ac.uk> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| @@ -49,6 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi | ||||
|   auto psi = psi_i.View(); | ||||
|   auto chi = chi_i.View(); | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||
|   this->M5Dcalls++; | ||||
|   this->M5Dtime -= usecond(); | ||||
| @@ -63,7 +67,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi | ||||
|       uint64_t idx_l = ss+((s+Ls-1)%Ls); | ||||
|       spProj5m(tmp1, psi(idx_u)); | ||||
|       spProj5p(tmp2, psi(idx_l)); | ||||
|       coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); | ||||
|       coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
| @@ -82,6 +86,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio | ||||
|   auto phi = phi_i.View(); | ||||
|   auto chi = chi_i.View(); | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|  | ||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||
|   this->M5Dcalls++; | ||||
| @@ -97,7 +104,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio | ||||
|       uint64_t idx_l = ss+((s+Ls-1)%Ls); | ||||
|       spProj5p(tmp1, psi(idx_u)); | ||||
|       spProj5m(tmp2, psi(idx_l)); | ||||
|       coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); | ||||
|       coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
| @@ -124,36 +131,37 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie | ||||
|   this->MooeeInvTime -= usecond(); | ||||
|   uint64_t nloop=grid->oSites()/Ls; | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|     auto ss=sss*Ls; | ||||
|     uint64_t ss=sss*Ls; | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp1,tmp2; | ||||
|     spinor tmp, acc, res; | ||||
|  | ||||
|     // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops | ||||
|     // Apply (L^{\prime})^{-1} | ||||
|     coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0] | ||||
|     for(int s=1; s<Ls; s++){ | ||||
|       spProj5p(tmp1, chi(ss+s-1)); | ||||
|       coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1); | ||||
|     // Apply (L^{\prime})^{-1} L_m^{-1} | ||||
|     res = psi(ss); | ||||
|     spProj5m(tmp,res); | ||||
|     acc = pleem[0]*tmp; | ||||
|     spProj5p(tmp,res); | ||||
|     coalescedWrite(chi[ss],res); | ||||
|      | ||||
|     for(int s=1;s<Ls-1;s++){ | ||||
|       res = psi(ss+s); | ||||
|       res -= plee[s-1]*tmp; | ||||
|       spProj5m(tmp,res); | ||||
|       acc += pleem[s]*tmp; | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|  | ||||
|     // L_m^{-1} | ||||
|     for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi | ||||
|       spProj5m(tmp1, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1); | ||||
|     } | ||||
|  | ||||
|     // U_m^{-1} D^{-1} | ||||
|     for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s] | ||||
|       spProj5p(tmp1, chi(ss+Ls-1)); | ||||
|       coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls])*tmp1); | ||||
|     } | ||||
|     spProj5m(tmp2, chi(ss+Ls-1)); | ||||
|     coalescedWrite(chi[ss+Ls-1],(1.0/pdee[Ls])*tmp1 + (1.0/pdee[Ls-1])*tmp2); | ||||
|  | ||||
|     // Apply U^{-1} | ||||
|     for(int s=Ls-2; s>=0; s--){ | ||||
|       spProj5m(tmp1, chi(ss+s+1)); | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp1); | ||||
|     res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc; | ||||
|      | ||||
|     // Apply U_m^{-1} D^{-1} U^{-1} | ||||
|     acc = (1.0/pdee[Ls  ])*res; | ||||
|     tmp = (1.0/pdee[Ls-1])*res; | ||||
|     spProj5p(acc,acc); | ||||
|     spProj5m(tmp,tmp); | ||||
|     coalescedWrite(chi[ss+Ls-1], acc + tmp); | ||||
|     for (int s=Ls-2;s>=0;s--){ | ||||
|       res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|   }); | ||||
|   this->MooeeInvTime += usecond(); | ||||
| @@ -168,56 +176,50 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion | ||||
|   auto chi = chi_i.View(); | ||||
|   int Ls = this->Ls; | ||||
|  | ||||
|   auto plee  = & this->lee[0]; | ||||
|   auto pdee  = & this->dee[0]; | ||||
|   auto puee  = & this->uee[0]; | ||||
|  | ||||
|   auto pleem = & this->leem[0]; | ||||
|   auto pueem = & this->ueem[0]; | ||||
|  | ||||
|   assert(psi.Checkerboard() == psi.Checkerboard()); | ||||
|  | ||||
|   Vector<Coeff_t> ueec(Ls); | ||||
|   Vector<Coeff_t> deec(Ls+1); | ||||
|   Vector<Coeff_t> leec(Ls); | ||||
|   Vector<Coeff_t> ueemc(Ls); | ||||
|   Vector<Coeff_t> leemc(Ls); | ||||
|  | ||||
|   for(int s=0; s<ueec.size(); s++){ | ||||
|     ueec[s]  = conjugate(this->uee[s]); | ||||
|     deec[s]  = conjugate(this->dee[s]); | ||||
|     leec[s]  = conjugate(this->lee[s]); | ||||
|     ueemc[s] = conjugate(this->ueem[s]); | ||||
|     leemc[s] = conjugate(this->leem[s]); | ||||
|   } | ||||
|   deec[Ls] = conjugate(this->dee[Ls]); | ||||
|  | ||||
|   this->MooeeInvCalls++; | ||||
|   this->MooeeInvTime -= usecond(); | ||||
|   auto nloop = grid->oSites()/Ls; | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|     uint64_t ss=sss*Ls; | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp1,tmp2; | ||||
|     auto ss=sss*Ls; | ||||
|     spinor tmp, acc, res; | ||||
|  | ||||
|     // Apply (U^{\prime})^{-dagger} | ||||
|     coalescedWrite(chi[ss], psi(ss)); | ||||
|     for(int s=1; s<Ls; s++){ | ||||
|       spProj5m(tmp1, chi(ss+s-1)); | ||||
|       coalescedWrite(chi[ss+s], psi(ss+s) - ueec[s-1]*tmp1); | ||||
|     // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}  | ||||
|     res = psi(ss); | ||||
|     spProj5p(tmp,res); | ||||
|     acc = conjugate(pueem[0])*tmp; | ||||
|     spProj5m(tmp,res); | ||||
|     coalescedWrite(chi[ss],res); | ||||
|      | ||||
|     for(int s=1;s<Ls-1;s++){ | ||||
|       res = psi(ss+s); | ||||
|       res -= conjugate(puee[s-1])*tmp; | ||||
|       spProj5p(tmp,res); | ||||
|       acc += conjugate(pueem[s])*tmp; | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|  | ||||
|     // U_m^{-\dagger} | ||||
|     for(int s=0; s<Ls-1; s++){ | ||||
|       spProj5p(tmp1, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - ueemc[s]*tmp1); | ||||
|     } | ||||
|  | ||||
|     // L_m^{-\dagger} D^{-dagger} | ||||
|     for(int s=0; s<Ls-1; s++){ | ||||
|       spProj5m(tmp1, chi(ss+Ls-1)); | ||||
|       coalescedWrite(chi[ss+s] ,(1.0/deec[s])*chi(ss+s) - (leemc[s]/deec[Ls-1])*tmp1); | ||||
|     } | ||||
|     spProj5p(tmp2, chi(ss+Ls-1)); | ||||
|     coalescedWrite(chi[ss+Ls-1], (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2); | ||||
|  | ||||
|     // Apply L^{-dagger} | ||||
|     for(int s=Ls-2; s>=0; s--){ | ||||
|       spProj5p(tmp1, chi(ss+s+1)); | ||||
|       coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1); | ||||
|     res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc; | ||||
|      | ||||
|     // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} | ||||
|     acc = conjugate(1.0/pdee[Ls-1])*res; | ||||
|     tmp = conjugate(1.0/pdee[Ls  ])*res; | ||||
|     spProj5m(acc,acc); | ||||
|     spProj5p(tmp,tmp); | ||||
|     coalescedWrite(chi[ss+Ls-1], acc + tmp); | ||||
|     for (int s=Ls-2;s>=0;s--){ | ||||
|       res = conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc; | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
|   | ||||
| @@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: David Murphy <dmurphy@phys.columbia.edu> | ||||
| Author: Gianluca Filaci <g.filaci@ed.ac.uk> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| @@ -49,6 +50,10 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField | ||||
|  | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|  | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|  | ||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||
|   this->M5Dcalls++; | ||||
|   this->M5Dtime -= usecond(); | ||||
| @@ -64,7 +69,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField | ||||
|       uint64_t idx_l = ss+((s+Ls-1)%Ls); | ||||
|       spProj5m(tmp1, psi(idx_u)); | ||||
|       spProj5p(tmp2, psi(idx_l)); | ||||
|       coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); | ||||
|       coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
| @@ -88,6 +93,11 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion | ||||
|  | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|  | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|   auto pshift_coeffs = &shift_coeffs[0]; | ||||
|  | ||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||
|   this->M5Dcalls++; | ||||
|   this->M5Dtime -= usecond(); | ||||
| @@ -108,7 +118,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion | ||||
|       if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); } | ||||
|       else       { spProj5m(tmp, psi(ss+shift_s)); } | ||||
|  | ||||
|       coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp); | ||||
|       coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 +plower[s]*tmp2 + pshift_coeffs[s]*tmp); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
| @@ -128,6 +138,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie | ||||
|  | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|  | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|  | ||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||
|   this->M5Dcalls++; | ||||
|   this->M5Dtime -= usecond(); | ||||
| @@ -144,7 +158,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie | ||||
|       uint64_t idx_l = ss+((s+Ls-1)%Ls); | ||||
|       spProj5p(tmp1, psi(idx_u)); | ||||
|       spProj5m(tmp2, psi(idx_l)); | ||||
|       coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); | ||||
|       coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
| @@ -166,6 +180,11 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm | ||||
|  | ||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||
|  | ||||
|   auto pdiag = &diag[0]; | ||||
|   auto pupper = &upper[0]; | ||||
|   auto plower = &lower[0]; | ||||
|   auto pshift_coeffs = &shift_coeffs[0]; | ||||
|  | ||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||
|   this->M5Dcalls++; | ||||
|   this->M5Dtime -= usecond(); | ||||
| @@ -189,12 +208,12 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm | ||||
|       spProj5p(tmp1, psi(idx_u)); | ||||
|       spProj5m(tmp2, psi(idx_l)); | ||||
|  | ||||
|       if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); | ||||
|       else          coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2); | ||||
|       if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); | ||||
|       else          coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2); | ||||
|       if(pm == 1){ spProj5p(tmp, psi(ss+s)); } | ||||
|       else       { spProj5m(tmp, psi(ss+s)); } | ||||
|  | ||||
|       coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp); | ||||
|       coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+pshift_coeffs[s]*tmp); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
| @@ -223,36 +242,38 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField & | ||||
|  | ||||
|   int nloop = grid->oSites()/Ls; | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|  | ||||
|     uint64_t ss = sss*Ls; | ||||
|  | ||||
|     uint64_t ss=sss*Ls; | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp; | ||||
|     spinor tmp, acc, res; | ||||
|  | ||||
|     // Apply (L^{\prime})^{-1} | ||||
|     coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] | ||||
|     for(int s=1; s<Ls; s++){ | ||||
|       spProj5p(tmp, chi(ss+s-1)); | ||||
|       coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp); | ||||
|     // X = Nc*Ns | ||||
|     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops | ||||
|     // Apply (L^{\prime})^{-1} L_m^{-1} | ||||
|     res = psi(ss); | ||||
|     spProj5m(tmp,res); | ||||
|     acc = pleem[0]*tmp; | ||||
|     spProj5p(tmp,res); | ||||
|     coalescedWrite(chi[ss],res); | ||||
|      | ||||
|     for(int s=1;s<Ls-1;s++){ | ||||
|       res = psi(ss+s); | ||||
|       res -= plee[s-1]*tmp; | ||||
|       spProj5m(tmp,res); | ||||
|       acc += pleem[s]*tmp; | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|  | ||||
|     // L_m^{-1} | ||||
|     for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi | ||||
|       spProj5m(tmp, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp); | ||||
|     } | ||||
|  | ||||
|     // U_m^{-1} D^{-1} | ||||
|     for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s] | ||||
|       spProj5p(tmp, chi(ss+Ls-1)); | ||||
|       coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp); | ||||
|     } | ||||
|     coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); | ||||
|  | ||||
|     // Apply U^{-1} | ||||
|     for(int s=Ls-2; s>=0; s--){ | ||||
|       spProj5m(tmp, chi(ss+s+1)); | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp); | ||||
|     res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc; | ||||
|      | ||||
|     // Apply U_m^{-1} D^{-1} U^{-1} | ||||
|     res = (1.0/pdee[Ls-1])*res; | ||||
|     coalescedWrite(chi[ss+Ls-1],res); | ||||
|     spProj5p(acc,res); | ||||
|     spProj5m(tmp,res); | ||||
|     for (int s=Ls-2;s>=0;s--){ | ||||
|       res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|   }); | ||||
|     | ||||
| @@ -281,45 +302,45 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF | ||||
|  | ||||
|   int nloop = grid->oSites()/Ls; | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|       uint64_t ss=sss*Ls; | ||||
|       typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|       spinor tmp, acc, res, tmp_spProj; | ||||
|  | ||||
|     uint64_t ss = sss*Ls; | ||||
|       // Apply (L^{\prime})^{-1} L_m^{-1} | ||||
|       res = psi(ss); | ||||
|       spProj5m(tmp,res); | ||||
|       acc = pleem[0]*tmp; | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss],res); | ||||
|       tmp_spProj = pMooeeInv_shift_lc[0]*res; | ||||
|  | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp1,tmp2,tmp2_spProj; | ||||
|       for(int s=1;s<Ls-1;s++){ | ||||
| 	res = psi(ss+s); | ||||
| 	tmp_spProj += pMooeeInv_shift_lc[s]*res; | ||||
| 	res -= plee[s-1]*tmp; | ||||
| 	spProj5m(tmp,res); | ||||
| 	acc += pleem[s]*tmp; | ||||
| 	spProj5p(tmp,res); | ||||
| 	coalescedWrite(chi[ss+s],res); | ||||
|       } | ||||
|       res = psi(ss+Ls-1); | ||||
|  | ||||
|     // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2 | ||||
|     coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0] | ||||
|     tmp2 = pMooeeInv_shift_lc[0]*psi(ss); | ||||
|     for(int s=1; s<Ls; s++){ | ||||
|       spProj5p(tmp1, chi(ss+s-1)); | ||||
|       coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1); | ||||
|       tmp2 = tmp2 + pMooeeInv_shift_lc[s]*psi(ss+s); | ||||
|     } | ||||
|     if(pm == 1){ spProj5p(tmp2_spProj, tmp2);} | ||||
|     else       { spProj5m(tmp2_spProj, tmp2); } | ||||
|       tmp_spProj += pMooeeInv_shift_lc[Ls-1]*res; | ||||
|       if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj);} | ||||
|       else       { spProj5m(tmp_spProj, tmp_spProj); } | ||||
|  | ||||
|     // L_m^{-1} | ||||
|     for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi | ||||
|       spProj5m(tmp1, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1); | ||||
|     } | ||||
|       res = res - plee[Ls-2]*tmp - acc; | ||||
|  | ||||
|     // U_m^{-1} D^{-1} | ||||
|     for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s] | ||||
|       spProj5p(tmp1, chi(ss+Ls-1)); | ||||
|       coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp1); | ||||
|     } | ||||
|     // chi[ss+Ls-1] = (1.0/pdee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj; | ||||
|     coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); | ||||
|     spProj5m(tmp1, chi(ss+Ls-1)); | ||||
|     coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInv_shift_norm[Ls-1]*tmp2_spProj); | ||||
|  | ||||
|     // Apply U^{-1} and add shift term | ||||
|     for(int s=Ls-2; s>=0; s--){ | ||||
|       coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1); | ||||
|       spProj5m(tmp1, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj); | ||||
|     } | ||||
|       // Apply U_m^{-1} D^{-1} U^{-1} | ||||
|       res = (1.0/pdee[Ls-1])*res; | ||||
|       spProj5p(acc,res); | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss+Ls-1], res + pMooeeInv_shift_norm[Ls-1]*tmp_spProj); | ||||
|       for (int s=Ls-2;s>=0;s--){ | ||||
| 	res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc; | ||||
| 	spProj5m(tmp,res); | ||||
| 	coalescedWrite(chi[ss+s], res + pMooeeInv_shift_norm[s]*tmp_spProj); | ||||
|       } | ||||
|   }); | ||||
|  | ||||
|   this->MooeeInvTime += usecond(); | ||||
| @@ -347,39 +368,40 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel | ||||
|  | ||||
|   int nloop = grid->oSites()/Ls; | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|  | ||||
|     uint64_t ss = sss*Ls; | ||||
|  | ||||
|     uint64_t ss=sss*Ls; | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp; | ||||
|     spinor tmp, acc, res; | ||||
|  | ||||
|     // Apply (U^{\prime})^{-dag} | ||||
|     coalescedWrite(chi[ss], psi(ss)); | ||||
|     for(int s=1; s<Ls; s++){ | ||||
|       spProj5m(tmp, chi(ss+s-1)); | ||||
|       coalescedWrite(chi[ss+s], psi(ss+s) - puee[s-1]*tmp); | ||||
|     } | ||||
|     // X = Nc*Ns | ||||
|     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops | ||||
|     // Apply (U^{\prime})^{-dagger} U_m^{-\dagger} | ||||
|     res = psi(ss); | ||||
|     spProj5p(tmp,res); | ||||
|     acc = pueem[0]*tmp; | ||||
|     spProj5m(tmp,res); | ||||
|     coalescedWrite(chi[ss],res); | ||||
|      | ||||
|     // U_m^{-\dag} | ||||
|     for(int s=0; s<Ls-1; s++){ | ||||
|       spProj5p(tmp, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp); | ||||
|     for(int s=1;s<Ls-1;s++){ | ||||
|       res = psi(ss+s); | ||||
|       res -= puee[s-1]*tmp; | ||||
|       spProj5p(tmp,res); | ||||
|       acc += pueem[s]*tmp; | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|  | ||||
|     // L_m^{-\dag} D^{-dag} | ||||
|     for(int s=0; s<Ls-1; s++){ | ||||
|       spProj5m(tmp, chi(ss+Ls-1)); | ||||
|       coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp); | ||||
|     } | ||||
|     coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); | ||||
|  | ||||
|     // Apply L^{-dag} | ||||
|     for(int s=Ls-2; s>=0; s--){ | ||||
|       spProj5p(tmp, chi(ss+s+1)); | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp); | ||||
|     res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc; | ||||
|      | ||||
|     // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} | ||||
|     res = (1.0/pdee[Ls-1])*res; | ||||
|     coalescedWrite(chi[ss+Ls-1],res); | ||||
|     spProj5m(acc,res); | ||||
|     spProj5p(tmp,res); | ||||
|     for (int s=Ls-2;s>=0;s--){ | ||||
|       res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc; | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss+s],res); | ||||
|     } | ||||
|   }); | ||||
|  | ||||
|   this->MooeeInvTime += usecond(); | ||||
| } | ||||
|  | ||||
| @@ -406,45 +428,45 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi | ||||
|  | ||||
|   int nloop = grid->oSites()/Ls; | ||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||
|       uint64_t ss=sss*Ls; | ||||
|       typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|       spinor tmp, acc, res, tmp_spProj; | ||||
|  | ||||
|     uint64_t ss = sss*Ls; | ||||
|       // Apply (U^{\prime})^{-dagger} U_m^{-\dagger} | ||||
|       res = psi(ss); | ||||
|       spProj5p(tmp,res); | ||||
|       acc = pueem[0]*tmp; | ||||
|       spProj5m(tmp,res); | ||||
|       coalescedWrite(chi[ss],res); | ||||
|       tmp_spProj = pMooeeInvDag_shift_lc[0]*res; | ||||
|  | ||||
|     typedef decltype(coalescedRead(psi[0])) spinor; | ||||
|     spinor tmp1,tmp2,tmp2_spProj; | ||||
|       for(int s=1;s<Ls-1;s++){ | ||||
| 	res = psi(ss+s); | ||||
| 	tmp_spProj += pMooeeInvDag_shift_lc[s]*res; | ||||
| 	res -= puee[s-1]*tmp; | ||||
| 	spProj5p(tmp,res); | ||||
| 	acc += pueem[s]*tmp; | ||||
| 	spProj5m(tmp,res); | ||||
| 	coalescedWrite(chi[ss+s],res); | ||||
|       } | ||||
|       res = psi(ss+Ls-1); | ||||
|  | ||||
|     // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2 | ||||
|     coalescedWrite(chi[ss], psi(ss)); | ||||
|     tmp2 = pMooeeInvDag_shift_lc[0]*psi(ss); | ||||
|     for(int s=1; s<Ls; s++){ | ||||
|       spProj5m(tmp1, chi(ss+s-1)); | ||||
|       coalescedWrite(chi[ss+s],psi(ss+s) - puee[s-1]*tmp1); | ||||
|       tmp2 = tmp2 + pMooeeInvDag_shift_lc[s]*psi(ss+s); | ||||
|     } | ||||
|       tmp_spProj += pMooeeInvDag_shift_lc[Ls-1]*res; | ||||
|       if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj); } | ||||
|       else       { spProj5m(tmp_spProj, tmp_spProj); } | ||||
|  | ||||
|     if(pm == 1){ spProj5p(tmp2_spProj, tmp2);} | ||||
|     else       { spProj5m(tmp2_spProj, tmp2);} | ||||
|       res = res - puee[Ls-2]*tmp - acc; | ||||
|  | ||||
|     // U_m^{-\dag} | ||||
|     for(int s=0; s<Ls-1; s++){ | ||||
|       spProj5p(tmp1, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp1); | ||||
|     } | ||||
|  | ||||
|     // L_m^{-\dag} D^{-dag} | ||||
|     for(int s=0; s<Ls-1; s++){ | ||||
|       spProj5m(tmp1, chi(ss+Ls-1)); | ||||
|       coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp1); | ||||
|     } | ||||
|     coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1)); | ||||
|     spProj5p(tmp1, chi(ss+Ls-1)); | ||||
|     coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInvDag_shift_norm[Ls-1]*tmp2_spProj); | ||||
|  | ||||
|     // Apply L^{-dag} | ||||
|     for(int s=Ls-2; s>=0; s--){ | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1); | ||||
|       spProj5p(tmp1, chi(ss+s)); | ||||
|       coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj); | ||||
|     } | ||||
|       // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} | ||||
|       res = (1.0/pdee[Ls-1])*res; | ||||
|       spProj5m(acc,res); | ||||
|       spProj5p(tmp,res); | ||||
|       coalescedWrite(chi[ss+Ls-1], res + pMooeeInvDag_shift_norm[Ls-1]*tmp_spProj); | ||||
|       for (int s=Ls-2;s>=0;s--){ | ||||
| 	res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc; | ||||
| 	spProj5p(tmp,res); | ||||
| 	coalescedWrite(chi[ss+s], res + pMooeeInvDag_shift_norm[s]*tmp_spProj); | ||||
|       } | ||||
|   }); | ||||
|  | ||||
|   this->MooeeInvTime += usecond(); | ||||
|   | ||||
| @@ -281,6 +281,9 @@ case ${CXX} in | ||||
|     CXX="nvcc -x cu " | ||||
|     CXXLD="nvcc -link" | ||||
|     CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" | ||||
|     if test $ac_openmp = yes; then | ||||
|        CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" | ||||
|     fi | ||||
|     ;; | ||||
|   *) | ||||
|     CXXLD=${CXX} | ||||
|   | ||||
		Reference in New Issue
	
	Block a user