mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-26 17:49:33 +00:00 
			
		
		
		
	Re-import DWF and abstract base EOFA fermion classes and tests
This commit is contained in:
		
							
								
								
									
										605
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										605
									
								
								lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,605 @@ | ||||
| /************************************************************************************* | ||||
|  | ||||
| Grid physics library, www.github.com/paboyle/Grid | ||||
|  | ||||
| Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc | ||||
|  | ||||
| Copyright (C) 2017 | ||||
|  | ||||
| Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | ||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> | ||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
| Author: David Murphy <dmurphy@phys.columbia.edu> | ||||
|  | ||||
| This program is free software; you can redistribute it and/or modify | ||||
| it under the terms of the GNU General Public License as published by | ||||
| the Free Software Foundation; either version 2 of the License, or | ||||
| (at your option) any later version. | ||||
|  | ||||
| This program is distributed in the hope that it will be useful, | ||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
| GNU General Public License for more details. | ||||
|  | ||||
| You should have received a copy of the GNU General Public License along | ||||
| with this program; if not, write to the Free Software Foundation, Inc., | ||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|  | ||||
| See the full license in the file "LICENSE" in the top level distribution directory | ||||
| *************************************************************************************/ | ||||
| /*  END LEGAL */ | ||||
|  | ||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | ||||
| #include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h> | ||||
|  | ||||
| namespace Grid { | ||||
| namespace QCD { | ||||
|  | ||||
|     /* | ||||
|     * Dense matrix versions of routines | ||||
|     */ | ||||
|     template<class Impl> | ||||
|     void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi) | ||||
|     { | ||||
|         this->MooeeInternal(psi, chi, DaggerYes, InverseYes); | ||||
|     } | ||||
|  | ||||
|     template<class Impl> | ||||
|     void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi) | ||||
|     { | ||||
|         this->MooeeInternal(psi, chi, DaggerNo, InverseYes); | ||||
|     } | ||||
|  | ||||
|     template<class Impl> | ||||
|     void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi, | ||||
|         FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper) | ||||
|     { | ||||
|         GridBase* grid = psi._grid; | ||||
|         int Ls  = this->Ls; | ||||
|         int LLs = grid->_rdimensions[0]; | ||||
|         const int nsimd = Simd::Nsimd(); | ||||
|  | ||||
|         Vector<iSinglet<Simd> > u(LLs); | ||||
|         Vector<iSinglet<Simd> > l(LLs); | ||||
|         Vector<iSinglet<Simd> > d(LLs); | ||||
|  | ||||
|         assert(Ls/LLs == nsimd); | ||||
|         assert(phi.checkerboard == psi.checkerboard); | ||||
|  | ||||
|         chi.checkerboard = psi.checkerboard; | ||||
|  | ||||
|         // just directly address via type pun | ||||
|         typedef typename Simd::scalar_type scalar_type; | ||||
|         scalar_type* u_p = (scalar_type*) &u[0]; | ||||
|         scalar_type* l_p = (scalar_type*) &l[0]; | ||||
|         scalar_type* d_p = (scalar_type*) &d[0]; | ||||
|  | ||||
|         for(int o=0;o<LLs;o++){ // outer | ||||
|         for(int i=0;i<nsimd;i++){ //inner | ||||
|             int s  = o + i*LLs; | ||||
|             int ss = o*nsimd + i; | ||||
|             u_p[ss] = upper[s]; | ||||
|             l_p[ss] = lower[s]; | ||||
|             d_p[ss] = diag[s]; | ||||
|         }} | ||||
|  | ||||
|         this->M5Dcalls++; | ||||
|         this->M5Dtime -= usecond(); | ||||
|  | ||||
|         assert(Nc == 3); | ||||
|  | ||||
|         parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs | ||||
|  | ||||
|             #if 0 | ||||
|  | ||||
|                 alignas(64) SiteHalfSpinor hp; | ||||
|                 alignas(64) SiteHalfSpinor hm; | ||||
|                 alignas(64) SiteSpinor fp; | ||||
|                 alignas(64) SiteSpinor fm; | ||||
|  | ||||
|                 for(int v=0; v<LLs; v++){ | ||||
|  | ||||
|                     int vp = (v+1)%LLs; | ||||
|                     int vm = (v+LLs-1)%LLs; | ||||
|  | ||||
|                     spProj5m(hp, psi[ss+vp]); | ||||
|                     spProj5p(hm, psi[ss+vm]); | ||||
|  | ||||
|                     if (vp <= v){ rotate(hp, hp, 1); } | ||||
|                     if (vm >= v){ rotate(hm, hm, nsimd-1); } | ||||
|  | ||||
|                     hp = 0.5*hp; | ||||
|                     hm = 0.5*hm; | ||||
|  | ||||
|                     spRecon5m(fp, hp); | ||||
|                     spRecon5p(fm, hm); | ||||
|  | ||||
|                     chi[ss+v] = d[v]*phi[ss+v]; | ||||
|                     chi[ss+v] = chi[ss+v] + u[v]*fp; | ||||
|                     chi[ss+v] = chi[ss+v] + l[v]*fm; | ||||
|  | ||||
|                 } | ||||
|  | ||||
|             #else | ||||
|  | ||||
|                 for(int v=0; v<LLs; v++){ | ||||
|  | ||||
|                     vprefetch(psi[ss+v+LLs]); | ||||
|  | ||||
|                     int vp = (v==LLs-1) ? 0     : v+1; | ||||
|                     int vm = (v==0)     ? LLs-1 : v-1; | ||||
|  | ||||
|                     Simd hp_00 = psi[ss+vp]()(2)(0); | ||||
|                     Simd hp_01 = psi[ss+vp]()(2)(1); | ||||
|                     Simd hp_02 = psi[ss+vp]()(2)(2); | ||||
|                     Simd hp_10 = psi[ss+vp]()(3)(0); | ||||
|                     Simd hp_11 = psi[ss+vp]()(3)(1); | ||||
|                     Simd hp_12 = psi[ss+vp]()(3)(2); | ||||
|  | ||||
|                     Simd hm_00 = psi[ss+vm]()(0)(0); | ||||
|                     Simd hm_01 = psi[ss+vm]()(0)(1); | ||||
|                     Simd hm_02 = psi[ss+vm]()(0)(2); | ||||
|                     Simd hm_10 = psi[ss+vm]()(1)(0); | ||||
|                     Simd hm_11 = psi[ss+vm]()(1)(1); | ||||
|                     Simd hm_12 = psi[ss+vm]()(1)(2); | ||||
|  | ||||
|                     if(vp <= v){ | ||||
|                         hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v); | ||||
|                         hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); | ||||
|                         hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); | ||||
|                         hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); | ||||
|                         hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); | ||||
|                         hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); | ||||
|                     } | ||||
|  | ||||
|                     if(vm >= v){ | ||||
|                         hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); | ||||
|                         hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); | ||||
|                         hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); | ||||
|                         hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); | ||||
|                         hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); | ||||
|                         hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); | ||||
|                     } | ||||
|  | ||||
|                     // Can force these to real arithmetic and save 2x. | ||||
|                     Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00); | ||||
|                     Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01); | ||||
|                     Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02); | ||||
|                     Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10); | ||||
|                     Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11); | ||||
|                     Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12); | ||||
|                     Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00); | ||||
|                     Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01); | ||||
|                     Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02); | ||||
|                     Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10); | ||||
|                     Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11); | ||||
|                     Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12); | ||||
|  | ||||
|                     vstream(chi[ss+v]()(0)(0), p_00); | ||||
|                     vstream(chi[ss+v]()(0)(1), p_01); | ||||
|                     vstream(chi[ss+v]()(0)(2), p_02); | ||||
|                     vstream(chi[ss+v]()(1)(0), p_10); | ||||
|                     vstream(chi[ss+v]()(1)(1), p_11); | ||||
|                     vstream(chi[ss+v]()(1)(2), p_12); | ||||
|                     vstream(chi[ss+v]()(2)(0), p_20); | ||||
|                     vstream(chi[ss+v]()(2)(1), p_21); | ||||
|                     vstream(chi[ss+v]()(2)(2), p_22); | ||||
|                     vstream(chi[ss+v]()(3)(0), p_30); | ||||
|                     vstream(chi[ss+v]()(3)(1), p_31); | ||||
|                     vstream(chi[ss+v]()(3)(2), p_32); | ||||
|                 } | ||||
|  | ||||
|             #endif | ||||
|         } | ||||
|  | ||||
|         this->M5Dtime += usecond(); | ||||
|     } | ||||
|  | ||||
|     template<class Impl> | ||||
|     void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi, | ||||
|         FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper) | ||||
|     { | ||||
|         GridBase* grid = psi._grid; | ||||
|         int Ls  = this->Ls; | ||||
|         int LLs = grid->_rdimensions[0]; | ||||
|         int nsimd = Simd::Nsimd(); | ||||
|  | ||||
|         Vector<iSinglet<Simd> > u(LLs); | ||||
|         Vector<iSinglet<Simd> > l(LLs); | ||||
|         Vector<iSinglet<Simd> > d(LLs); | ||||
|  | ||||
|         assert(Ls/LLs == nsimd); | ||||
|         assert(phi.checkerboard == psi.checkerboard); | ||||
|  | ||||
|         chi.checkerboard = psi.checkerboard; | ||||
|  | ||||
|         // just directly address via type pun | ||||
|         typedef typename Simd::scalar_type scalar_type; | ||||
|         scalar_type* u_p = (scalar_type*) &u[0]; | ||||
|         scalar_type* l_p = (scalar_type*) &l[0]; | ||||
|         scalar_type* d_p = (scalar_type*) &d[0]; | ||||
|  | ||||
|         for(int o=0; o<LLs; o++){ // outer | ||||
|         for(int i=0; i<nsimd; i++){ //inner | ||||
|             int s  = o + i*LLs; | ||||
|             int ss = o*nsimd + i; | ||||
|             u_p[ss] = upper[s]; | ||||
|             l_p[ss] = lower[s]; | ||||
|             d_p[ss] = diag[s]; | ||||
|         }} | ||||
|  | ||||
|         this->M5Dcalls++; | ||||
|         this->M5Dtime -= usecond(); | ||||
|  | ||||
|         parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs | ||||
|  | ||||
|         #if 0 | ||||
|  | ||||
|             alignas(64) SiteHalfSpinor hp; | ||||
|             alignas(64) SiteHalfSpinor hm; | ||||
|             alignas(64) SiteSpinor fp; | ||||
|             alignas(64) SiteSpinor fm; | ||||
|  | ||||
|             for(int v=0; v<LLs; v++){ | ||||
|  | ||||
|                 int vp = (v+1)%LLs; | ||||
|                 int vm = (v+LLs-1)%LLs; | ||||
|  | ||||
|                 spProj5p(hp, psi[ss+vp]); | ||||
|                 spProj5m(hm, psi[ss+vm]); | ||||
|  | ||||
|                 if(vp <= v){ rotate(hp, hp, 1); } | ||||
|                 if(vm >= v){ rotate(hm, hm, nsimd-1); } | ||||
|  | ||||
|                 hp = hp*0.5; | ||||
|                 hm = hm*0.5; | ||||
|                 spRecon5p(fp, hp); | ||||
|                 spRecon5m(fm, hm); | ||||
|  | ||||
|                 chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; | ||||
|                 chi[ss+v] = chi[ss+v]     +l[v]*fm; | ||||
|             } | ||||
|  | ||||
|         #else | ||||
|  | ||||
|             for(int v=0; v<LLs; v++){ | ||||
|  | ||||
|                 vprefetch(psi[ss+v+LLs]); | ||||
|  | ||||
|                 int vp = (v == LLs-1) ? 0     : v+1; | ||||
|                 int vm = (v == 0    ) ? LLs-1 : v-1; | ||||
|  | ||||
|                 Simd hp_00 = psi[ss+vp]()(0)(0); | ||||
|                 Simd hp_01 = psi[ss+vp]()(0)(1); | ||||
|                 Simd hp_02 = psi[ss+vp]()(0)(2); | ||||
|                 Simd hp_10 = psi[ss+vp]()(1)(0); | ||||
|                 Simd hp_11 = psi[ss+vp]()(1)(1); | ||||
|                 Simd hp_12 = psi[ss+vp]()(1)(2); | ||||
|  | ||||
|                 Simd hm_00 = psi[ss+vm]()(2)(0); | ||||
|                 Simd hm_01 = psi[ss+vm]()(2)(1); | ||||
|                 Simd hm_02 = psi[ss+vm]()(2)(2); | ||||
|                 Simd hm_10 = psi[ss+vm]()(3)(0); | ||||
|                 Simd hm_11 = psi[ss+vm]()(3)(1); | ||||
|                 Simd hm_12 = psi[ss+vm]()(3)(2); | ||||
|  | ||||
|                 if (vp <= v){ | ||||
|                     hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v); | ||||
|                     hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); | ||||
|                     hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); | ||||
|                     hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); | ||||
|                     hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); | ||||
|                     hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); | ||||
|                 } | ||||
|  | ||||
|                 if(vm >= v){ | ||||
|                     hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); | ||||
|                     hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); | ||||
|                     hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); | ||||
|                     hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); | ||||
|                     hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); | ||||
|                     hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); | ||||
|                 } | ||||
|  | ||||
|                 Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00); | ||||
|                 Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01); | ||||
|                 Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02); | ||||
|                 Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10); | ||||
|                 Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11); | ||||
|                 Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12); | ||||
|                 Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00); | ||||
|                 Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01); | ||||
|                 Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02); | ||||
|                 Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10); | ||||
|                 Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11); | ||||
|                 Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12); | ||||
|  | ||||
|                 vstream(chi[ss+v]()(0)(0), p_00); | ||||
|                 vstream(chi[ss+v]()(0)(1), p_01); | ||||
|                 vstream(chi[ss+v]()(0)(2), p_02); | ||||
|                 vstream(chi[ss+v]()(1)(0), p_10); | ||||
|                 vstream(chi[ss+v]()(1)(1), p_11); | ||||
|                 vstream(chi[ss+v]()(1)(2), p_12); | ||||
|                 vstream(chi[ss+v]()(2)(0), p_20); | ||||
|                 vstream(chi[ss+v]()(2)(1), p_21); | ||||
|                 vstream(chi[ss+v]()(2)(2), p_22); | ||||
|                 vstream(chi[ss+v]()(3)(0), p_30); | ||||
|                 vstream(chi[ss+v]()(3)(1), p_31); | ||||
|                 vstream(chi[ss+v]()(3)(2), p_32); | ||||
|             } | ||||
|         #endif | ||||
|  | ||||
|         } | ||||
|  | ||||
|         this->M5Dtime += usecond(); | ||||
|     } | ||||
|  | ||||
|     #ifdef AVX512 | ||||
|         #include<simd/Intel512common.h> | ||||
|         #include<simd/Intel512avx.h> | ||||
|         #include<simd/Intel512single.h> | ||||
|     #endif | ||||
|  | ||||
|     template<class Impl> | ||||
|     void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi, | ||||
|         int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm) | ||||
|     { | ||||
|         #ifndef AVX512 | ||||
|         { | ||||
|             SiteHalfSpinor BcastP; | ||||
|             SiteHalfSpinor BcastM; | ||||
|             SiteHalfSpinor SiteChiP; | ||||
|             SiteHalfSpinor SiteChiM; | ||||
|  | ||||
|             // Ls*Ls * 2 * 12 * vol flops | ||||
|             for(int s1=0; s1<LLs; s1++){ | ||||
|  | ||||
|                 for(int s2=0; s2<LLs; s2++){ | ||||
|                 for(int l=0; l < Simd::Nsimd(); l++){ // simd lane | ||||
|  | ||||
|                     int s = s2 + l*LLs; | ||||
|                     int lex = s2 + LLs*site; | ||||
|  | ||||
|                     if( s2==0 && l==0 ){ | ||||
|                         SiteChiP=zero; | ||||
|                         SiteChiM=zero; | ||||
|                     } | ||||
|  | ||||
|                     for(int sp=0; sp<2;  sp++){ | ||||
|                     for(int co=0; co<Nc; co++){ | ||||
|                         vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l); | ||||
|                     }} | ||||
|  | ||||
|                     for(int sp=0; sp<2;  sp++){ | ||||
|                     for(int co=0; co<Nc; co++){ | ||||
|                         vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l); | ||||
|                     }} | ||||
|  | ||||
|                     for(int sp=0; sp<2;  sp++){ | ||||
|                     for(int co=0; co<Nc; co++){ | ||||
|                         SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us. | ||||
|                         SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out | ||||
|                     }} | ||||
|                 }} | ||||
|  | ||||
|                 { | ||||
|                     int lex = s1 + LLs*site; | ||||
|                     for(int sp=0; sp<2;  sp++){ | ||||
|                     for(int co=0; co<Nc; co++){ | ||||
|                         vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co)); | ||||
|                         vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co)); | ||||
|                     }} | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|         } | ||||
|         #else | ||||
|         { | ||||
|             // pointers | ||||
|             //  MASK_REGS; | ||||
|             #define Chi_00 %%zmm1 | ||||
|             #define Chi_01 %%zmm2 | ||||
|             #define Chi_02 %%zmm3 | ||||
|             #define Chi_10 %%zmm4 | ||||
|             #define Chi_11 %%zmm5 | ||||
|             #define Chi_12 %%zmm6 | ||||
|             #define Chi_20 %%zmm7 | ||||
|             #define Chi_21 %%zmm8 | ||||
|             #define Chi_22 %%zmm9 | ||||
|             #define Chi_30 %%zmm10 | ||||
|             #define Chi_31 %%zmm11 | ||||
|             #define Chi_32 %%zmm12 | ||||
|  | ||||
|             #define BCAST0  %%zmm13 | ||||
|             #define BCAST1  %%zmm14 | ||||
|             #define BCAST2  %%zmm15 | ||||
|             #define BCAST3  %%zmm16 | ||||
|             #define BCAST4  %%zmm17 | ||||
|             #define BCAST5  %%zmm18 | ||||
|             #define BCAST6  %%zmm19 | ||||
|             #define BCAST7  %%zmm20 | ||||
|             #define BCAST8  %%zmm21 | ||||
|             #define BCAST9  %%zmm22 | ||||
|             #define BCAST10 %%zmm23 | ||||
|             #define BCAST11 %%zmm24 | ||||
|  | ||||
|             int incr = LLs*LLs*sizeof(iSinglet<Simd>); | ||||
|             for(int s1=0; s1<LLs; s1++){ | ||||
|  | ||||
|                 for(int s2=0; s2<LLs; s2++){ | ||||
|  | ||||
|                     int lex = s2 + LLs*site; | ||||
|                     uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable | ||||
|                     uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1]; | ||||
|                     uint64_t a2 = (uint64_t) &psi[lex]; | ||||
|  | ||||
|                     for(int l=0; l<Simd::Nsimd(); l++){ // simd lane | ||||
|                         if((s2+l)==0) { | ||||
|                             asm( | ||||
|                                     VPREFETCH1(0,%2)              VPREFETCH1(0,%1) | ||||
|                                     VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2) | ||||
|                                     VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2) | ||||
|                                     VBCASTCDUP(0,%2,BCAST0) | ||||
|                                     VBCASTCDUP(1,%2,BCAST1) | ||||
|                                     VBCASTCDUP(2,%2,BCAST2) | ||||
|                                     VBCASTCDUP(3,%2,BCAST3) | ||||
|                                     VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00) | ||||
|                                     VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01) | ||||
|                                     VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02) | ||||
|                                     VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10) | ||||
|                                     VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11) | ||||
|                                     VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12) | ||||
|                                     VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20) | ||||
|                                     VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21) | ||||
|                                     VMULMEM(0,%1,BCAST8,Chi_22) | ||||
|                                     VMULMEM(0,%1,BCAST9,Chi_30) | ||||
|                                     VMULMEM(0,%1,BCAST10,Chi_31) | ||||
|                                     VMULMEM(0,%1,BCAST11,Chi_32) | ||||
|                                     : : "r" (a0), "r" (a1), "r" (a2)                            ); | ||||
|                         } else { | ||||
|                             asm( | ||||
|                                     VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00) | ||||
|                                     VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01) | ||||
|                                     VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02) | ||||
|                                     VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10) | ||||
|                                     VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11) | ||||
|                                     VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12) | ||||
|                                     VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20) | ||||
|                                     VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21) | ||||
|                                     VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22) | ||||
|                                     VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30) | ||||
|                                     VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31) | ||||
|                                     VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32) | ||||
|                                     : : "r" (a0), "r" (a1), "r" (a2)                            ); | ||||
|                         } | ||||
|                         a0 = a0 + incr; | ||||
|                         a1 = a1 + incr; | ||||
|                         a2 = a2 + sizeof(Simd::scalar_type); | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 { | ||||
|                   int lexa = s1+LLs*site; | ||||
|                   asm ( | ||||
|                      VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02) | ||||
|                      VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12) | ||||
|                      VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22) | ||||
|                      VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32) | ||||
|                      : : "r" ((uint64_t)&chi[lexa]) : "memory" ); | ||||
|  | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         #undef Chi_00 | ||||
|         #undef Chi_01 | ||||
|         #undef Chi_02 | ||||
|         #undef Chi_10 | ||||
|         #undef Chi_11 | ||||
|         #undef Chi_12 | ||||
|         #undef Chi_20 | ||||
|         #undef Chi_21 | ||||
|         #undef Chi_22 | ||||
|         #undef Chi_30 | ||||
|         #undef Chi_31 | ||||
|         #undef Chi_32 | ||||
|  | ||||
|         #undef BCAST0 | ||||
|         #undef BCAST1 | ||||
|         #undef BCAST2 | ||||
|         #undef BCAST3 | ||||
|         #undef BCAST4 | ||||
|         #undef BCAST5 | ||||
|         #undef BCAST6 | ||||
|         #undef BCAST7 | ||||
|         #undef BCAST8 | ||||
|         #undef BCAST9 | ||||
|         #undef BCAST10 | ||||
|         #undef BCAST11 | ||||
|         #endif | ||||
|     }; | ||||
|  | ||||
|     // Z-mobius version | ||||
|     template<class Impl> | ||||
|     void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi, | ||||
|         int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm) | ||||
|     { | ||||
|         std::cout << "Error: zMobius not implemented for EOFA" << std::endl; | ||||
|         exit(-1); | ||||
|     }; | ||||
|  | ||||
|     template<class Impl> | ||||
|     void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv) | ||||
|     { | ||||
|         int Ls  = this->Ls; | ||||
|         int LLs = psi._grid->_rdimensions[0]; | ||||
|         int vol = psi._grid->oSites()/LLs; | ||||
|  | ||||
|         chi.checkerboard = psi.checkerboard; | ||||
|  | ||||
|         Vector<iSinglet<Simd> > Matp; | ||||
|         Vector<iSinglet<Simd> > Matm; | ||||
|         Vector<iSinglet<Simd> > *_Matp; | ||||
|         Vector<iSinglet<Simd> > *_Matm; | ||||
|  | ||||
|         //  MooeeInternalCompute(dag,inv,Matp,Matm); | ||||
|         if(inv && dag){ | ||||
|             _Matp = &this->MatpInvDag; | ||||
|             _Matm = &this->MatmInvDag; | ||||
|         } | ||||
|  | ||||
|         if(inv && (!dag)){ | ||||
|             _Matp = &this->MatpInv; | ||||
|             _Matm = &this->MatmInv; | ||||
|         } | ||||
|  | ||||
|         if(!inv){ | ||||
|             MooeeInternalCompute(dag, inv, Matp, Matm); | ||||
|             _Matp = &Matp; | ||||
|             _Matm = &Matm; | ||||
|         } | ||||
|  | ||||
|         assert(_Matp->size() == Ls*LLs); | ||||
|  | ||||
|         this->MooeeInvCalls++; | ||||
|         this->MooeeInvTime -= usecond(); | ||||
|  | ||||
|         if(switcheroo<Coeff_t>::iscomplex()){ | ||||
|             parallel_for(auto site=0; site<vol; site++){ | ||||
|                 MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm); | ||||
|             } | ||||
|         } else { | ||||
|             parallel_for(auto site=0; site<vol; site++){ | ||||
|                 MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         this->MooeeInvTime += usecond(); | ||||
|     } | ||||
|  | ||||
|     #ifdef DOMAIN_WALL_EOFA_DPERP_VEC | ||||
|  | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD); | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF); | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD); | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF); | ||||
|  | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF); | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH); | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF); | ||||
|         INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH); | ||||
|  | ||||
|         template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|         template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|         template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|         template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|  | ||||
|         template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|         template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|         template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|         template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv); | ||||
|  | ||||
|     #endif | ||||
|  | ||||
| }} | ||||
		Reference in New Issue
	
	Block a user