/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc Copyright (C) 2015 Author: Peter Boyle Author: Peter Boyle Author: Peter Boyle Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #include namespace Grid { namespace QCD { /* * Dense matrix versions of routines */ template void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField &chi) { this->MooeeInternal(psi,chi,DaggerYes,InverseYes); } template void CayleyFermion5D::MooeeInv(const FermionField &psi, FermionField &chi) { this->MooeeInternal(psi,chi,DaggerNo,InverseYes); } template void CayleyFermion5D::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector &lower, std::vector &diag, std::vector &upper) { GridBase *grid=psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; const int nsimd= Simd::Nsimd(); Vector > u(LLs); Vector > l(LLs); Vector > d(LLs); assert(Ls/LLs==nsimd); assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type * u_p = (scalar_type *)&u[0]; scalar_type * l_p = (scalar_type *)&l[0]; scalar_type * d_p = (scalar_type *)&d[0]; for(int o=0;ooSites();ss+=LLs){ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; alignas(64) SiteSpinor fp; alignas(64) SiteSpinor fm; for(int v=0;v=v ) rotate(hm,hm,nsimd-1); hp=0.5*hp; hm=0.5*hm; spRecon5m(fp,hp); spRecon5p(fm,hm); chi[ss+v] = d[v]*phi[ss+v]; chi[ss+v] = chi[ss+v] +u[v]*fp; chi[ss+v] = chi[ss+v] +l[v]*fm; } #else for(int v=0;v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } if ( vm>=v ) { hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } // Can force these to real arithmetic and save 2x. Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(),hm_00); Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(),hm_01); Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(),hm_02); Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(),hm_10); Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(),hm_11); Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(),hm_12); Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(),hp_00); Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(),hp_01); Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(),hp_02); Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(),hp_10); Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(),hp_11); Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(),hp_12); vstream(chi[ss+v]()(0)(0),p_00); vstream(chi[ss+v]()(0)(1),p_01); vstream(chi[ss+v]()(0)(2),p_02); vstream(chi[ss+v]()(1)(0),p_10); vstream(chi[ss+v]()(1)(1),p_11); vstream(chi[ss+v]()(1)(2),p_12); vstream(chi[ss+v]()(2)(0),p_20); vstream(chi[ss+v]()(2)(1),p_21); vstream(chi[ss+v]()(2)(2),p_22); vstream(chi[ss+v]()(3)(0),p_30); vstream(chi[ss+v]()(3)(1),p_31); vstream(chi[ss+v]()(3)(2),p_32); } #endif } M5Dtime+=usecond(); } template void CayleyFermion5D::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, std::vector &lower, std::vector &diag, std::vector &upper) { GridBase *grid=psi._grid; int Ls = this->Ls; int LLs = grid->_rdimensions[0]; int nsimd= Simd::Nsimd(); Vector > u(LLs); Vector > l(LLs); Vector > d(LLs); assert(Ls/LLs==nsimd); assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type * u_p = (scalar_type *)&u[0]; scalar_type * l_p = (scalar_type *)&l[0]; scalar_type * d_p = (scalar_type *)&d[0]; for(int o=0;ooSites();ss+=LLs){ // adds LLs #if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; alignas(64) SiteSpinor fp; alignas(64) SiteSpinor fm; for(int v=0;v=v ) rotate(hm,hm,nsimd-1); hp=hp*0.5; hm=hm*0.5; spRecon5p(fp,hp); spRecon5m(fm,hm); chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp; chi[ss+v] = chi[ss+v] +l[v]*fm; } #else for(int v=0;v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); } if ( vm>=v ) { hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(),hp_00); Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(),hp_01); Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(),hp_02); Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(),hp_10); Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(),hp_11); Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(),hp_12); Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(),hm_00); Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(),hm_01); Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(),hm_02); Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(),hm_10); Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(),hm_11); Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(),hm_12); vstream(chi[ss+v]()(0)(0),p_00); vstream(chi[ss+v]()(0)(1),p_01); vstream(chi[ss+v]()(0)(2),p_02); vstream(chi[ss+v]()(1)(0),p_10); vstream(chi[ss+v]()(1)(1),p_11); vstream(chi[ss+v]()(1)(2),p_12); vstream(chi[ss+v]()(2)(0),p_20); vstream(chi[ss+v]()(2)(1),p_21); vstream(chi[ss+v]()(2)(2),p_22); vstream(chi[ss+v]()(3)(0),p_30); vstream(chi[ss+v]()(3)(1),p_31); vstream(chi[ss+v]()(3)(2),p_32); } #endif } M5Dtime+=usecond(); } #ifdef AVX512 #include #include #include #endif template void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionField &chi, int LLs, int site, Vector > &Matp, Vector > &Matm) { #ifndef AVX512 { SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; SiteHalfSpinor SiteChiP; SiteHalfSpinor SiteChiM; // Ls*Ls * 2 * 12 * vol flops for(int s1=0;s1); for(int s1=0;s1 void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionField &chi, int LLs, int site, Vector > &Matp, Vector > &Matm) { #ifndef AVX512 { SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; SiteHalfSpinor SiteChiP; SiteHalfSpinor SiteChiM; // Ls*Ls * 2 * 12 * vol flops for(int s1=0;s1); for(int s1=0;s1 void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) { int Ls=this->Ls; int LLs = psi._grid->_rdimensions[0]; int vol = psi._grid->oSites()/LLs; chi.checkerboard=psi.checkerboard; Vector > Matp; Vector > Matm; Vector > *_Matp; Vector > *_Matm; // MooeeInternalCompute(dag,inv,Matp,Matm); if ( inv && dag ) { _Matp = &MatpInvDag; _Matm = &MatmInvDag; } if ( inv && (!dag) ) { _Matp = &MatpInv; _Matm = &MatmInv; } if ( !inv ) { MooeeInternalCompute(dag,inv,Matp,Matm); _Matp = &Matp; _Matm = &Matm; } assert(_Matp->size()==Ls*LLs); MooeeInvCalls++; MooeeInvTime-=usecond(); if ( switcheroo::iscomplex() ) { PARALLEL_FOR_LOOP for(auto site=0;site::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); }}