mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-19 00:07:05 +01:00
Compiles GPU and CPU, still gives good performance on CPU
This commit is contained in:
@ -1,668 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid_Eigen_Dense.h>
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD _M5,const ImplParams &p) :
|
||||
WilsonFermion5D<Impl>(_Umu,
|
||||
FiveDimGrid,
|
||||
FiveDimRedBlackGrid,
|
||||
FourDimGrid,
|
||||
FourDimRedBlackGrid,_M5,p),
|
||||
mass(_mass)
|
||||
{
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Physical surface field utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp = solution5d;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0);
|
||||
axpby_ssp_pplus (tmp, 1., tmp , 1., solution5d, 0, Ls-1);
|
||||
ExtractSlice(exported4d, tmp, 0, 0);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::P(const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls= this->Ls;
|
||||
chi=Zero();
|
||||
for(int s=0;s<Ls;s++){
|
||||
axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
|
||||
axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s+1)%Ls);
|
||||
}
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Pdag(const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls= this->Ls;
|
||||
chi=Zero();
|
||||
for(int s=0;s<Ls;s++){
|
||||
axpby_ssp_pminus(chi,1.0,chi,1.0,psi,s,s);
|
||||
axpby_ssp_pplus (chi,1.0,chi,1.0,psi,s,(s-1+Ls)%Ls);
|
||||
}
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::ExportPhysicalFermionSource(const FermionField &solution5d,FermionField &exported4d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp = solution5d;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
axpby_ssp_pplus (tmp, 0., solution5d, 1., solution5d, 0, 0);
|
||||
axpby_ssp_pminus(tmp, 1., tmp , 1., solution5d, 0, Ls-1);
|
||||
ExtractSlice(exported4d, tmp, 0, 0);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::ImportUnphysicalFermion(const FermionField &input4d,FermionField &imported5d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
FermionField tmp(this->FermionGrid());
|
||||
conformable(imported5d.Grid(),this->FermionGrid());
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
tmp = Zero();
|
||||
InsertSlice(input4d, tmp, 0 , 0);
|
||||
InsertSlice(input4d, tmp, Ls-1, 0);
|
||||
axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
|
||||
axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
|
||||
imported5d=tmp;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
FermionField tmp(this->FermionGrid());
|
||||
conformable(imported5d.Grid(),this->FermionGrid());
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
tmp = Zero();
|
||||
InsertSlice(input4d, tmp, 0 , 0);
|
||||
InsertSlice(input4d, tmp, Ls-1, 0);
|
||||
axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0);
|
||||
axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1);
|
||||
Dminus(tmp,imported5d);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
|
||||
FermionField tmp_f(this->FermionGrid());
|
||||
this->DW(psi,tmp_f,DaggerNo);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
|
||||
}
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
|
||||
FermionField tmp_f(this->FermionGrid());
|
||||
this->DW(psi,tmp_f,DaggerYes);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
||||
{
|
||||
this->Report();
|
||||
Coordinate latt = GridDefaultLatt();
|
||||
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = this->_FourDimGrid->_Nprocessors;
|
||||
if ( M5Dcalls > 0 ) {
|
||||
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
|
||||
|
||||
// Flops = 10.0*(Nc*Ns) *Ls*vol
|
||||
RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
|
||||
// Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
|
||||
// read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
|
||||
// write = 1
|
||||
RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
|
||||
std::cout << GridLogMessage << "Average bandwidth (GB/s) : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
|
||||
}
|
||||
|
||||
if ( MooeeInvCalls > 0 ) {
|
||||
|
||||
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
||||
#ifdef GRID_NVCC
|
||||
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
#else
|
||||
// Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
|
||||
RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
|
||||
{
|
||||
this->ZeroCounters();
|
||||
M5Dflops=0;
|
||||
M5Dcalls=0;
|
||||
M5Dtime=0;
|
||||
MooeeInvFlops=0;
|
||||
MooeeInvCalls=0;
|
||||
MooeeInvTime=0;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag (Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass;
|
||||
M5D(psi,chi,chi,lower,diag,upper);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = bs;
|
||||
Vector<Coeff_t> upper= cs;
|
||||
Vector<Coeff_t> lower= cs;
|
||||
upper[Ls-1]=-mass*upper[Ls-1];
|
||||
lower[0] =-mass*lower[0];
|
||||
M5D(psi,psi,Din,lower,diag,upper);
|
||||
}
|
||||
// FIXME Redunant with the above routine; check this and eliminate
|
||||
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = beo;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
for(int i=0;i<Ls;i++) {
|
||||
upper[i]=-ceo[i];
|
||||
lower[i]=-ceo[i];
|
||||
}
|
||||
upper[Ls-1]=-mass*upper[Ls-1];
|
||||
lower[0] =-mass*lower[0];
|
||||
M5D(psi,psi,chi,lower,diag,upper);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
for(int i=0;i<Ls;i++) {
|
||||
upper[i]=-cee[i];
|
||||
lower[i]=-cee[i];
|
||||
}
|
||||
upper[Ls-1]=-mass*upper[Ls-1];
|
||||
lower[0] =-mass*lower[0];
|
||||
M5D(psi,psi,chi,lower,diag,upper);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
|
||||
for (int s=0;s<Ls;s++){
|
||||
// Assemble the 5d matrix
|
||||
if ( s==0 ) {
|
||||
upper[s] = -cee[s+1] ;
|
||||
lower[s] = mass*cee[Ls-1];
|
||||
} else if ( s==(Ls-1)) {
|
||||
upper[s] = mass*cee[0];
|
||||
lower[s] = -cee[s-1];
|
||||
} else {
|
||||
upper[s]=-cee[s+1];
|
||||
lower[s]=-cee[s-1];
|
||||
}
|
||||
}
|
||||
// Conjugate the terms
|
||||
for (int s=0;s<Ls;s++){
|
||||
diag[s] =conjugate(diag[s]);
|
||||
upper[s]=conjugate(upper[s]);
|
||||
lower[s]=conjugate(lower[s]);
|
||||
}
|
||||
M5Ddag(psi,psi,chi,lower,diag,upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0);
|
||||
Vector<Coeff_t> lower(Ls,-1.0);
|
||||
upper[Ls-1]=-mass*upper[Ls-1];
|
||||
lower[0] =-mass*lower[0];
|
||||
M5Ddag(psi,chi,chi,lower,diag,upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag =bs;
|
||||
Vector<Coeff_t> upper=cs;
|
||||
Vector<Coeff_t> lower=cs;
|
||||
|
||||
for (int s=0;s<Ls;s++){
|
||||
if ( s== 0 ) {
|
||||
upper[s] = cs[s+1];
|
||||
lower[s] =-mass*cs[Ls-1];
|
||||
} else if ( s==(Ls-1) ) {
|
||||
upper[s] =-mass*cs[0];
|
||||
lower[s] = cs[s-1];
|
||||
} else {
|
||||
upper[s] = cs[s+1];
|
||||
lower[s] = cs[s-1];
|
||||
}
|
||||
upper[s] = conjugate(upper[s]);
|
||||
lower[s] = conjugate(lower[s]);
|
||||
diag[s] = conjugate(diag[s]);
|
||||
}
|
||||
M5Ddag(psi,psi,Din,lower,diag,upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
// Assemble Din
|
||||
Meooe5D(psi,Din);
|
||||
|
||||
this->DW(Din,chi,DaggerNo);
|
||||
// ((b D_W + D_w hop terms +1) on s-diag
|
||||
axpby(chi,1.0,1.0,chi,psi);
|
||||
|
||||
M5D(psi,chi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
// Under adjoint
|
||||
//D1+ D1- P- -> D1+^dag P+ D2-^dag
|
||||
//D2- P+ D2+ P-D1-^dag D2+dag
|
||||
|
||||
FermionField Din(psi.Grid());
|
||||
// Apply Dw
|
||||
this->DW(psi,Din,DaggerYes);
|
||||
|
||||
MeooeDag5D(Din,chi);
|
||||
|
||||
M5Ddag(psi,chi);
|
||||
// ((b D_W + D_w hop terms +1) on s-diag
|
||||
axpby (chi,1.0,1.0,chi,psi);
|
||||
return norm2(chi);
|
||||
}
|
||||
|
||||
// half checkerboard operations
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
Meooe5D(psi,this->tmp());
|
||||
|
||||
if ( psi.Checkerboard() == Odd ) {
|
||||
this->DhopEO(this->tmp(),chi,DaggerNo);
|
||||
} else {
|
||||
this->DhopOE(this->tmp(),chi,DaggerNo);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
// Apply 4d dslash
|
||||
if ( psi.Checkerboard() == Odd ) {
|
||||
this->DhopEO(psi,this->tmp(),DaggerYes);
|
||||
} else {
|
||||
this->DhopOE(psi,this->tmp(),DaggerYes);
|
||||
}
|
||||
MeooeDag5D(this->tmp(),chi);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
|
||||
Meo5D(psi,this->tmp());
|
||||
// Apply 4d dslash fragment
|
||||
this->DhopDir(this->tmp(),chi,dir,disp);
|
||||
}
|
||||
// force terms; five routines; default to Dhop on diagonal
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
FermionField Din(V.Grid());
|
||||
|
||||
if ( dag == DaggerNo ) {
|
||||
// U d/du [D_w D5] V = U d/du DW D5 V
|
||||
Meooe5D(V,Din);
|
||||
this->DhopDeriv(mat,U,Din,dag);
|
||||
} else {
|
||||
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
|
||||
Meooe5D(U,Din);
|
||||
this->DhopDeriv(mat,Din,V,dag);
|
||||
}
|
||||
};
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
FermionField Din(V.Grid());
|
||||
|
||||
if ( dag == DaggerNo ) {
|
||||
// U d/du [D_w D5] V = U d/du DW D5 V
|
||||
Meooe5D(V,Din);
|
||||
this->DhopDerivOE(mat,U,Din,dag);
|
||||
} else {
|
||||
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
|
||||
Meooe5D(U,Din);
|
||||
this->DhopDerivOE(mat,Din,V,dag);
|
||||
}
|
||||
};
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
FermionField Din(V.Grid());
|
||||
|
||||
if ( dag == DaggerNo ) {
|
||||
// U d/du [D_w D5] V = U d/du DW D5 V
|
||||
Meooe5D(V,Din);
|
||||
this->DhopDerivEO(mat,U,Din,dag);
|
||||
} else {
|
||||
// U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call
|
||||
Meooe5D(U,Din);
|
||||
this->DhopDerivEO(mat,Din,V,dag);
|
||||
}
|
||||
};
|
||||
|
||||
// Tanh
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||
{
|
||||
Vector<Coeff_t> gamma(this->Ls);
|
||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||
SetCoefficientsInternal(1.0,gamma,b,c);
|
||||
}
|
||||
//Zolo
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||
{
|
||||
Vector<Coeff_t> gamma(this->Ls);
|
||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||
SetCoefficientsInternal(zolo_hi,gamma,b,c);
|
||||
}
|
||||
//Zolo
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// The Cayley coeffs (unprec)
|
||||
///////////////////////////////////////////////////////////
|
||||
assert(gamma.size()==Ls);
|
||||
|
||||
omega.resize(Ls);
|
||||
bs.resize(Ls);
|
||||
cs.resize(Ls);
|
||||
as.resize(Ls);
|
||||
|
||||
//
|
||||
// Ts = ( [bs+cs]Dw )^-1 ( (bs+cs) Dw )
|
||||
// -(g5 ------- -1 ) ( g5 --------- + 1 )
|
||||
// ( {2+(bs-cs)Dw} ) ( 2+(bs-cs) Dw )
|
||||
//
|
||||
// bs = 1/2( (1/omega_s + 1)*b + (1/omega - 1)*c ) = 1/2( 1/omega(b+c) + (b-c) )
|
||||
// cs = 1/2( (1/omega_s - 1)*b + (1/omega + 1)*c ) = 1/2( 1/omega(b+c) - (b-c) )
|
||||
//
|
||||
// bs+cs = 0.5*( 1/omega(b+c) + (b-c) + 1/omega(b+c) - (b-c) ) = 1/omega(b+c)
|
||||
// bs-cs = 0.5*( 1/omega(b+c) + (b-c) - 1/omega(b+c) + (b-c) ) = b-c
|
||||
//
|
||||
// So
|
||||
//
|
||||
// Ts = ( [b+c]Dw/omega_s )^-1 ( (b+c) Dw /omega_s )
|
||||
// -(g5 ------- -1 ) ( g5 --------- + 1 )
|
||||
// ( {2+(b-c)Dw} ) ( 2+(b-c) Dw )
|
||||
//
|
||||
// Ts = ( [b+c]Dw )^-1 ( (b+c) Dw )
|
||||
// -(g5 ------- -omega_s) ( g5 --------- + omega_s )
|
||||
// ( {2+(b-c)Dw} ) ( 2+(b-c) Dw )
|
||||
//
|
||||
|
||||
double bpc = b+c;
|
||||
double bmc = b-c;
|
||||
_b = b;
|
||||
_c = c;
|
||||
_gamma = gamma; // Save the parameters so we can change mass later.
|
||||
_zolo_hi= zolo_hi;
|
||||
for(int i=0; i < Ls; i++){
|
||||
as[i] = 1.0;
|
||||
omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code
|
||||
assert(omega[i]!=Coeff_t(0.0));
|
||||
bs[i] = 0.5*(bpc/omega[i] + bmc);
|
||||
cs[i] = 0.5*(bpc/omega[i] - bmc);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Constants for the preconditioned matrix Cayley form
|
||||
////////////////////////////////////////////////////////
|
||||
bee.resize(Ls);
|
||||
cee.resize(Ls);
|
||||
beo.resize(Ls);
|
||||
ceo.resize(Ls);
|
||||
|
||||
for(int i=0;i<Ls;i++){
|
||||
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
|
||||
assert(bee[i]!=Coeff_t(0.0));
|
||||
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
|
||||
beo[i]=as[i]*bs[i];
|
||||
ceo[i]=-as[i]*cs[i];
|
||||
}
|
||||
aee.resize(Ls);
|
||||
aeo.resize(Ls);
|
||||
for(int i=0;i<Ls;i++){
|
||||
aee[i]=cee[i];
|
||||
aeo[i]=ceo[i];
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// LDU decomposition of eeoo
|
||||
//////////////////////////////////////////
|
||||
dee.resize(Ls);
|
||||
lee.resize(Ls);
|
||||
leem.resize(Ls);
|
||||
uee.resize(Ls);
|
||||
ueem.resize(Ls);
|
||||
|
||||
for(int i=0;i<Ls;i++){
|
||||
|
||||
dee[i] = bee[i];
|
||||
|
||||
if ( i < Ls-1 ) {
|
||||
|
||||
assert(bee[i]!=Coeff_t(0.0));
|
||||
assert(bee[0]!=Coeff_t(0.0));
|
||||
|
||||
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
|
||||
|
||||
leem[i]=mass*cee[Ls-1]/bee[0];
|
||||
for(int j=0;j<i;j++) {
|
||||
assert(bee[j+1]!=Coeff_t(0.0));
|
||||
leem[i]*= aee[j]/bee[j+1];
|
||||
}
|
||||
|
||||
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
|
||||
|
||||
ueem[i]=mass;
|
||||
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
|
||||
ueem[i]*= aee[0]/bee[0];
|
||||
|
||||
} else {
|
||||
lee[i] =0.0;
|
||||
leem[i]=0.0;
|
||||
uee[i] =0.0;
|
||||
ueem[i]=0.0;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
Coeff_t delta_d=mass*cee[Ls-1];
|
||||
for(int j=0;j<Ls-1;j++) {
|
||||
assert(bee[j] != Coeff_t(0.0));
|
||||
delta_d *= cee[j]/bee[j];
|
||||
}
|
||||
dee[Ls-1] += delta_d;
|
||||
}
|
||||
|
||||
int inv=1;
|
||||
this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
|
||||
this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
|
||||
Vector<iSinglet<Simd> > & Matp,
|
||||
Vector<iSinglet<Simd> > & Matm)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
|
||||
GridBase *grid = this->FermionRedBlackGrid();
|
||||
int LLs = grid->_rdimensions[0];
|
||||
|
||||
if ( LLs == Ls ) {
|
||||
return; // Not vectorised in 5th direction
|
||||
}
|
||||
|
||||
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
Pplus(s,s) = bee[s];
|
||||
Pminus(s,s)= bee[s];
|
||||
}
|
||||
|
||||
for(int s=0;s<Ls-1;s++){
|
||||
Pminus(s,s+1) = -cee[s];
|
||||
}
|
||||
|
||||
for(int s=0;s<Ls-1;s++){
|
||||
Pplus(s+1,s) = -cee[s+1];
|
||||
}
|
||||
Pplus (0,Ls-1) = mass*cee[0];
|
||||
Pminus(Ls-1,0) = mass*cee[Ls-1];
|
||||
|
||||
Eigen::MatrixXcd PplusMat ;
|
||||
Eigen::MatrixXcd PminusMat;
|
||||
|
||||
if ( inv ) {
|
||||
PplusMat =Pplus.inverse();
|
||||
PminusMat=Pminus.inverse();
|
||||
} else {
|
||||
PplusMat =Pplus;
|
||||
PminusMat=Pminus;
|
||||
}
|
||||
|
||||
if(dag){
|
||||
PplusMat.adjointInPlace();
|
||||
PminusMat.adjointInPlace();
|
||||
}
|
||||
|
||||
typedef typename SiteHalfSpinor::scalar_type scalar_type;
|
||||
const int Nsimd=Simd::Nsimd();
|
||||
Matp.resize(Ls*LLs);
|
||||
Matm.resize(Ls*LLs);
|
||||
|
||||
for(int s2=0;s2<Ls;s2++){
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
int istride = LLs;
|
||||
int ostride = 1;
|
||||
Simd Vp;
|
||||
Simd Vm;
|
||||
scalar_type *sp = (scalar_type *)&Vp;
|
||||
scalar_type *sm = (scalar_type *)&Vm;
|
||||
for(int l=0;l<Nsimd;l++){
|
||||
if ( switcheroo<Coeff_t>::iscomplex() ) {
|
||||
sp[l] = PplusMat (l*istride+s1*ostride,s2);
|
||||
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
||||
} else {
|
||||
// if real
|
||||
scalar_type tmp;
|
||||
tmp = PplusMat (l*istride+s1*ostride,s2);
|
||||
sp[l] = scalar_type(tmp.real(),tmp.real());
|
||||
tmp = PminusMat(l*istride+s1*ostride,s2);
|
||||
sm[l] = scalar_type(tmp.real(),tmp.real());
|
||||
}
|
||||
}
|
||||
Matp[LLs*s2+s1] = Vp;
|
||||
Matm[LLs*s2+s1] = Vm;
|
||||
}}
|
||||
}
|
||||
|
||||
|
||||
FermOpTemplateInstantiate(CayleyFermion5D);
|
||||
GparityFermOpTemplateInstantiate(CayleyFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@ -1,247 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// Pminus fowards
|
||||
// Pplus backwards..
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls =this->Ls;
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
// 10 = 3 complex mult + 2 complex add
|
||||
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
|
||||
thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||
for(int s=0;s<Ls;s++){
|
||||
auto tmp = psi[0];
|
||||
if ( s==0 ) {
|
||||
spProj5m(tmp,psi[ss+s+1]);
|
||||
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
|
||||
|
||||
spProj5p(tmp,psi[ss+Ls-1]);
|
||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||
} else if ( s==(Ls-1)) {
|
||||
spProj5m(tmp,psi[ss+0]);
|
||||
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
|
||||
|
||||
spProj5p(tmp,psi[ss+s-1]);
|
||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||
} else {
|
||||
spProj5m(tmp,psi[ss+s+1]);
|
||||
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
|
||||
|
||||
spProj5p(tmp,psi[ss+s-1]);
|
||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls =this->Ls;
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
|
||||
thread_loop( (int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||
auto tmp = psi[0];
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==0 ) {
|
||||
spProj5p(tmp,psi[ss+s+1]);
|
||||
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
|
||||
|
||||
spProj5m(tmp,psi[ss+Ls-1]);
|
||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||
} else if ( s==(Ls-1)) {
|
||||
spProj5p(tmp,psi[ss+0]);
|
||||
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
|
||||
|
||||
spProj5m(tmp,psi[ss+s-1]);
|
||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||
} else {
|
||||
spProj5p(tmp,psi[ss+s+1]);
|
||||
chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;
|
||||
|
||||
spProj5m(tmp,psi[ss+s-1]);
|
||||
chi[ss+s]=chi[ss+s]+lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
int Ls=this->Ls;
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
|
||||
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||
auto tmp = psi[0];
|
||||
|
||||
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
||||
// Apply (L^{\prime})^{-1}
|
||||
chi[ss]=psi[ss]; // chi[0]=psi[0]
|
||||
for(int s=1;s<Ls;s++){
|
||||
spProj5p(tmp,chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s]-lee[s-1]*tmp;
|
||||
}
|
||||
// L_m^{-1}
|
||||
for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||
spProj5m(tmp,chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
|
||||
}
|
||||
// U_m^{-1} D^{-1}
|
||||
for (int s=0;s<Ls-1;s++){
|
||||
// Chi[s] + 1/d chi[s]
|
||||
spProj5p(tmp,chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
|
||||
}
|
||||
chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
|
||||
|
||||
// Apply U^{-1}
|
||||
for (int s=Ls-2;s>=0;s--){
|
||||
spProj5m(tmp,chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
|
||||
}
|
||||
});
|
||||
|
||||
MooeeInvTime+=usecond();
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
int Ls=this->Ls;
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
assert(psi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
|
||||
thread_loop((int ss=0;ss<grid->oSites();ss+=Ls),{ // adds Ls
|
||||
|
||||
auto tmp = psi[0];
|
||||
|
||||
// Apply (U^{\prime})^{-dagger}
|
||||
chi[ss]=psi[ss];
|
||||
for (int s=1;s<Ls;s++){
|
||||
spProj5m(tmp,chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s]-conjugate(uee[s-1])*tmp;
|
||||
}
|
||||
// U_m^{-\dagger}
|
||||
for (int s=0;s<Ls-1;s++){
|
||||
spProj5p(tmp,chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - conjugate(ueem[s])*tmp;
|
||||
}
|
||||
|
||||
// L_m^{-\dagger} D^{-dagger}
|
||||
for (int s=0;s<Ls-1;s++){
|
||||
spProj5m(tmp,chi[ss+Ls-1]);
|
||||
chi[ss+s] = conjugate(1.0/dee[s])*chi[ss+s]-conjugate(leem[s]/dee[Ls-1])*tmp;
|
||||
}
|
||||
chi[ss+Ls-1]= conjugate(1.0/dee[Ls-1])*chi[ss+Ls-1];
|
||||
|
||||
// Apply L^{-dagger}
|
||||
for (int s=Ls-2;s>=0;s--){
|
||||
spProj5p(tmp,chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - conjugate(lee[s])*tmp;
|
||||
}
|
||||
});
|
||||
|
||||
MooeeInvTime+=usecond();
|
||||
|
||||
}
|
||||
|
||||
#ifdef CAYLEY_DPERP_CACHE
|
||||
INSTANTIATE_DPERP(WilsonImplF);
|
||||
INSTANTIATE_DPERP(WilsonImplD);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplF);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplD);
|
||||
INSTANTIATE_DPERP(ZWilsonImplF);
|
||||
INSTANTIATE_DPERP(ZWilsonImplD);
|
||||
|
||||
INSTANTIATE_DPERP(WilsonImplFH);
|
||||
INSTANTIATE_DPERP(WilsonImplDF);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplFH);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplDF);
|
||||
INSTANTIATE_DPERP(ZWilsonImplFH);
|
||||
INSTANTIATE_DPERP(ZWilsonImplDF);
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,284 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// Pminus fowards
|
||||
// Pplus backwards..
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
Coeff_t *lower_v = &lower[0];
|
||||
Coeff_t *diag_v = &diag[0];
|
||||
Coeff_t *upper_v = &upper[0];
|
||||
int Ls =this->Ls;
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites4d = nsimd * grid->oSites() / Ls;
|
||||
|
||||
// 10 = 3 complex mult + 2 complex add
|
||||
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
|
||||
accelerator_loopN( sss, sites4d ,{
|
||||
uint64_t lane = sss % nsimd;
|
||||
uint64_t ss = Ls * (sss / nsimd);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
auto res = extractLane(lane,phi[ss+s]);
|
||||
res = diag_v[s]*res;
|
||||
|
||||
auto tmp = extractLane(lane,psi[ss+(s+1)%Ls]);
|
||||
spProj5m(tmp,tmp);
|
||||
res += upper_v[s]*tmp;
|
||||
|
||||
tmp = extractLane(lane,psi[ss+(s+Ls-1)%Ls]);
|
||||
spProj5p(tmp,tmp);
|
||||
res += lower_v[s]*tmp;
|
||||
|
||||
insertLane(lane,chi[ss+s],res);
|
||||
}
|
||||
});
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
Coeff_t *lower_v = &lower[0];
|
||||
Coeff_t *diag_v = &diag[0];
|
||||
Coeff_t *upper_v = &upper[0];
|
||||
int Ls =this->Ls;
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites4d = nsimd * grid->oSites() / Ls;
|
||||
|
||||
// 10 = 3 complex mult + 2 complex add
|
||||
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
|
||||
accelerator_loopN( sss, sites4d ,{
|
||||
uint64_t lane = sss % nsimd;
|
||||
uint64_t ss = Ls * (sss / nsimd);
|
||||
|
||||
for(int s=0;s<Ls;s++){
|
||||
auto res = extractLane(lane,phi[ss+s]);
|
||||
res = diag_v[s]*res;
|
||||
|
||||
auto tmp = extractLane(lane,psi[ss+(s+1)%Ls]);
|
||||
spProj5p(tmp,tmp);
|
||||
res += upper_v[s]*tmp;
|
||||
|
||||
tmp = extractLane(lane,psi[ss+(s+Ls-1)%Ls]);
|
||||
spProj5m(tmp,tmp);
|
||||
res += lower_v[s]*tmp;
|
||||
|
||||
insertLane(lane,chi[ss+s],res);
|
||||
}
|
||||
});
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
Coeff_t *lee_v = &lee[0];
|
||||
Coeff_t *leem_v = &leem[0];
|
||||
Coeff_t *uee_v = &uee[0];
|
||||
Coeff_t *ueem_v = &ueem[0];
|
||||
Coeff_t *dee_v = &dee[0];
|
||||
|
||||
int Ls=this->Ls;
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites4d = nsimd * grid->oSites() / Ls;
|
||||
|
||||
typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
|
||||
accelerator_loopN( sss, sites4d ,{
|
||||
uint64_t lane = sss % nsimd;
|
||||
uint64_t ss = Ls * (sss / nsimd);
|
||||
ScalarSiteSpinor res, tmp, acc;
|
||||
|
||||
// X = Nc*Ns
|
||||
// flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
|
||||
// Apply (L^{\prime})^{-1} L_m^{-1}
|
||||
res = extractLane(lane,psi[ss]);
|
||||
spProj5m(tmp,res);
|
||||
acc = leem_v[0]*tmp;
|
||||
spProj5p(tmp,res);
|
||||
insertLane(lane,chi[ss],res);
|
||||
|
||||
for(int s=1;s<Ls-1;s++){
|
||||
res = extractLane(lane,psi[ss+s]);
|
||||
res -= lee_v[s-1]*tmp;
|
||||
spProj5m(tmp,res);
|
||||
acc += leem_v[s]*tmp;
|
||||
spProj5p(tmp,res);
|
||||
insertLane(lane,chi[ss+s],res);
|
||||
}
|
||||
res = extractLane(lane,psi[ss+Ls-1]);
|
||||
res = res - lee_v[Ls-2]*tmp - acc;
|
||||
|
||||
// Apply U_m^{-1} D^{-1} U^{-1}
|
||||
res = (1.0/dee_v[Ls-1])*res;
|
||||
insertLane(lane,chi[ss+Ls-1],res);
|
||||
spProj5p(acc,res);
|
||||
spProj5m(tmp,res);
|
||||
for (int s=Ls-2;s>=0;s--){
|
||||
res = extractLane(lane,chi[ss+s]);
|
||||
res = (1.0/dee_v[s])*res - uee_v[s]*tmp - ueem_v[s]*acc;
|
||||
spProj5m(tmp,res);
|
||||
insertLane(lane,chi[ss+s],res);
|
||||
}
|
||||
});
|
||||
|
||||
MooeeInvTime+=usecond();
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
Coeff_t *lee_v = &lee[0];
|
||||
Coeff_t *leem_v = &leem[0];
|
||||
Coeff_t *uee_v = &uee[0];
|
||||
Coeff_t *ueem_v = &ueem[0];
|
||||
Coeff_t *dee_v = &dee[0];
|
||||
|
||||
int Ls=this->Ls;
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites4d = nsimd * grid->oSites() / Ls;
|
||||
|
||||
typedef typename SiteSpinor::scalar_object ScalarSiteSpinor;
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
|
||||
accelerator_loopN( sss, sites4d ,{
|
||||
uint64_t lane = sss % nsimd;
|
||||
uint64_t ss = Ls * (sss / nsimd);
|
||||
ScalarSiteSpinor res, tmp, acc;
|
||||
|
||||
// X = Nc*Ns
|
||||
// flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
|
||||
// Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
|
||||
res = extractLane(lane,psi[ss]);
|
||||
spProj5p(tmp,res);
|
||||
acc = conjugate(ueem_v[0])*tmp;
|
||||
spProj5m(tmp,res);
|
||||
insertLane(lane,chi[ss],res);
|
||||
|
||||
for(int s=1;s<Ls-1;s++){
|
||||
res = extractLane(lane,psi[ss+s]);
|
||||
res -= conjugate(uee_v[s-1])*tmp;
|
||||
spProj5p(tmp,res);
|
||||
acc += conjugate(ueem_v[s])*tmp;
|
||||
spProj5m(tmp,res);
|
||||
insertLane(lane,chi[ss+s],res);
|
||||
}
|
||||
res = extractLane(lane,psi[ss+Ls-1]);
|
||||
res = res - conjugate(uee_v[Ls-2])*tmp - acc;
|
||||
|
||||
// Apply L_m^{-\dagger} D^{-dagger} L^{-dagger}
|
||||
res = conjugate(1.0/dee_v[Ls-1])*res;
|
||||
insertLane(lane,chi[ss+Ls-1],res);
|
||||
spProj5m(acc,res);
|
||||
spProj5p(tmp,res);
|
||||
for (int s=Ls-2;s>=0;s--){
|
||||
res = extractLane(lane,chi[ss+s]);
|
||||
res = conjugate(1.0/dee_v[s])*res - conjugate(lee_v[s])*tmp - conjugate(leem_v[s])*acc;
|
||||
spProj5p(tmp,res);
|
||||
insertLane(lane,chi[ss+s],res);
|
||||
}
|
||||
});
|
||||
|
||||
MooeeInvTime+=usecond();
|
||||
|
||||
}
|
||||
|
||||
#ifdef CAYLEY_DPERP_GPU
|
||||
INSTANTIATE_DPERP(WilsonImplF);
|
||||
INSTANTIATE_DPERP(WilsonImplD);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplF);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplD);
|
||||
INSTANTIATE_DPERP(ZWilsonImplF);
|
||||
INSTANTIATE_DPERP(ZWilsonImplD);
|
||||
|
||||
INSTANTIATE_DPERP(WilsonImplFH);
|
||||
INSTANTIATE_DPERP(WilsonImplDF);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplFH);
|
||||
INSTANTIATE_DPERP(GparityWilsonImplDF);
|
||||
INSTANTIATE_DPERP(ZWilsonImplFH);
|
||||
INSTANTIATE_DPERP(ZWilsonImplDF);
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,838 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*
|
||||
* Dense matrix versions of routines
|
||||
*/
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd= Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd> > u(LLs);
|
||||
Vector<iSinglet<Simd> > l(LLs);
|
||||
Vector<iSinglet<Simd> > d(LLs);
|
||||
|
||||
assert(Ls/LLs==nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type * u_p = (scalar_type *)&u[0];
|
||||
scalar_type * l_p = (scalar_type *)&l[0];
|
||||
scalar_type * d_p = (scalar_type *)&d[0];
|
||||
|
||||
for(int o=0;o<LLs;o++){ // outer
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o+i*LLs;
|
||||
int ss = o*nsimd+i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
|
||||
assert(Nc==3);
|
||||
|
||||
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
|
||||
#if 0
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
int vp=(v+1)%LLs;
|
||||
int vm=(v+LLs-1)%LLs;
|
||||
|
||||
spProj5m(hp,psi[ss+vp]);
|
||||
spProj5p(hm,psi[ss+vm]);
|
||||
|
||||
if ( vp<=v ) rotate(hp,hp,1);
|
||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||
|
||||
hp=0.5*hp;
|
||||
hm=0.5*hm;
|
||||
|
||||
spRecon5m(fp,hp);
|
||||
spRecon5p(fm,hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] +u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
|
||||
}
|
||||
#else
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp= (v==LLs-1) ? 0 : v+1;
|
||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if ( vp<=v ) {
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
if ( vm>=v ) {
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0),p_00);
|
||||
vstream(chi[ss+v]()(0)(1),p_01);
|
||||
vstream(chi[ss+v]()(0)(2),p_02);
|
||||
vstream(chi[ss+v]()(1)(0),p_10);
|
||||
vstream(chi[ss+v]()(1)(1),p_11);
|
||||
vstream(chi[ss+v]()(1)(2),p_12);
|
||||
vstream(chi[ss+v]()(2)(0),p_20);
|
||||
vstream(chi[ss+v]()(2)(1),p_21);
|
||||
vstream(chi[ss+v]()(2)(2),p_22);
|
||||
vstream(chi[ss+v]()(3)(0),p_30);
|
||||
vstream(chi[ss+v]()(3)(1),p_31);
|
||||
vstream(chi[ss+v]()(3)(2),p_32);
|
||||
|
||||
}
|
||||
#endif
|
||||
});
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
auto psi=psi_i.View();
|
||||
auto phi=phi_i.View();
|
||||
auto chi=chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd= Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd> > u(LLs);
|
||||
Vector<iSinglet<Simd> > l(LLs);
|
||||
Vector<iSinglet<Simd> > d(LLs);
|
||||
|
||||
assert(Ls/LLs==nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type * u_p = (scalar_type *)&u[0];
|
||||
scalar_type * l_p = (scalar_type *)&l[0];
|
||||
scalar_type * d_p = (scalar_type *)&d[0];
|
||||
|
||||
for(int o=0;o<LLs;o++){ // outer
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o+i*LLs;
|
||||
int ss = o*nsimd+i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
M5Dcalls++;
|
||||
M5Dtime-=usecond();
|
||||
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
|
||||
#if 0
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
int vp=(v+1)%LLs;
|
||||
int vm=(v+LLs-1)%LLs;
|
||||
|
||||
spProj5p(hp,psi[ss+vp]);
|
||||
spProj5m(hm,psi[ss+vm]);
|
||||
|
||||
if ( vp<=v ) rotate(hp,hp,1);
|
||||
if ( vm>=v ) rotate(hm,hm,nsimd-1);
|
||||
|
||||
hp=hp*0.5;
|
||||
hm=hm*0.5;
|
||||
spRecon5p(fp,hp);
|
||||
spRecon5m(fm,hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
|
||||
}
|
||||
#else
|
||||
for(int v=0;v<LLs;v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp= (v==LLs-1) ? 0 : v+1;
|
||||
int vm= (v==0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if ( vp<=v ) {
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
if ( vm>=v ) {
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
|
||||
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0),p_00);
|
||||
vstream(chi[ss+v]()(0)(1),p_01);
|
||||
vstream(chi[ss+v]()(0)(2),p_02);
|
||||
vstream(chi[ss+v]()(1)(0),p_10);
|
||||
vstream(chi[ss+v]()(1)(1),p_11);
|
||||
vstream(chi[ss+v]()(1)(2),p_12);
|
||||
vstream(chi[ss+v]()(2)(0),p_20);
|
||||
vstream(chi[ss+v]()(2)(1),p_21);
|
||||
vstream(chi[ss+v]()(2)(2),p_22);
|
||||
vstream(chi[ss+v]()(3)(0),p_30);
|
||||
vstream(chi[ss+v]()(3)(1),p_31);
|
||||
vstream(chi[ss+v]()(3)(2),p_32);
|
||||
}
|
||||
#endif
|
||||
});
|
||||
M5Dtime+=usecond();
|
||||
}
|
||||
|
||||
|
||||
#ifdef AVX512
|
||||
#include <simd/Intel512common.h>
|
||||
#include <simd/Intel512avx.h>
|
||||
#include <simd/Intel512single.h>
|
||||
#endif
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField &chi_i,
|
||||
int LLs, int site,
|
||||
Vector<iSinglet<Simd> > &Matp,
|
||||
Vector<iSinglet<Simd> > &Matm)
|
||||
{
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
|
||||
int s=s2+l*LLs;
|
||||
int lex=s2+LLs*site;
|
||||
|
||||
if ( s2==0 && l==0) {
|
||||
SiteChiP=Zero();
|
||||
SiteChiM=Zero();
|
||||
}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
|
||||
}}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
|
||||
SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
|
||||
}}
|
||||
|
||||
}}
|
||||
{
|
||||
int lex = s1+LLs*site;
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
{
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %%zmm1
|
||||
#define Chi_01 %%zmm2
|
||||
#define Chi_02 %%zmm3
|
||||
#define Chi_10 %%zmm4
|
||||
#define Chi_11 %%zmm5
|
||||
#define Chi_12 %%zmm6
|
||||
#define Chi_20 %%zmm7
|
||||
#define Chi_21 %%zmm8
|
||||
#define Chi_22 %%zmm9
|
||||
#define Chi_30 %%zmm10
|
||||
#define Chi_31 %%zmm11
|
||||
#define Chi_32 %%zmm12
|
||||
|
||||
#define BCAST0 %%zmm13
|
||||
#define BCAST1 %%zmm14
|
||||
#define BCAST2 %%zmm15
|
||||
#define BCAST3 %%zmm16
|
||||
#define BCAST4 %%zmm17
|
||||
#define BCAST5 %%zmm18
|
||||
#define BCAST6 %%zmm19
|
||||
#define BCAST7 %%zmm20
|
||||
#define BCAST8 %%zmm21
|
||||
#define BCAST9 %%zmm22
|
||||
#define BCAST10 %%zmm23
|
||||
#define BCAST11 %%zmm24
|
||||
|
||||
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
int lex=s2+LLs*site;
|
||||
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t)&psi[lex];
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
if ( (s2+l)==0 ) {
|
||||
asm (
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0)
|
||||
VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2)
|
||||
VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMULMEM (0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMULMEM (0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMULMEM (0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMULMEM (0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMULMEM (0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMULMEM (0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMULMEM (0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMULMEM (0,%1,BCAST7,Chi_21)
|
||||
VMULMEM (0,%1,BCAST8,Chi_22)
|
||||
VMULMEM (0,%1,BCAST9,Chi_30)
|
||||
VMULMEM (0,%1,BCAST10,Chi_31)
|
||||
VMULMEM (0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
} else {
|
||||
asm (
|
||||
VBCASTCDUP(0,%2,BCAST0) VMADDMEM (0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(1,%2,BCAST1) VMADDMEM (0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(2,%2,BCAST2) VMADDMEM (0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(3,%2,BCAST3) VMADDMEM (0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMADDMEM (0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMADDMEM (0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMADDMEM (0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMADDMEM (0,%1,BCAST7,Chi_21)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMADDMEM (0,%1,BCAST8,Chi_22)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMADDMEM (0,%1,BCAST9,Chi_30)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMADDMEM (0,%1,BCAST10,Chi_31)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMADDMEM (0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
}
|
||||
a0 = a0+incr;
|
||||
a1 = a1+incr;
|
||||
a2 = a2+sizeof(typename Simd::scalar_type);
|
||||
}}
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
asm (
|
||||
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
#endif
|
||||
};
|
||||
|
||||
// Z-mobius version
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField &chi_i,
|
||||
int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
|
||||
{
|
||||
#ifndef AVX512
|
||||
{
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
|
||||
int s=s2+l*LLs;
|
||||
int lex=s2+LLs*site;
|
||||
|
||||
if ( s2==0 && l==0) {
|
||||
SiteChiP=Zero();
|
||||
SiteChiM=Zero();
|
||||
}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
|
||||
}}
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
|
||||
}}
|
||||
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
|
||||
SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
|
||||
}}
|
||||
|
||||
|
||||
}}
|
||||
{
|
||||
int lex = s1+LLs*site;
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<Nc;co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
{
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %zmm0
|
||||
#define Chi_01 %zmm1
|
||||
#define Chi_02 %zmm2
|
||||
#define Chi_10 %zmm3
|
||||
#define Chi_11 %zmm4
|
||||
#define Chi_12 %zmm5
|
||||
#define Chi_20 %zmm6
|
||||
#define Chi_21 %zmm7
|
||||
#define Chi_22 %zmm8
|
||||
#define Chi_30 %zmm9
|
||||
#define Chi_31 %zmm10
|
||||
#define Chi_32 %zmm11
|
||||
#define pChi_00 %%zmm0
|
||||
#define pChi_01 %%zmm1
|
||||
#define pChi_02 %%zmm2
|
||||
#define pChi_10 %%zmm3
|
||||
#define pChi_11 %%zmm4
|
||||
#define pChi_12 %%zmm5
|
||||
#define pChi_20 %%zmm6
|
||||
#define pChi_21 %%zmm7
|
||||
#define pChi_22 %%zmm8
|
||||
#define pChi_30 %%zmm9
|
||||
#define pChi_31 %%zmm10
|
||||
#define pChi_32 %%zmm11
|
||||
|
||||
#define BCAST_00 %zmm12
|
||||
#define SHUF_00 %zmm13
|
||||
#define BCAST_01 %zmm14
|
||||
#define SHUF_01 %zmm15
|
||||
#define BCAST_02 %zmm16
|
||||
#define SHUF_02 %zmm17
|
||||
#define BCAST_10 %zmm18
|
||||
#define SHUF_10 %zmm19
|
||||
#define BCAST_11 %zmm20
|
||||
#define SHUF_11 %zmm21
|
||||
#define BCAST_12 %zmm22
|
||||
#define SHUF_12 %zmm23
|
||||
|
||||
#define Mp %zmm24
|
||||
#define Mps %zmm25
|
||||
#define Mm %zmm26
|
||||
#define Mms %zmm27
|
||||
#define N 8
|
||||
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0;s1<LLs;s1++){
|
||||
for(int s2=0;s2<LLs;s2++){
|
||||
int lex=s2+LLs*site;
|
||||
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t)&psi[lex];
|
||||
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
|
||||
if ( (s2+l)==0 ) {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
asm (
|
||||
VLOAD(0,%r8,Mp)// i r
|
||||
VLOAD(0,%r9,Mm)
|
||||
VSHUF(Mp,Mps) // r i
|
||||
VSHUF(Mm,Mms)
|
||||
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
|
||||
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
|
||||
|
||||
VMULIDUP(0*N,%r10,Mps,Chi_00)
|
||||
VMULIDUP(1*N,%r10,Mps,Chi_01)
|
||||
VMULIDUP(2*N,%r10,Mps,Chi_02)
|
||||
VMULIDUP(3*N,%r10,Mps,Chi_10)
|
||||
VMULIDUP(4*N,%r10,Mps,Chi_11)
|
||||
VMULIDUP(5*N,%r10,Mps,Chi_12)
|
||||
|
||||
VMULIDUP(6*N ,%r10,Mms,Chi_20)
|
||||
VMULIDUP(7*N ,%r10,Mms,Chi_21)
|
||||
VMULIDUP(8*N ,%r10,Mms,Chi_22)
|
||||
VMULIDUP(9*N ,%r10,Mms,Chi_30)
|
||||
VMULIDUP(10*N,%r10,Mms,Chi_31)
|
||||
VMULIDUP(11*N,%r10,Mms,Chi_32)
|
||||
|
||||
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
|
||||
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
|
||||
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
|
||||
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
|
||||
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
|
||||
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
|
||||
|
||||
VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
|
||||
VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
|
||||
VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
|
||||
VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
|
||||
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
|
||||
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
|
||||
);
|
||||
} else {
|
||||
LOAD64(%r8,a0);
|
||||
LOAD64(%r9,a1);
|
||||
LOAD64(%r10,a2);
|
||||
asm (
|
||||
VLOAD(0,%r8,Mp)
|
||||
VSHUF(Mp,Mps)
|
||||
|
||||
VLOAD(0,%r9,Mm)
|
||||
VSHUF(Mm,Mms)
|
||||
|
||||
VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) // Mri * Pii +- Cir
|
||||
VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
|
||||
VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
|
||||
VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
|
||||
VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
|
||||
VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
|
||||
|
||||
VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
|
||||
VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
|
||||
VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
|
||||
VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
|
||||
VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
|
||||
VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
|
||||
|
||||
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) // Cir = Mir * Prr +- ( Mri * Pii +- Cir)
|
||||
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) // Ci = MiPr + Ci + MrPi ; Cr = MrPr - ( MiPi - Cr)
|
||||
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
|
||||
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
|
||||
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
|
||||
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
|
||||
|
||||
VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
|
||||
VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
|
||||
VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
|
||||
VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
|
||||
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
|
||||
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
|
||||
);
|
||||
}
|
||||
a0 = a0+incr;
|
||||
a1 = a1+incr;
|
||||
a2 = a2+sizeof(typename Simd::scalar_type);
|
||||
}}
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
/*
|
||||
SiteSpinor tmp;
|
||||
asm (
|
||||
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
|
||||
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
|
||||
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
|
||||
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
|
||||
: : "r" ((uint64_t)&tmp) : "memory" );
|
||||
*/
|
||||
|
||||
asm (
|
||||
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
|
||||
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
|
||||
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
|
||||
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
|
||||
// if ( 1 || (site==0) ) {
|
||||
// std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
|
||||
{
|
||||
chi.Checkerboard()=psi.Checkerboard();
|
||||
|
||||
int Ls=this->Ls;
|
||||
int LLs = psi.Grid()->_rdimensions[0];
|
||||
int vol = psi.Grid()->oSites()/LLs;
|
||||
|
||||
|
||||
Vector<iSinglet<Simd> > Matp;
|
||||
Vector<iSinglet<Simd> > Matm;
|
||||
Vector<iSinglet<Simd> > *_Matp;
|
||||
Vector<iSinglet<Simd> > *_Matm;
|
||||
|
||||
// MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||
if ( inv && dag ) {
|
||||
_Matp = &MatpInvDag;
|
||||
_Matm = &MatmInvDag;
|
||||
}
|
||||
if ( inv && (!dag) ) {
|
||||
_Matp = &MatpInv;
|
||||
_Matm = &MatmInv;
|
||||
}
|
||||
if ( !inv ) {
|
||||
MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||
_Matp = &Matp;
|
||||
_Matm = &Matm;
|
||||
}
|
||||
assert(_Matp->size()==Ls*LLs);
|
||||
|
||||
MooeeInvCalls++;
|
||||
MooeeInvTime-=usecond();
|
||||
|
||||
if ( switcheroo<Coeff_t>::iscomplex() ) {
|
||||
thread_loop( (auto site=0;site<vol;site++),{
|
||||
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (auto site=0;site<vol;site++),{
|
||||
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
|
||||
});
|
||||
}
|
||||
MooeeInvTime+=usecond();
|
||||
}
|
||||
|
||||
INSTANTIATE_DPERP(DomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP(DomainWallVec5dImplF);
|
||||
INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
|
||||
|
||||
INSTANTIATE_DPERP(DomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP(DomainWallVec5dImplFH);
|
||||
INSTANTIATE_DPERP(ZDomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP(ZDomainWallVec5dImplFH);
|
||||
|
||||
template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
|
||||
template void CayleyFermion5D<DomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,320 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale)
|
||||
{
|
||||
SetCoefficientsZolotarev(1.0/scale,zdata);
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
|
||||
{
|
||||
// How to check Ls matches??
|
||||
// std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
|
||||
int Ls = this->Ls;
|
||||
assert(zdata->db==Ls);// Beta has Ls coeffs
|
||||
|
||||
R=(1+this->mass)/(1-this->mass);
|
||||
|
||||
Beta.resize(Ls);
|
||||
cc.resize(Ls);
|
||||
cc_d.resize(Ls);
|
||||
sqrt_cc.resize(Ls);
|
||||
for(int i=0; i < Ls ; i++){
|
||||
Beta[i] = zdata -> beta[i];
|
||||
cc[i] = 1.0/Beta[i];
|
||||
cc_d[i]=std::sqrt(cc[i]);
|
||||
}
|
||||
|
||||
cc_d[Ls-1]=1.0;
|
||||
for(int i=0; i < Ls-1 ; i++){
|
||||
sqrt_cc[i]= std::sqrt(cc[i]*cc[i+1]);
|
||||
}
|
||||
sqrt_cc[Ls-2]=std::sqrt(cc[Ls-2]);
|
||||
|
||||
|
||||
ZoloHiInv =1.0/zolo_hi;
|
||||
dw_diag = (4.0-this->M5)*ZoloHiInv;
|
||||
|
||||
See.resize(Ls);
|
||||
Aee.resize(Ls);
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
Aee[s] = sign * Beta[s] * dw_diag;
|
||||
sign = - sign;
|
||||
}
|
||||
Aee[Ls-1] += R;
|
||||
|
||||
See[0] = Aee[0];
|
||||
for(int s=1;s<Ls;s++){
|
||||
See[s] = Aee[s] - 1.0/See[s-1];
|
||||
}
|
||||
for(int s=0;s<Ls;s++){
|
||||
std::cout<<GridLogMessage <<"s = "<<s<<" Beta "<<Beta[s]<<" Aee "<<Aee[s] <<" See "<<See[s] <<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class Impl>
|
||||
RealD ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField D(psi.Grid());
|
||||
|
||||
this->DW(psi,D,DaggerNo);
|
||||
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==0 ) {
|
||||
ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*ZoloHiInv,D,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
|
||||
} else if ( s==(Ls-1) ){
|
||||
RealD R=(1.0+mass)/(1.0-mass);
|
||||
ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,D,sqrt_cc[s-1],psi,s,s-1);
|
||||
ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
|
||||
} else {
|
||||
ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,D,sqrt_cc[s],psi,s,s+1);
|
||||
axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
|
||||
}
|
||||
sign=-sign;
|
||||
}
|
||||
return norm2(chi);
|
||||
}
|
||||
template<class Impl>
|
||||
RealD ContinuedFractionFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
// This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
|
||||
// The rest of matrix is symmetric.
|
||||
// Can ignore "dag"
|
||||
return M(psi,chi);
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
|
||||
int Ls = this->Ls;
|
||||
|
||||
this->DhopDir(psi,chi,dir,disp); // Dslash on diagonal. g5 Dslash is hermitian
|
||||
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==(Ls-1) ){
|
||||
ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
|
||||
} else {
|
||||
ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
|
||||
}
|
||||
sign=-sign;
|
||||
}
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
// Apply 4d dslash
|
||||
if ( psi.Checkerboard() == Odd ) {
|
||||
this->DhopEO(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
|
||||
} else {
|
||||
this->DhopOE(psi,chi,DaggerNo); // Dslash on diagonal. g5 Dslash is hermitian
|
||||
}
|
||||
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==(Ls-1) ){
|
||||
ag5xpby_ssp(chi,Beta[s]*ZoloHiInv,chi,0.0,chi,s,s);
|
||||
} else {
|
||||
ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*ZoloHiInv,chi,0.0,chi,s,s);
|
||||
}
|
||||
sign=-sign;
|
||||
}
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::MeooeDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
this->Meooe(psi,chi);
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==0 ) {
|
||||
ag5xpby_ssp(chi,cc[0]*Beta[0]*sign*dw_diag,psi,sqrt_cc[0],psi,s,s+1); // Multiplies Dw by G5 so Hw
|
||||
} else if ( s==(Ls-1) ){
|
||||
// Drop the CC here.
|
||||
double R=(1+mass)/(1-mass);
|
||||
ag5xpby_ssp(chi,Beta[s]*dw_diag,psi,sqrt_cc[s-1],psi,s,s-1);
|
||||
ag5xpby_ssp(chi,R,psi,1.0,chi,s,s);
|
||||
} else {
|
||||
ag5xpby_ssp(chi,cc[s]*Beta[s]*sign*dw_diag,psi,sqrt_cc[s],psi,s,s+1);
|
||||
axpby_ssp(chi,1.0,chi,sqrt_cc[s-1],psi,s,s-1);
|
||||
}
|
||||
sign=-sign;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
this->Mooee(psi,chi);
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
// Apply Linv
|
||||
axpby_ssp(chi,1.0/cc_d[0],psi,0.0,psi,0,0);
|
||||
for(int s=1;s<Ls;s++){
|
||||
axpbg5y_ssp(chi,1.0/cc_d[s],psi,-1.0/See[s-1],chi,s,s-1);
|
||||
}
|
||||
// Apply Dinv
|
||||
for(int s=0;s<Ls;s++){
|
||||
ag5xpby_ssp(chi,1.0/See[s],chi,0.0,chi,s,s); //only appearance of See[0]
|
||||
}
|
||||
// Apply Uinv = (Linv)^T
|
||||
axpby_ssp(chi,1.0/cc_d[Ls-1],chi,0.0,chi,Ls-1,Ls-1);
|
||||
for(int s=Ls-2;s>=0;s--){
|
||||
axpbg5y_ssp(chi,1.0/cc_d[s],chi,-1.0*cc_d[s+1]/See[s]/cc_d[s],chi,s,s+1);
|
||||
}
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
this->MooeeInv(psi,chi);
|
||||
}
|
||||
|
||||
// force terms; five routines; default to Dhop on diagonal
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField D(V.Grid());
|
||||
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==(Ls-1) ){
|
||||
ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
|
||||
} else {
|
||||
ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
|
||||
}
|
||||
sign=-sign;
|
||||
}
|
||||
this->DhopDeriv(mat,D,V,DaggerNo);
|
||||
};
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField D(V.Grid());
|
||||
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==(Ls-1) ){
|
||||
ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
|
||||
} else {
|
||||
ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
|
||||
}
|
||||
sign=-sign;
|
||||
}
|
||||
this->DhopDerivOE(mat,D,V,DaggerNo);
|
||||
};
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField D(V.Grid());
|
||||
|
||||
int sign=1;
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( s==(Ls-1) ){
|
||||
ag5xpby_ssp(D,Beta[s]*ZoloHiInv,U,0.0,U,s,s);
|
||||
} else {
|
||||
ag5xpby_ssp(D,cc[s]*Beta[s]*sign*ZoloHiInv,U,0.0,U,s,s);
|
||||
}
|
||||
sign=-sign;
|
||||
}
|
||||
this->DhopDerivEO(mat,D,V,DaggerNo);
|
||||
};
|
||||
|
||||
// Constructors
|
||||
template<class Impl>
|
||||
ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
|
||||
GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,const ImplParams &p) :
|
||||
WilsonFermion5D<Impl>(_Umu,
|
||||
FiveDimGrid, FiveDimRedBlackGrid,
|
||||
FourDimGrid, FourDimRedBlackGrid,M5,p),
|
||||
mass(_mass)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
assert((Ls&0x1)==1); // Odd Ls required
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
conformable(imported5d.Grid(),this->FermionGrid());
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp=Zero();
|
||||
InsertSlice(input4d, tmp, Ls-1, Ls-1);
|
||||
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
|
||||
this->Dminus(tmp,imported5d);
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(ContinuedFractionFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,433 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid_Eigen_Dense.h>
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
|
||||
GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mq1, RealD _mq2, RealD _mq3,
|
||||
RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
|
||||
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
|
||||
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
|
||||
_shift, _pm, _M5, 1.0, 0.0, p)
|
||||
{
|
||||
RealD eps = 1.0;
|
||||
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
|
||||
assert(zdata->n == this->Ls);
|
||||
|
||||
std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
|
||||
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
|
||||
|
||||
Approx::zolotarev_free(zdata);
|
||||
}
|
||||
|
||||
/***************************************************************
|
||||
* Additional EOFA operators only called outside the inverter.
|
||||
* Since speed is not essential, simple axpby-style
|
||||
* implementations should be fine.
|
||||
***************************************************************/
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Din = Zero();
|
||||
if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
|
||||
else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
|
||||
else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
|
||||
else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
|
||||
}
|
||||
|
||||
// This is just the identity for DWF
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
|
||||
|
||||
// This is just the identity for DWF
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
|
||||
|
||||
/*****************************************************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->Meooe5D(psi, Din);
|
||||
this->DW(Din, chi, DaggerNo);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
this->M5D(psi, chi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->DW(psi, Din, DaggerYes);
|
||||
this->MeooeDag5D(Din, chi);
|
||||
this->M5Ddag(psi, chi);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
/********************************************************************
|
||||
* Performance critical fermion operators called inside the inverter
|
||||
********************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD shift = this->shift;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD mq2 = this->mq2;
|
||||
RealD mq3 = this->mq3;
|
||||
|
||||
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
|
||||
Coeff_t shiftp(0.0), shiftm(0.0);
|
||||
if(shift != 0.0){
|
||||
if(pm == 1){ shiftp = shift*(mq3-mq2); }
|
||||
else{ shiftm = -shift*(mq3-mq2); }
|
||||
}
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
||||
|
||||
#if(0)
|
||||
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
||||
for(int i=0; i<diag.size(); ++i){
|
||||
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<upper.size(); ++i){
|
||||
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<lower.size(); ++i){
|
||||
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
this->M5D(psi, chi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD shift = this->shift;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD mq2 = this->mq2;
|
||||
RealD mq3 = this->mq3;
|
||||
|
||||
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
|
||||
Coeff_t shiftp(0.0), shiftm(0.0);
|
||||
if(shift != 0.0){
|
||||
if(pm == 1){ shiftp = shift*(mq3-mq2); }
|
||||
else{ shiftm = -shift*(mq3-mq2); }
|
||||
}
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
||||
|
||||
#if(0)
|
||||
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
|
||||
for(int i=0; i<diag.size(); ++i){
|
||||
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<upper.size(); ++i){
|
||||
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<lower.size(); ++i){
|
||||
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
// half checkerboard operations
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
lower[s] = -this->cee[s];
|
||||
}
|
||||
upper[Ls-1] = this->dm;
|
||||
lower[0] = this->dp;
|
||||
|
||||
this->M5D(psi, psi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
lower[s] = -this->cee[s];
|
||||
}
|
||||
upper[Ls-1] = this->dp;
|
||||
lower[0] = this->dm;
|
||||
|
||||
this->M5Ddag(psi, psi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
/****************************************************************************************/
|
||||
|
||||
//Zolo
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD mq2 = this->mq2;
|
||||
RealD mq3 = this->mq3;
|
||||
RealD shift = this->shift;
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Constants for the preconditioned matrix Cayley form
|
||||
////////////////////////////////////////////////////////
|
||||
this->bs.resize(Ls);
|
||||
this->cs.resize(Ls);
|
||||
this->aee.resize(Ls);
|
||||
this->aeo.resize(Ls);
|
||||
this->bee.resize(Ls);
|
||||
this->beo.resize(Ls);
|
||||
this->cee.resize(Ls);
|
||||
this->ceo.resize(Ls);
|
||||
|
||||
for(int i=0; i<Ls; ++i){
|
||||
this->bee[i] = 4.0 - this->M5 + 1.0;
|
||||
this->cee[i] = 1.0;
|
||||
}
|
||||
|
||||
for(int i=0; i<Ls; ++i){
|
||||
this->aee[i] = this->cee[i];
|
||||
this->bs[i] = this->beo[i] = 1.0;
|
||||
this->cs[i] = this->ceo[i] = 0.0;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// EOFA shift terms
|
||||
//////////////////////////////////////////
|
||||
if(pm == 1){
|
||||
this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
|
||||
this->dm = mq1*this->cee[Ls-1];
|
||||
} else if(this->pm == -1) {
|
||||
this->dp = mq1*this->cee[0];
|
||||
this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
|
||||
} else {
|
||||
this->dp = mq1*this->cee[0];
|
||||
this->dm = mq1*this->cee[Ls-1];
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// LDU decomposition of eeoo
|
||||
//////////////////////////////////////////
|
||||
this->dee.resize(Ls+1);
|
||||
this->lee.resize(Ls);
|
||||
this->leem.resize(Ls);
|
||||
this->uee.resize(Ls);
|
||||
this->ueem.resize(Ls);
|
||||
|
||||
for(int i=0; i<Ls; ++i){
|
||||
|
||||
if(i < Ls-1){
|
||||
|
||||
this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
|
||||
|
||||
this->leem[i] = this->dm/this->bee[i];
|
||||
for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
|
||||
|
||||
this->dee[i] = this->bee[i];
|
||||
|
||||
this->uee[i] = -this->aee[i]/this->bee[i]; // up-diag entry on the ith row
|
||||
|
||||
this->ueem[i] = this->dp / this->bee[0];
|
||||
for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
|
||||
|
||||
} else {
|
||||
|
||||
this->lee[i] = 0.0;
|
||||
this->leem[i] = 0.0;
|
||||
this->uee[i] = 0.0;
|
||||
this->ueem[i] = 0.0;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
Coeff_t delta_d = 1.0 / this->bee[0];
|
||||
for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
|
||||
this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
|
||||
this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
|
||||
}
|
||||
|
||||
int inv = 1;
|
||||
this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
|
||||
this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
|
||||
}
|
||||
|
||||
// Recompute Cayley-form coefficients for different shift
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
|
||||
{
|
||||
this->shift = new_shift;
|
||||
Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
|
||||
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
|
||||
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
GridBase* grid = this->FermionRedBlackGrid();
|
||||
int LLs = grid->_rdimensions[0];
|
||||
|
||||
if(LLs == Ls){ return; } // Not vectorised in 5th direction
|
||||
|
||||
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
Pplus(s,s) = this->bee[s];
|
||||
Pminus(s,s) = this->bee[s];
|
||||
}
|
||||
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
Pminus(s,s+1) = -this->cee[s];
|
||||
}
|
||||
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
Pplus(s+1,s) = -this->cee[s+1];
|
||||
}
|
||||
|
||||
Pplus (0,Ls-1) = this->dp;
|
||||
Pminus(Ls-1,0) = this->dm;
|
||||
|
||||
Eigen::MatrixXcd PplusMat ;
|
||||
Eigen::MatrixXcd PminusMat;
|
||||
|
||||
#if(0)
|
||||
std::cout << GridLogMessage << "Pplus:" << std::endl;
|
||||
for(int s=0; s<Ls; ++s){
|
||||
for(int ss=0; ss<Ls; ++ss){
|
||||
std::cout << Pplus(s,ss) << "\t";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << GridLogMessage << "Pminus:" << std::endl;
|
||||
for(int s=0; s<Ls; ++s){
|
||||
for(int ss=0; ss<Ls; ++ss){
|
||||
std::cout << Pminus(s,ss) << "\t";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if(inv) {
|
||||
PplusMat = Pplus.inverse();
|
||||
PminusMat = Pminus.inverse();
|
||||
} else {
|
||||
PplusMat = Pplus;
|
||||
PminusMat = Pminus;
|
||||
}
|
||||
|
||||
if(dag){
|
||||
PplusMat.adjointInPlace();
|
||||
PminusMat.adjointInPlace();
|
||||
}
|
||||
|
||||
typedef typename SiteHalfSpinor::scalar_type scalar_type;
|
||||
const int Nsimd = Simd::Nsimd();
|
||||
Matp.resize(Ls*LLs);
|
||||
Matm.resize(Ls*LLs);
|
||||
|
||||
for(int s2=0; s2<Ls; s2++){
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
int istride = LLs;
|
||||
int ostride = 1;
|
||||
Simd Vp;
|
||||
Simd Vm;
|
||||
scalar_type *sp = (scalar_type*) &Vp;
|
||||
scalar_type *sm = (scalar_type*) &Vm;
|
||||
for(int l=0; l<Nsimd; l++){
|
||||
if(switcheroo<Coeff_t>::iscomplex()) {
|
||||
sp[l] = PplusMat (l*istride+s1*ostride,s2);
|
||||
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
||||
} else {
|
||||
// if real
|
||||
scalar_type tmp;
|
||||
tmp = PplusMat (l*istride+s1*ostride,s2);
|
||||
sp[l] = scalar_type(tmp.real(),tmp.real());
|
||||
tmp = PminusMat(l*istride+s1*ostride,s2);
|
||||
sm[l] = scalar_type(tmp.real(),tmp.real());
|
||||
}
|
||||
}
|
||||
Matp[LLs*s2+s1] = Vp;
|
||||
Matm[LLs*s2+s1] = Vm;
|
||||
}}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||
GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,255 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
|
||||
|
||||
// Pminus fowards
|
||||
// Pplus backwards..
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
int Ls = this->Ls;
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto phi = phi_i.View();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
for(int s=0; s<Ls; s++){
|
||||
auto tmp = psi[0];
|
||||
if(s==0) {
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5m(tmp, psi[ss+0]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
auto tmp = psi[0];
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(s==0) {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5p(tmp, psi[ss+0]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi=psi_i.View();
|
||||
auto chi=chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
|
||||
auto tmp1 = psi[0];
|
||||
auto tmp2 = psi[0];
|
||||
|
||||
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
||||
// Apply (L^{\prime})^{-1}
|
||||
chi[ss] = psi[ss]; // chi[0]=psi[0]
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5p(tmp1, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
|
||||
}
|
||||
|
||||
// L_m^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||
spProj5m(tmp1, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
|
||||
}
|
||||
|
||||
// U_m^{-1} D^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
|
||||
spProj5p(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
|
||||
}
|
||||
spProj5m(tmp2, chi[ss+Ls-1]);
|
||||
chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
|
||||
|
||||
// Apply U^{-1}
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
spProj5m(tmp1, chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
|
||||
assert(psi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
Vector<Coeff_t> ueec(Ls);
|
||||
Vector<Coeff_t> deec(Ls+1);
|
||||
Vector<Coeff_t> leec(Ls);
|
||||
Vector<Coeff_t> ueemc(Ls);
|
||||
Vector<Coeff_t> leemc(Ls);
|
||||
|
||||
for(int s=0; s<ueec.size(); s++){
|
||||
ueec[s] = conjugate(this->uee[s]);
|
||||
deec[s] = conjugate(this->dee[s]);
|
||||
leec[s] = conjugate(this->lee[s]);
|
||||
ueemc[s] = conjugate(this->ueem[s]);
|
||||
leemc[s] = conjugate(this->leem[s]);
|
||||
}
|
||||
deec[Ls] = conjugate(this->dee[Ls]);
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
|
||||
auto tmp1 = psi[0];
|
||||
auto tmp2 = psi[0];
|
||||
|
||||
// Apply (U^{\prime})^{-dagger}
|
||||
chi[ss] = psi[ss];
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5m(tmp1, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
|
||||
}
|
||||
|
||||
// U_m^{-\dagger}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5p(tmp1, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
|
||||
}
|
||||
|
||||
// L_m^{-\dagger} D^{-dagger}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5m(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
|
||||
}
|
||||
spProj5p(tmp2, chi[ss+Ls-1]);
|
||||
chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
|
||||
|
||||
// Apply L^{-dagger}
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
spProj5p(tmp1, chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,613 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
/*
|
||||
* Dense matrix versions of routines
|
||||
*/
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd> > u(LLs);
|
||||
Vector<iSinglet<Simd> > l(LLs);
|
||||
Vector<iSinglet<Simd> > d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0;o<LLs;o++){ // outer
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
assert(Nc == 3);
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5m(hp, psi[ss+vp]);
|
||||
spProj5p(hm, psi[ss+vm]);
|
||||
|
||||
if (vp <= v){ rotate(hp, hp, 1); }
|
||||
if (vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = 0.5*hp;
|
||||
hm = 0.5*hm;
|
||||
|
||||
spRecon5m(fp, hp);
|
||||
spRecon5p(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] + u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] + l[v]*fm;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v==LLs-1) ? 0 : v+1;
|
||||
int vm = (v==0) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if(vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
|
||||
#endif
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd> > u(LLs);
|
||||
Vector<iSinglet<Simd> > l(LLs);
|
||||
Vector<iSinglet<Simd> > d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5p(hp, psi[ss+vp]);
|
||||
spProj5m(hm, psi[ss+vm]);
|
||||
|
||||
if(vp <= v){ rotate(hp, hp, 1); }
|
||||
if(vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = hp*0.5;
|
||||
hm = hm*0.5;
|
||||
spRecon5p(fp, hp);
|
||||
spRecon5m(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if (vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
#endif
|
||||
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
#ifdef AVX512
|
||||
#include<simd/Intel512common.h>
|
||||
#include<simd/Intel512avx.h>
|
||||
#include<simd/Intel512single.h>
|
||||
#endif
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
|
||||
|
||||
int s = s2 + l*LLs;
|
||||
int lex = s2 + LLs*site;
|
||||
|
||||
if( s2==0 && l==0 ){
|
||||
SiteChiP=Zero();
|
||||
SiteChiM=Zero();
|
||||
}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
|
||||
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
|
||||
}}
|
||||
}}
|
||||
|
||||
{
|
||||
int lex = s1 + LLs*site;
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
{
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %%zmm1
|
||||
#define Chi_01 %%zmm2
|
||||
#define Chi_02 %%zmm3
|
||||
#define Chi_10 %%zmm4
|
||||
#define Chi_11 %%zmm5
|
||||
#define Chi_12 %%zmm6
|
||||
#define Chi_20 %%zmm7
|
||||
#define Chi_21 %%zmm8
|
||||
#define Chi_22 %%zmm9
|
||||
#define Chi_30 %%zmm10
|
||||
#define Chi_31 %%zmm11
|
||||
#define Chi_32 %%zmm12
|
||||
|
||||
#define BCAST0 %%zmm13
|
||||
#define BCAST1 %%zmm14
|
||||
#define BCAST2 %%zmm15
|
||||
#define BCAST3 %%zmm16
|
||||
#define BCAST4 %%zmm17
|
||||
#define BCAST5 %%zmm18
|
||||
#define BCAST6 %%zmm19
|
||||
#define BCAST7 %%zmm20
|
||||
#define BCAST8 %%zmm21
|
||||
#define BCAST9 %%zmm22
|
||||
#define BCAST10 %%zmm23
|
||||
#define BCAST11 %%zmm24
|
||||
|
||||
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
|
||||
int lex = s2 + LLs*site;
|
||||
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t) &psi[lex];
|
||||
|
||||
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
|
||||
if((s2+l)==0) {
|
||||
asm(
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0)
|
||||
VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2)
|
||||
VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
|
||||
VMULMEM(0,%1,BCAST8,Chi_22)
|
||||
VMULMEM(0,%1,BCAST9,Chi_30)
|
||||
VMULMEM(0,%1,BCAST10,Chi_31)
|
||||
VMULMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
} else {
|
||||
asm(
|
||||
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
}
|
||||
a0 = a0 + incr;
|
||||
a1 = a1 + incr;
|
||||
a2 = a2 + sizeof(typename Simd::scalar_type);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
asm (
|
||||
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
#endif
|
||||
};
|
||||
|
||||
// Z-mobius version
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
|
||||
exit(-1);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||
{
|
||||
chi.Checkerboard() = psi.Checkerboard();
|
||||
int Ls = this->Ls;
|
||||
int LLs = psi.Grid()->_rdimensions[0];
|
||||
int vol = psi.Grid()->oSites()/LLs;
|
||||
|
||||
Vector<iSinglet<Simd> > Matp;
|
||||
Vector<iSinglet<Simd> > Matm;
|
||||
Vector<iSinglet<Simd> > *_Matp;
|
||||
Vector<iSinglet<Simd> > *_Matm;
|
||||
|
||||
// MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||
if(inv && dag){
|
||||
_Matp = &this->MatpInvDag;
|
||||
_Matm = &this->MatmInvDag;
|
||||
}
|
||||
|
||||
if(inv && (!dag)){
|
||||
_Matp = &this->MatpInv;
|
||||
_Matm = &this->MatmInv;
|
||||
}
|
||||
|
||||
if(!inv){
|
||||
MooeeInternalCompute(dag, inv, Matp, Matm);
|
||||
_Matp = &Matp;
|
||||
_Matm = &Matm;
|
||||
}
|
||||
|
||||
assert(_Matp->size() == Ls*LLs);
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
if(switcheroo<Coeff_t>::iscomplex()){
|
||||
thread_loop((auto site=0; site<vol; site++),{
|
||||
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
} else {
|
||||
thread_loop((auto site=0; site<vol; site++){
|
||||
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
}
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
|
||||
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
267
Grid/qcd/action/fermion/DomainWallVec5dImpl.h
Normal file
267
Grid/qcd/action/fermion/DomainWallVec5dImpl.h
Normal file
@ -0,0 +1,267 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
|
||||
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > {
|
||||
public:
|
||||
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
static const bool LsVectorised=true;
|
||||
static const int Nhcs = Options::Nhcs;
|
||||
|
||||
typedef typename Options::_Coeff_t Coeff_t;
|
||||
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
|
||||
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
typedef iImplHalfCommSpinor<SimdL> SiteHalfCommSpinor;
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
typedef Lattice<SitePropagator> PropagatorField;
|
||||
|
||||
/////////////////////////////////////////////////
|
||||
// Make the doubled gauge field a *scalar*
|
||||
/////////////////////////////////////////////////
|
||||
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
|
||||
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
|
||||
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
|
||||
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
|
||||
typedef WilsonImplParams ImplParams;
|
||||
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
|
||||
typedef typename StencilImpl::View_type StencilView;
|
||||
|
||||
ImplParams Params;
|
||||
|
||||
DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
|
||||
|
||||
template <class ref>
|
||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||
{
|
||||
vsplat(reg, memory);
|
||||
}
|
||||
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi, int mu, StencilEntry *SE,
|
||||
StencilView &St)
|
||||
{
|
||||
#ifdef GPU_VEC
|
||||
// Gauge link is scalarised
|
||||
mult(&phi(), &U(mu), &chi());
|
||||
#else
|
||||
SiteGaugeLink UU;
|
||||
for (int i = 0; i < Dimension; i++) {
|
||||
for (int j = 0; j < Dimension; j++) {
|
||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
||||
}
|
||||
}
|
||||
mult(&phi(), &UU(), &chi());
|
||||
#endif
|
||||
}
|
||||
#ifdef GPU_VEC
|
||||
static accelerator_inline void copyLinkGpu(int lane,
|
||||
SiteDoubledGaugeField & UU,
|
||||
const SiteDoubledGaugeField &U)
|
||||
{
|
||||
UU = U;
|
||||
}
|
||||
static accelerator_inline void multLinkGpu(int lane,
|
||||
typename SiteHalfSpinor::scalar_object &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const typename SiteHalfSpinor::scalar_object &chi,
|
||||
int mu)
|
||||
{
|
||||
#if 1
|
||||
typedef typename ExtractTypeMap<typename Simd::scalar_type>::extract_type extract_type;
|
||||
|
||||
SiteScalarGaugeLink U_l;
|
||||
|
||||
extract_type * U_mem = (extract_type *) &U(mu);
|
||||
extract_type * U_stack= (extract_type *) &U_l;
|
||||
|
||||
for(int w=0;w<(sizeof(U_l)/sizeof(extract_type)) ;w++) U_stack[w] = U_mem[w];
|
||||
|
||||
phi() = U_l() * chi();
|
||||
#else
|
||||
auto U_l = U(mu);
|
||||
|
||||
phi() = U_l * chi();
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
static accelerator_inline void multLinkGpu(int lane,
|
||||
SiteHalfSpinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SiteHalfSpinor &chi,
|
||||
int mu)
|
||||
{
|
||||
auto U_l = U(mu);
|
||||
phi() = U_l * chi();
|
||||
}
|
||||
#endif
|
||||
|
||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SitePropagator &chi,int mu)
|
||||
{
|
||||
SiteGaugeLink UU;
|
||||
for (int i = 0; i < Dimension; i++) {
|
||||
for (int j = 0; j < Dimension; j++) {
|
||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
||||
}
|
||||
}
|
||||
mult(&phi(), &UU(), &chi());
|
||||
}
|
||||
|
||||
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
|
||||
{
|
||||
SiteScalarGaugeField ScalarUmu;
|
||||
SiteDoubledGaugeField ScalarUds;
|
||||
|
||||
GaugeLinkField U(Umu.Grid());
|
||||
GaugeField Uadj(Umu.Grid());
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
U = adj(Cshift(U, mu, -1));
|
||||
PokeIndex<LorentzIndex>(Uadj, U, mu);
|
||||
}
|
||||
|
||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
||||
Coordinate lcoor;
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
|
||||
peekLocalSite(ScalarUmu, Umu, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||
|
||||
peekLocalSite(ScalarUmu, Uadj, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
||||
|
||||
pokeLocalSite(ScalarUds, Uds, lcoor);
|
||||
}
|
||||
}
|
||||
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void outerProductImpl(PropagatorField &mat, const FermionField &Btilde, const FermionField &A){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) {
|
||||
|
||||
assert(0);
|
||||
// Following lines to be revised after Peter's addition of half prec
|
||||
// missing put lane...
|
||||
/*
|
||||
typedef decltype(traceIndex<SpinIndex>(outerProduct(Btilde[0], Atilde[0]))) result_type;
|
||||
unsigned int LLs = Btilde.Grid()->_rdimensions[0];
|
||||
conformable(Atilde.Grid(),Btilde.Grid());
|
||||
GridBase* grid = mat.Grid();
|
||||
GridBase* Bgrid = Btilde.Grid();
|
||||
unsigned int dimU = grid->Nd();
|
||||
unsigned int dimF = Bgrid->Nd();
|
||||
GaugeLinkField tmp(grid);
|
||||
tmp = Zero();
|
||||
|
||||
// FIXME
|
||||
// Current implementation works, thread safe, probably suboptimal
|
||||
// Passing through the local coordinate for grid transformation
|
||||
// the force grid is in general very different from the Ls vectorized grid
|
||||
|
||||
for (int so = 0; so < grid->oSites(); so++) {
|
||||
std::vector<typename result_type::scalar_object> vres(Bgrid->Nsimd());
|
||||
std::vector<int> ocoor; grid->oCoorFromOindex(ocoor,so);
|
||||
for (int si = 0; si < tmp.Grid()->iSites(); si++){
|
||||
typename result_type::scalar_object scalar_object; scalar_object = Zero();
|
||||
std::vector<int> local_coor;
|
||||
std::vector<int> icoor; grid->iCoorFromIindex(icoor,si);
|
||||
grid->InOutCoorToLocalCoor(ocoor, icoor, local_coor);
|
||||
for (int s = 0; s < LLs; s++) {
|
||||
std::vector<int> slocal_coor(dimF);
|
||||
slocal_coor[0] = s;
|
||||
for (int s4d = 1; s4d< dimF; s4d++) slocal_coor[s4d] = local_coor[s4d-1];
|
||||
int sF = Bgrid->oIndexReduced(slocal_coor);
|
||||
assert(sF < Bgrid->oSites());
|
||||
|
||||
extract(traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])), vres);
|
||||
// sum across the 5d dimension
|
||||
for (auto v : vres) scalar_object += v;
|
||||
}
|
||||
tmp[so].putlane(scalar_object, si);
|
||||
}
|
||||
}
|
||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||
*/
|
||||
}
|
||||
};
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
|
||||
|
||||
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
|
||||
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
|
||||
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,625 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int>
|
||||
ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int>
|
||||
ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
|
||||
|
||||
/////////////////////////////////
|
||||
// Constructor and gauge import
|
||||
/////////////////////////////////
|
||||
|
||||
|
||||
template <class Impl>
|
||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
||||
RealD _mass,
|
||||
RealD _c1, RealD _c2,RealD _u0,
|
||||
const ImplParams &p)
|
||||
: Kernels(p),
|
||||
_grid(&Fgrid),
|
||||
_cbgrid(&Hgrid),
|
||||
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
|
||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||
mass(_mass),
|
||||
Lebesgue(_grid),
|
||||
LebesgueEvenOdd(_cbgrid),
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd(&Hgrid),
|
||||
UUUmu(&Fgrid),
|
||||
UUUmuEven(&Hgrid),
|
||||
UUUmuOdd(&Hgrid) ,
|
||||
_tmp(&Hgrid)
|
||||
{
|
||||
int vol4;
|
||||
int LLs=1;
|
||||
c1=_c1;
|
||||
c2=_c2;
|
||||
u0=_u0;
|
||||
vol4= _grid->oSites();
|
||||
Stencil.BuildSurfaceList(LLs,vol4);
|
||||
vol4= _cbgrid->oSites();
|
||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
RealD _c1, RealD _c2,RealD _u0,
|
||||
const ImplParams &p)
|
||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
|
||||
{
|
||||
ImportGauge(_Uthin,_Ufat);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Momentum space propagator should be
|
||||
// https://arxiv.org/pdf/hep-lat/9712010.pdf
|
||||
//
|
||||
// mom space action.
|
||||
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
|
||||
//
|
||||
// must track through staggered flavour/spin reduction in literature to
|
||||
// turn to free propagator for the one component chi field, a la page 4/5
|
||||
// of above link to implmement fourier based solver.
|
||||
////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||
{
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Trivial import; phases and fattening and such like preapplied
|
||||
/////////////////////////////////////////////////////////////////
|
||||
GaugeLinkField U(GaugeGrid());
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
U = PeekIndex<LorentzIndex>(_Utriple, mu);
|
||||
PokeIndex<LorentzIndex>(UUUmu, U, mu );
|
||||
|
||||
U = adj( Cshift(U, mu, -3));
|
||||
PokeIndex<LorentzIndex>(UUUmu, -U, mu+4 );
|
||||
|
||||
U = PeekIndex<LorentzIndex>(_Ufat, mu);
|
||||
PokeIndex<LorentzIndex>(Umu, U, mu);
|
||||
|
||||
U = adj( Cshift(U, mu, -1));
|
||||
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
|
||||
|
||||
}
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||
{
|
||||
|
||||
Umu = _U;
|
||||
UUUmu = _UUU;
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
|
||||
{
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
|
||||
{
|
||||
GaugeLinkField U(GaugeGrid());
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Double Store should take two fields for Naik and one hop separately.
|
||||
////////////////////////////////////////////////////////
|
||||
Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Apply scale factors to get the right fermion Kinetic term
|
||||
// Could pass coeffs into the double store to save work.
|
||||
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
|
||||
////////////////////////////////////////////////////////
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
|
||||
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu+4);
|
||||
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
|
||||
|
||||
U = PeekIndex<LorentzIndex>(UUUmu, mu);
|
||||
PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu );
|
||||
|
||||
U = PeekIndex<LorentzIndex>(UUUmu, mu+4);
|
||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||
}
|
||||
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Implement the interface
|
||||
/////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerNo);
|
||||
return axpy_norm(out, mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerYes);
|
||||
return axpy_norm(out, mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(mass);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0 / (mass)) * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
|
||||
FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in, out);
|
||||
}
|
||||
|
||||
///////////////////////////////////
|
||||
// Internal
|
||||
///////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
GaugeField & mat,
|
||||
const FermionField &A, const FermionField &B, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
||||
Compressor compressor;
|
||||
|
||||
FermionField Btilde(B.Grid());
|
||||
FermionField Atilde(B.Grid());
|
||||
Atilde = A;
|
||||
|
||||
st.HaloExchange(B, compressor);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
////////////////////////
|
||||
// Call the single hop
|
||||
////////////////////////
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto B_v = B.View();
|
||||
auto Btilde_v = Btilde.View();
|
||||
thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++), {
|
||||
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||
});
|
||||
|
||||
// Force in three link terms
|
||||
//
|
||||
// Impl::InsertForce4D(mat, Btilde, Atilde, mu);
|
||||
//
|
||||
// dU_ac(x)/dt = i p_ab U_bc(x)
|
||||
//
|
||||
// => dS_f/dt = dS_f/dU_ac(x) . dU_ac(x)/dt = i p_ab U_bc(x) dS_f/dU_ac(x)
|
||||
//
|
||||
// One link: form fragments S_f = A U B
|
||||
//
|
||||
// write Btilde = U(x) B(x+mu)
|
||||
//
|
||||
// mat+= TraceIndex<SpinIndex>(outerProduct(Btilde,A));
|
||||
//
|
||||
// Three link: form fragments S_f = A UUU B
|
||||
//
|
||||
// mat+= outer ( A, UUUB) <-- Best take DhopDeriv with one linke or identity matrix
|
||||
// mat+= outer ( AU, UUB) <-- and then use covariant cshift?
|
||||
// mat+= outer ( AUU, UB) <-- Returned from call to DhopDir
|
||||
|
||||
assert(0);// need to figure out the force interface with a blasted three link term.
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
|
||||
conformable(U.Grid(), _grid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
conformable(U.Grid(), mat.Grid());
|
||||
|
||||
mat.Checkerboard() = U.Checkerboard();
|
||||
|
||||
DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
|
||||
conformable(U.Grid(), _cbgrid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
conformable(U.Grid(), mat.Grid());
|
||||
|
||||
assert(V.Checkerboard() == Even);
|
||||
assert(U.Checkerboard() == Odd);
|
||||
mat.Checkerboard() = Odd;
|
||||
|
||||
DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
|
||||
conformable(U.Grid(), _cbgrid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
conformable(U.Grid(), mat.Grid());
|
||||
|
||||
assert(V.Checkerboard() == Odd);
|
||||
assert(U.Checkerboard() == Even);
|
||||
mat.Checkerboard() = Even;
|
||||
|
||||
DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
||||
{
|
||||
DhopCalls+=2;
|
||||
conformable(in.Grid(), _grid); // verifies full grid
|
||||
conformable(in.Grid(), out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard() == Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard() == Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
|
||||
DhopDir(in, out, dir, disp);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
|
||||
|
||||
Compressor compressor;
|
||||
Stencil.HaloExchange(in, compressor);
|
||||
auto Umu_v = Umu.View();
|
||||
auto UUUmu_v = UUUmu.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) , {
|
||||
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||
});
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
Compressor compressor;
|
||||
int len = U.Grid()->oSites();
|
||||
const int LLs = 1;
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
|
||||
DhopFaceTime -= usecond();
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
st.CommsMergeSHM(compressor);
|
||||
DhopFaceTime += usecond();
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
DhopComputeTime -= usecond();
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
|
||||
if (tid >= ncomms) {
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = len;
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
|
||||
// do the compute
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
// Interior = 1; Exterior = 0; must implement for staggered
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
// Interior = 1; Exterior = 0;
|
||||
int sU = ss;
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
st.CommunicateThreaded();
|
||||
}
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
|
||||
// First to enter, last to leave timing
|
||||
DhopFaceTime -= usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime -= usecond();
|
||||
|
||||
DhopComputeTime2 -= usecond();
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
||||
});
|
||||
} else {
|
||||
int sz=st.surface_list.size();
|
||||
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
||||
});
|
||||
}
|
||||
}
|
||||
DhopComputeTime2 += usecond();
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
|
||||
DhopCommTime -= usecond();
|
||||
Compressor compressor;
|
||||
st.HaloExchange(in, compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
DhopComputeTime -= usecond();
|
||||
if (dag == DaggerYes) {
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++), {
|
||||
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++), {
|
||||
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
||||
});
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Reporting
|
||||
////////////////////////////////////////////////////////////////
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::Report(void)
|
||||
{
|
||||
Coordinate latt = _grid->GlobalDimensions();
|
||||
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = _grid->_Nprocessors;
|
||||
RealD NN = _grid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
|
||||
<< DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
|
||||
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
|
||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
|
||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
|
||||
// Average the compute time
|
||||
_grid->GlobalSum(DhopComputeTime);
|
||||
DhopComputeTime/=NP;
|
||||
|
||||
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||
|
||||
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
|
||||
{
|
||||
DhopCalls = 0;
|
||||
DhopTotalTime = 0;
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
DhopFaceTime = 0;
|
||||
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Conserved current - not yet implemented.
|
||||
////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax,
|
||||
ComplexField &lattice_cmplx)
|
||||
{
|
||||
assert(0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
|
||||
|
||||
//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
|
||||
//TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,672 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
||||
#include <Grid/perfmon/PerfCount.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
const std::vector<int>
|
||||
ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
|
||||
const std::vector<int>
|
||||
ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
|
||||
|
||||
// 5d lattice for DWF.
|
||||
template<class Impl>
|
||||
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,
|
||||
RealD _c1,RealD _c2, RealD _u0,
|
||||
const ImplParams &p) :
|
||||
Kernels(p),
|
||||
_FiveDimGrid (&FiveDimGrid),
|
||||
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
|
||||
_FourDimGrid (&FourDimGrid),
|
||||
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
|
||||
Stencil (&FiveDimGrid,npoint,Even,directions,displacements,p),
|
||||
StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
|
||||
StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
|
||||
mass(_mass),
|
||||
c1(_c1),
|
||||
c2(_c2),
|
||||
u0(_u0),
|
||||
Umu(&FourDimGrid),
|
||||
UmuEven(&FourDimRedBlackGrid),
|
||||
UmuOdd (&FourDimRedBlackGrid),
|
||||
UUUmu(&FourDimGrid),
|
||||
UUUmuEven(&FourDimRedBlackGrid),
|
||||
UUUmuOdd(&FourDimRedBlackGrid),
|
||||
Lebesgue(&FourDimGrid),
|
||||
LebesgueEvenOdd(&FourDimRedBlackGrid),
|
||||
_tmp(&FiveDimRedBlackGrid)
|
||||
{
|
||||
|
||||
// some assertions
|
||||
assert(FiveDimGrid._ndimension==5);
|
||||
assert(FourDimGrid._ndimension==4);
|
||||
assert(FourDimRedBlackGrid._ndimension==4);
|
||||
assert(FiveDimRedBlackGrid._ndimension==5);
|
||||
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
|
||||
|
||||
// extent of fifth dim and not spread out
|
||||
Ls=FiveDimGrid._fdimensions[0];
|
||||
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
|
||||
assert(FiveDimGrid._processors[0] ==1);
|
||||
assert(FiveDimRedBlackGrid._processors[0] ==1);
|
||||
|
||||
// Other dimensions must match the decomposition of the four-D fields
|
||||
for(int d=0;d<4;d++){
|
||||
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
|
||||
|
||||
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
|
||||
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
|
||||
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
|
||||
|
||||
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
|
||||
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
|
||||
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
|
||||
}
|
||||
|
||||
if (Impl::LsVectorised) {
|
||||
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
// Dimension zero of the five-d is the Ls direction
|
||||
assert(FiveDimGrid._simd_layout[0] ==nsimd);
|
||||
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
|
||||
|
||||
for(int d=0;d<4;d++){
|
||||
assert(FourDimGrid._simd_layout[d]==1);
|
||||
assert(FourDimRedBlackGrid._simd_layout[d]==1);
|
||||
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// Dimension zero of the five-d is the Ls direction
|
||||
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
|
||||
assert(FiveDimGrid._simd_layout[0] ==1);
|
||||
|
||||
}
|
||||
int LLs = FiveDimGrid._rdimensions[0];
|
||||
int vol4= FourDimGrid.oSites();
|
||||
Stencil.BuildSurfaceList(LLs,vol4);
|
||||
|
||||
vol4=FourDimRedBlackGrid.oSites();
|
||||
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
|
||||
{
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
||||
}
|
||||
template<class Impl>
|
||||
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,
|
||||
RealD _c1,RealD _c2, RealD _u0,
|
||||
const ImplParams &p) :
|
||||
ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
|
||||
FourDimGrid,FourDimRedBlackGrid,
|
||||
_mass,_c1,_c2,_u0,p)
|
||||
{
|
||||
ImportGauge(_Uthin,_Ufat);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// For MILC use; pass three link U's and 1 link U
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||
{
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Trivial import; phases and fattening and such like preapplied
|
||||
/////////////////////////////////////////////////////////////////
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
|
||||
Impl::InsertGaugeField(UUUmu,U,mu);
|
||||
|
||||
U = adj( Cshift(U, mu, -3));
|
||||
Impl::InsertGaugeField(UUUmu,-U,mu+4);
|
||||
|
||||
U = PeekIndex<LorentzIndex>(_Ufat, mu);
|
||||
Impl::InsertGaugeField(Umu,U,mu);
|
||||
|
||||
U = adj( Cshift(U, mu, -1));
|
||||
Impl::InsertGaugeField(Umu,-U,mu+4);
|
||||
|
||||
}
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||
{
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Trivial import; phases and fattening and such like preapplied
|
||||
/////////////////////////////////////////////////////////////////
|
||||
Umu = _U;
|
||||
UUUmu = _UUU;
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
|
||||
{
|
||||
////////////////////////////////////////////////////////
|
||||
// Double Store should take two fields for Naik and one hop separately.
|
||||
////////////////////////////////////////////////////////
|
||||
Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Apply scale factors to get the right fermion Kinetic term
|
||||
// Could pass coeffs into the double store to save work.
|
||||
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
|
||||
////////////////////////////////////////////////////////
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
auto U = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
|
||||
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu+4);
|
||||
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
|
||||
|
||||
U = PeekIndex<LorentzIndex>(UUUmu, mu);
|
||||
PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu );
|
||||
|
||||
U = PeekIndex<LorentzIndex>(UUUmu, mu+4);
|
||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||
}
|
||||
|
||||
CopyGaugeCheckerboards();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
|
||||
{
|
||||
int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
|
||||
// we drop off the innermost fifth dimension
|
||||
|
||||
Compressor compressor;
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
auto Umu_v = Umu.View();
|
||||
auto UUUmu_v = UUUmu.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
thread_loop( (int ss=0;ss<Umu.Grid()->oSites();ss++),{
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sU=ss;
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sF, sU, in_v, out_v, dir, disp);
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
DoubledGaugeField & UUU,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
// No force terms in multi-rhs solver staggered
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
/*CHANGE */
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
Compressor compressor;
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
int len = U.Grid()->oSites();
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
double ctime=0;
|
||||
double ptime=0;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
double start = usecond();
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = U.Grid()->oSites(); // 4d vol
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
|
||||
// do the compute
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
// Interior = 1; Exterior = 0; must implement for staggered
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
// Interior = 1; Exterior = 0;
|
||||
int sU = ss;
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
|
||||
}
|
||||
}
|
||||
ptime = usecond() - start;
|
||||
} else {
|
||||
double start = usecond();
|
||||
st.CommunicateThreaded();
|
||||
ctime = usecond() - start;
|
||||
}
|
||||
}
|
||||
DhopCommTime += ctime;
|
||||
DhopComputeTime+=ptime;
|
||||
|
||||
// First to enter, last to leave timing
|
||||
st.CollateThreads();
|
||||
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
DhopComputeTime2-=usecond();
|
||||
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
|
||||
});
|
||||
} else {
|
||||
int sz=st.surface_list.size();
|
||||
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||
int sU = st.surface_list[ss];
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
|
||||
});
|
||||
}
|
||||
DhopComputeTime2+=usecond();
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
Compressor compressor;
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
|
||||
|
||||
|
||||
//double t1=usecond();
|
||||
DhopTotalTime -= usecond();
|
||||
DhopCommTime -= usecond();
|
||||
st.HaloExchange(in,compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
DhopComputeTime -= usecond();
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
if (dag == DaggerYes) {
|
||||
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++), {
|
||||
int sU=ss;
|
||||
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
|
||||
int sU=ss;
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
|
||||
});
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
//double t2=usecond();
|
||||
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
|
||||
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
|
||||
|
||||
}
|
||||
/*CHANGE END*/
|
||||
|
||||
/* ORG
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
Compressor compressor;
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
|
||||
DhopTotalTime -= usecond();
|
||||
DhopCommTime -= usecond();
|
||||
st.HaloExchange(in,compressor);
|
||||
DhopCommTime += usecond();
|
||||
|
||||
DhopComputeTime -= usecond();
|
||||
auto U_v = U.View();
|
||||
auto UUU_v = UUU.View();
|
||||
auto out_v = out.View();
|
||||
auto in_v = in.View();
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
if (dag == DaggerYes) {
|
||||
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++), {
|
||||
int sU=ss;
|
||||
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (int ss = 0; ss < U.Grid()->oSites(); ss++) ,{
|
||||
int sU=ss;
|
||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
|
||||
});
|
||||
}
|
||||
DhopComputeTime += usecond();
|
||||
DhopTotalTime += usecond();
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
|
||||
conformable(in.Grid(),out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard()==Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopCalls+=1;
|
||||
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
|
||||
conformable(in.Grid(),out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard()==Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopCalls+=2;
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Report(void)
|
||||
{
|
||||
Coordinate latt = GridDefaultLatt();
|
||||
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||
RealD NP = _FourDimGrid->_Nprocessors;
|
||||
RealD NN = _FourDimGrid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
|
||||
<< DhopCalls << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
|
||||
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
|
||||
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
|
||||
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||
|
||||
// Average the compute time
|
||||
_FourDimGrid->GlobalSum(DhopComputeTime);
|
||||
DhopComputeTime/=NP;
|
||||
|
||||
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||
|
||||
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
|
||||
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
||||
{
|
||||
DhopCalls = 0;
|
||||
DhopTotalTime = 0;
|
||||
DhopCommTime = 0;
|
||||
DhopComputeTime = 0;
|
||||
DhopFaceTime = 0;
|
||||
|
||||
|
||||
Stencil.ZeroCounters();
|
||||
StencilEven.ZeroCounters();
|
||||
StencilOdd.ZeroCounters();
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Implement the general interface. Here we use SAME mass on all slices
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
|
||||
DhopDir(in, out, dir, disp);
|
||||
}
|
||||
template <class Impl>
|
||||
RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerNo);
|
||||
return axpy_norm(out, mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerYes);
|
||||
return axpy_norm(out, mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(mass);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0 / (mass)) * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
|
||||
FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in, out);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Conserved current - not yet implemented.
|
||||
////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax,
|
||||
ComplexField &lattice_cmplx)
|
||||
{
|
||||
assert(0);
|
||||
|
||||
}
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
|
||||
FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
|
@ -1,497 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid_Eigen_Dense.h>
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
|
||||
GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mq1, RealD _mq2, RealD _mq3,
|
||||
RealD _shift, int _pm, RealD _M5,
|
||||
RealD _b, RealD _c, const ImplParams &p) :
|
||||
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
|
||||
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
|
||||
_shift, _pm, _M5, _b, _c, p)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
RealD eps = 1.0;
|
||||
Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
|
||||
assert(zdata->n == this->Ls);
|
||||
|
||||
std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
|
||||
",c=" << _c << ") with Ls=" << Ls << std::endl;
|
||||
this->SetCoefficientsTanh(zdata, _b, _c);
|
||||
std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
|
||||
",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
|
||||
",pm=" << _pm << ")" << std::endl;
|
||||
|
||||
Approx::zolotarev_free(zdata);
|
||||
|
||||
if(_shift != 0.0){
|
||||
SetCoefficientsPrecondShiftOps();
|
||||
} else {
|
||||
Mooee_shift.resize(Ls, 0.0);
|
||||
MooeeInv_shift_lc.resize(Ls, 0.0);
|
||||
MooeeInv_shift_norm.resize(Ls, 0.0);
|
||||
MooeeInvDag_shift_lc.resize(Ls, 0.0);
|
||||
MooeeInvDag_shift_norm.resize(Ls, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************
|
||||
* Additional EOFA operators only called outside the inverter.
|
||||
* Since speed is not essential, simple axpby-style
|
||||
* implementations should be fine.
|
||||
***************************************************************/
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
RealD alpha = this->alpha;
|
||||
|
||||
Din = Zero();
|
||||
if((sign == 1) && (dag == 0)) { // \Omega_{+}
|
||||
for(int s=0; s<Ls; ++s){
|
||||
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
|
||||
}
|
||||
} else if((sign == -1) && (dag == 0)) { // \Omega_{-}
|
||||
for(int s=0; s<Ls; ++s){
|
||||
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
|
||||
}
|
||||
} else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
|
||||
for(int sp=0; sp<Ls; ++sp){
|
||||
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
|
||||
}
|
||||
} else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
|
||||
for(int sp=0; sp<Ls; ++sp){
|
||||
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
|
||||
// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
RealD b = 0.5 * ( 1.0 + this->alpha );
|
||||
RealD c = 0.5 * ( 1.0 - this->alpha );
|
||||
RealD mq1 = this->mq1;
|
||||
|
||||
for(int s=0; s<Ls; ++s){
|
||||
if(s == 0) {
|
||||
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
|
||||
axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
|
||||
} else if(s == (Ls-1)) {
|
||||
axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
|
||||
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
|
||||
} else {
|
||||
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
|
||||
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
RealD m = this->mq1;
|
||||
RealD c = 0.5 * this->alpha;
|
||||
RealD d = 0.5;
|
||||
|
||||
RealD DtInv_p(0.0), DtInv_m(0.0);
|
||||
RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
|
||||
FermionField tmp(this->FermionGrid());
|
||||
|
||||
for(int s=0; s<Ls; ++s){
|
||||
for(int sp=0; sp<Ls; ++sp){
|
||||
|
||||
DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
|
||||
DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
|
||||
DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
|
||||
DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
|
||||
|
||||
if(sp == 0){
|
||||
axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
|
||||
axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
|
||||
} else {
|
||||
axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
|
||||
axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
|
||||
}
|
||||
|
||||
}}
|
||||
}
|
||||
|
||||
/*****************************************************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->Meooe5D(psi, Din);
|
||||
this->DW(Din, chi, DaggerNo);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
this->M5D(psi, chi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->DW(psi, Din, DaggerYes);
|
||||
this->MeooeDag5D(Din, chi);
|
||||
this->M5Ddag(psi, chi);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
/********************************************************************
|
||||
* Performance critical fermion operators called inside the inverter
|
||||
********************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
// half checkerboard operations
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
// coefficients of Mooee
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
lower[s] = -this->cee[s];
|
||||
}
|
||||
upper[Ls-1] *= -this->mq1;
|
||||
lower[0] *= -this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
// coefficients of MooeeDag
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(s==0) {
|
||||
upper[s] = -this->cee[s+1];
|
||||
lower[s] = this->mq1*this->cee[Ls-1];
|
||||
} else if(s==(Ls-1)) {
|
||||
upper[s] = this->mq1*this->cee[0];
|
||||
lower[s] = -this->cee[s-1];
|
||||
} else {
|
||||
upper[s] = -this->cee[s+1];
|
||||
lower[s] = -this->cee[s-1];
|
||||
}
|
||||
}
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
/****************************************************************************************/
|
||||
|
||||
// Computes coefficients for applying Cayley preconditioned shift operators
|
||||
// (Mooee + \Delta) --> Mooee_shift
|
||||
// (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
|
||||
// (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
|
||||
// For the latter two cases, the operation takes the form
|
||||
// [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
|
||||
// ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD alpha = this->alpha;
|
||||
RealD k = this->k;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD shift = this->shift;
|
||||
|
||||
// Initialize
|
||||
Mooee_shift.resize(Ls);
|
||||
MooeeInv_shift_lc.resize(Ls);
|
||||
MooeeInv_shift_norm.resize(Ls);
|
||||
MooeeInvDag_shift_lc.resize(Ls);
|
||||
MooeeInvDag_shift_norm.resize(Ls);
|
||||
|
||||
// Construct Mooee_shift
|
||||
int idx(0);
|
||||
Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
|
||||
( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
|
||||
for(int s=0; s<Ls; ++s){
|
||||
idx = (pm == 1) ? (s) : (Ls-1-s);
|
||||
Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
|
||||
}
|
||||
|
||||
// Tridiagonal solve for MooeeInvDag_shift_lc
|
||||
{
|
||||
Coeff_t m(0.0);
|
||||
Vector<Coeff_t> d = Mooee_shift;
|
||||
Vector<Coeff_t> u(Ls,0.0);
|
||||
Vector<Coeff_t> y(Ls,0.0);
|
||||
Vector<Coeff_t> q(Ls,0.0);
|
||||
if(pm == 1){ u[0] = 1.0; }
|
||||
else{ u[Ls-1] = 1.0; }
|
||||
|
||||
// Tridiagonal matrix algorithm + Sherman-Morrison formula
|
||||
//
|
||||
// We solve
|
||||
// ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
|
||||
// where Mooee' is the tridiagonal part of Mooee_{+}, and
|
||||
// u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
|
||||
// so that the outer-product u \otimes v gives the (0,Ls-1)
|
||||
// entry of Mooee_{+}.
|
||||
//
|
||||
// We do this as two solves: Mooee'*y = d and Mooee'*q = u,
|
||||
// and then construct the solution to the original system
|
||||
// MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
|
||||
if(pm == 1){
|
||||
for(int s=1; s<Ls; ++s){
|
||||
m = -this->cee[s] / this->bee[s-1];
|
||||
d[s] -= m*d[s-1];
|
||||
u[s] -= m*u[s-1];
|
||||
}
|
||||
}
|
||||
y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
|
||||
q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
|
||||
for(int s=Ls-2; s>=0; --s){
|
||||
if(pm == 1){
|
||||
y[s] = d[s] / this->bee[s];
|
||||
q[s] = u[s] / this->bee[s];
|
||||
} else {
|
||||
y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
|
||||
q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
|
||||
}
|
||||
}
|
||||
|
||||
// Construct MooeeInvDag_shift_lc
|
||||
for(int s=0; s<Ls; ++s){
|
||||
if(pm == 1){
|
||||
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
|
||||
(1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
|
||||
} else {
|
||||
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
|
||||
(1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
|
||||
}
|
||||
}
|
||||
|
||||
// Compute remaining coefficients
|
||||
N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
|
||||
for(int s=0; s<Ls; ++s){
|
||||
|
||||
// MooeeInv_shift_lc
|
||||
if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s) * pow(this->cee[s],Ls-1-s); }
|
||||
else { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
|
||||
|
||||
// MooeeInv_shift_norm
|
||||
MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
|
||||
( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
|
||||
|
||||
// MooeeInvDag_shift_norm
|
||||
if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
|
||||
( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
|
||||
else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
|
||||
( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recompute coefficients for a different value of shift constant
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
|
||||
{
|
||||
this->shift = new_shift;
|
||||
if(new_shift != 0.0){
|
||||
SetCoefficientsPrecondShiftOps();
|
||||
} else {
|
||||
int Ls = this->Ls;
|
||||
Mooee_shift.resize(Ls,0.0);
|
||||
MooeeInv_shift_lc.resize(Ls,0.0);
|
||||
MooeeInv_shift_norm.resize(Ls,0.0);
|
||||
MooeeInvDag_shift_lc.resize(Ls,0.0);
|
||||
MooeeInvDag_shift_norm.resize(Ls,0.0);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
|
||||
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
GridBase* grid = this->FermionRedBlackGrid();
|
||||
int LLs = grid->_rdimensions[0];
|
||||
|
||||
if(LLs == Ls){ return; } // Not vectorised in 5th direction
|
||||
|
||||
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
Pplus(s,s) = this->bee[s];
|
||||
Pminus(s,s) = this->bee[s];
|
||||
}
|
||||
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
Pminus(s,s+1) = -this->cee[s];
|
||||
Pplus(s+1,s) = -this->cee[s+1];
|
||||
}
|
||||
|
||||
Pplus (0,Ls-1) = this->mq1*this->cee[0];
|
||||
Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
|
||||
|
||||
if(this->shift != 0.0){
|
||||
RealD c = 0.5 * this->alpha;
|
||||
RealD d = 0.5;
|
||||
RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
|
||||
if(this->pm == 1) {
|
||||
for(int s=0; s<Ls; ++s){
|
||||
Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
|
||||
}
|
||||
} else {
|
||||
for(int s=0; s<Ls; ++s){
|
||||
Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Eigen::MatrixXcd PplusMat ;
|
||||
Eigen::MatrixXcd PminusMat;
|
||||
|
||||
if(inv) {
|
||||
PplusMat = Pplus.inverse();
|
||||
PminusMat = Pminus.inverse();
|
||||
} else {
|
||||
PplusMat = Pplus;
|
||||
PminusMat = Pminus;
|
||||
}
|
||||
|
||||
if(dag){
|
||||
PplusMat.adjointInPlace();
|
||||
PminusMat.adjointInPlace();
|
||||
}
|
||||
|
||||
typedef typename SiteHalfSpinor::scalar_type scalar_type;
|
||||
const int Nsimd = Simd::Nsimd();
|
||||
Matp.resize(Ls*LLs);
|
||||
Matm.resize(Ls*LLs);
|
||||
|
||||
for(int s2=0; s2<Ls; s2++){
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
int istride = LLs;
|
||||
int ostride = 1;
|
||||
Simd Vp;
|
||||
Simd Vm;
|
||||
scalar_type *sp = (scalar_type*) &Vp;
|
||||
scalar_type *sm = (scalar_type*) &Vm;
|
||||
for(int l=0; l<Nsimd; l++){
|
||||
if(switcheroo<Coeff_t>::iscomplex()) {
|
||||
sp[l] = PplusMat (l*istride+s1*ostride,s2);
|
||||
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
||||
} else {
|
||||
// if real
|
||||
scalar_type tmp;
|
||||
tmp = PplusMat (l*istride+s1*ostride,s2);
|
||||
sp[l] = scalar_type(tmp.real(),tmp.real());
|
||||
tmp = PminusMat(l*istride+s1*ostride,s2);
|
||||
sm[l] = scalar_type(tmp.real(),tmp.real());
|
||||
}
|
||||
}
|
||||
Matp[LLs*s2+s1] = Vp;
|
||||
Matm[LLs*s2+s1] = Vm;
|
||||
}}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(MobiusEOFAFermion);
|
||||
GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,998 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*
|
||||
* Dense matrix versions of routines
|
||||
*/
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
assert(Nc == 3);
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5m(hp, psi[ss+vp]);
|
||||
spProj5p(hm, psi[ss+vm]);
|
||||
|
||||
if (vp <= v){ rotate(hp, hp, 1); }
|
||||
if (vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = 0.5*hp;
|
||||
hm = 0.5*hm;
|
||||
|
||||
spRecon5m(fp, hp);
|
||||
spRecon5p(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] + u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] + l[v]*fm;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if(vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
|
||||
#endif
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
|
||||
FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs)
|
||||
{
|
||||
#if 0
|
||||
auto & psi = psi_i;
|
||||
auto & phi = phi_i;
|
||||
auto & chi = chi_i;
|
||||
|
||||
this->M5D(psi, phi, chi, lower, diag, upper);
|
||||
|
||||
// FIXME: possible gain from vectorizing shift operation as well?
|
||||
Coeff_t one(1.0);
|
||||
int Ls = this->Ls;
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
|
||||
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
|
||||
}
|
||||
|
||||
#else
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
Vector<iSinglet<Simd>> s(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
scalar_type* s_p = (scalar_type*) &s[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
s_p[ss] = shift_coeffs[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
assert(Nc == 3);
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
|
||||
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
|
||||
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
|
||||
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
|
||||
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
|
||||
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if(vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == 1 && vs <= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == -1 && vs >= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
|
||||
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
|
||||
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
|
||||
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
|
||||
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
|
||||
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
|
||||
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5p(hp, psi[ss+vp]);
|
||||
spProj5m(hm, psi[ss+vm]);
|
||||
|
||||
if(vp <= v){ rotate(hp, hp, 1); }
|
||||
if(vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = hp*0.5;
|
||||
hm = hm*0.5;
|
||||
spRecon5p(fp, hp);
|
||||
spRecon5m(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if (vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs)
|
||||
{
|
||||
#if 0
|
||||
auto & psi = psi_i;
|
||||
auto & phi = phi_i;
|
||||
auto & chi = chi_i;
|
||||
this->M5Ddag(psi, phi, chi, lower, diag, upper);
|
||||
|
||||
// FIXME: possible gain from vectorizing shift operation as well?
|
||||
Coeff_t one(1.0);
|
||||
int Ls = this->Ls;
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
|
||||
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
|
||||
}
|
||||
|
||||
#else
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
Vector<iSinglet<Simd>> s(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
scalar_type* s_p = (scalar_type*) &s[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
s_p[ss] = shift_coeffs[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
|
||||
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
|
||||
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
|
||||
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
|
||||
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
|
||||
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if (vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == 1 && vs <= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == -1 && vs >= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
|
||||
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
|
||||
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
|
||||
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
|
||||
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
|
||||
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef AVX512
|
||||
#include<simd/Intel512common.h>
|
||||
#include<simd/Intel512avx.h>
|
||||
#include<simd/Intel512single.h>
|
||||
#endif
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
|
||||
|
||||
int s = s2 + l*LLs;
|
||||
int lex = s2 + LLs*site;
|
||||
|
||||
if( s2==0 && l==0 ){
|
||||
SiteChiP=Zero();
|
||||
SiteChiM=Zero();
|
||||
}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
|
||||
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
|
||||
}}
|
||||
}}
|
||||
|
||||
{
|
||||
int lex = s1 + LLs*site;
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
{
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %%zmm1
|
||||
#define Chi_01 %%zmm2
|
||||
#define Chi_02 %%zmm3
|
||||
#define Chi_10 %%zmm4
|
||||
#define Chi_11 %%zmm5
|
||||
#define Chi_12 %%zmm6
|
||||
#define Chi_20 %%zmm7
|
||||
#define Chi_21 %%zmm8
|
||||
#define Chi_22 %%zmm9
|
||||
#define Chi_30 %%zmm10
|
||||
#define Chi_31 %%zmm11
|
||||
#define Chi_32 %%zmm12
|
||||
|
||||
#define BCAST0 %%zmm13
|
||||
#define BCAST1 %%zmm14
|
||||
#define BCAST2 %%zmm15
|
||||
#define BCAST3 %%zmm16
|
||||
#define BCAST4 %%zmm17
|
||||
#define BCAST5 %%zmm18
|
||||
#define BCAST6 %%zmm19
|
||||
#define BCAST7 %%zmm20
|
||||
#define BCAST8 %%zmm21
|
||||
#define BCAST9 %%zmm22
|
||||
#define BCAST10 %%zmm23
|
||||
#define BCAST11 %%zmm24
|
||||
|
||||
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
|
||||
int lex = s2 + LLs*site;
|
||||
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t) &psi[lex];
|
||||
|
||||
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
|
||||
|
||||
if((s2+l)==0) {
|
||||
asm(
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0)
|
||||
VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2)
|
||||
VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
|
||||
VMULMEM(0,%1,BCAST8,Chi_22)
|
||||
VMULMEM(0,%1,BCAST9,Chi_30)
|
||||
VMULMEM(0,%1,BCAST10,Chi_31)
|
||||
VMULMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
} else {
|
||||
asm(
|
||||
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
}
|
||||
|
||||
a0 = a0 + incr;
|
||||
a1 = a1 + incr;
|
||||
a2 = a2 + sizeof(typename Simd::scalar_type);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
asm (
|
||||
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
// Z-mobius version
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
|
||||
exit(-1);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||
{
|
||||
chi.Checkerboard() = psi.Checkerboard();
|
||||
|
||||
int Ls = this->Ls;
|
||||
int LLs = psi.Grid()->_rdimensions[0];
|
||||
int vol = psi.Grid()->oSites()/LLs;
|
||||
|
||||
Vector<iSinglet<Simd>> Matp;
|
||||
Vector<iSinglet<Simd>> Matm;
|
||||
Vector<iSinglet<Simd>>* _Matp;
|
||||
Vector<iSinglet<Simd>>* _Matm;
|
||||
|
||||
// MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||
if(inv && dag){
|
||||
_Matp = &this->MatpInvDag;
|
||||
_Matm = &this->MatmInvDag;
|
||||
}
|
||||
|
||||
if(inv && (!dag)){
|
||||
_Matp = &this->MatpInv;
|
||||
_Matm = &this->MatmInv;
|
||||
}
|
||||
|
||||
if(!inv){
|
||||
MooeeInternalCompute(dag, inv, Matp, Matm);
|
||||
_Matp = &Matp;
|
||||
_Matm = &Matm;
|
||||
}
|
||||
|
||||
assert(_Matp->size() == Ls*LLs);
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
if(switcheroo<Coeff_t>::iscomplex()){
|
||||
thread_loop( (auto site=0; site<vol; site++),{
|
||||
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (auto site=0; site<vol; site++),{
|
||||
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
}
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
#ifdef MOBIUS_EOFA_DPERP_VEC
|
||||
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
|
||||
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
|
||||
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,452 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
|
||||
// this does both dag and undag but is trivial; make a common helper routing
|
||||
int Ls = this->Ls;
|
||||
|
||||
this->DhopDir(psi,chi,dir,disp);
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s);
|
||||
ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1);
|
||||
}
|
||||
ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
|
||||
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::Meooe_internal(const FermionField &psi, FermionField &chi,int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
if ( psi.Checkerboard() == Odd ) {
|
||||
this->DhopEO(psi,chi,DaggerNo);
|
||||
} else {
|
||||
this->DhopOE(psi,chi,DaggerNo);
|
||||
}
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
ag5xpby_ssp(chi,-scale,chi,0.0,chi,s,s);
|
||||
ag5xpby_ssp(chi, scale,chi,0.0,chi,s+1,s+1);
|
||||
}
|
||||
ag5xpby_ssp(chi,p[nblock]*scale/amax,chi,0.0,chi,Ls-1,Ls-1);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::Mooee_internal(const FermionField &psi, FermionField &chi,int dag)
|
||||
{
|
||||
// again dag and undag are trivially related
|
||||
int sign = dag ? (-1) : 1;
|
||||
int Ls = this->Ls;
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
|
||||
int s = 2*b;
|
||||
RealD pp = p[nblock-1-b];
|
||||
RealD qq = q[nblock-1-b];
|
||||
|
||||
// Do each 2x2 block aligned at s and multiplies Dw site diagonal by G5 so Hw
|
||||
ag5xpby_ssp(chi,-dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s ,s+1);
|
||||
ag5xpby_ssp(chi, dw_diag*scale,psi,amax*sqrt(qq)*scale,psi, s+1,s);
|
||||
axpby_ssp (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
|
||||
}
|
||||
|
||||
{
|
||||
RealD R=(1+mass)/(1-mass);
|
||||
//R g5 psi[Ls-1] + p[0] H
|
||||
ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale*dw_diag/amax,psi,Ls-1,Ls-1);
|
||||
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b+1;
|
||||
RealD pp = p[nblock-1-b];
|
||||
axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MooeeInv_internal(const FermionField &psi, FermionField &chi,int dag)
|
||||
{
|
||||
int sign = dag ? (-1) : 1;
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField tmp(psi.Grid());
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
//Linv
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
int nblock=(Ls-1)/2;
|
||||
|
||||
axpy(chi,0.0,psi,psi); // Identity piece
|
||||
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
RealD pp = p[nblock-1-b];
|
||||
RealD qq = q[nblock-1-b];
|
||||
RealD coeff1=sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
|
||||
RealD coeff2=sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
|
||||
axpby_ssp (chi,1.0,chi,coeff1,psi,Ls-1,s);
|
||||
axpbg5y_ssp(chi,1.0,chi,coeff2,psi,Ls-1,s+1);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
//Dinv (note D isn't really diagonal -- just diagonal enough that we can still invert)
|
||||
// Compute Seeinv (coeff of gamma5)
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
RealD R=(1+mass)/(1-mass);
|
||||
RealD Seeinv = R + p[nblock]*dw_diag/amax;
|
||||
for(int b=0;b<nblock;b++){
|
||||
Seeinv += p[nblock-1-b]*dw_diag/amax / ( dw_diag*dw_diag/amax/amax + q[nblock-1-b]);
|
||||
}
|
||||
Seeinv = 1.0/Seeinv;
|
||||
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
RealD pp = p[nblock-1-b];
|
||||
RealD qq = q[nblock-1-b];
|
||||
RealD coeff1=dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
|
||||
RealD coeff2=amax*sqrt(qq) / ( dw_diag*dw_diag + amax*amax* qq);
|
||||
ag5xpby_ssp (tmp,-coeff1,chi,coeff2,chi,s,s+1);
|
||||
ag5xpby_ssp (tmp, coeff1,chi,coeff2,chi,s+1,s);
|
||||
}
|
||||
ag5xpby_ssp (tmp, Seeinv,chi,0.0,chi,Ls-1,Ls-1);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Uinv
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
RealD pp = p[nblock-1-b];
|
||||
RealD qq = q[nblock-1-b];
|
||||
RealD coeff1=-sign*sqrt(amax*amax*amax*pp*qq) / ( dw_diag*dw_diag + amax*amax* qq);
|
||||
RealD coeff2=-sign*sqrt(amax*pp)*dw_diag / ( dw_diag*dw_diag + amax*amax* qq); // Implicit g5 here
|
||||
axpby_ssp (chi,1.0/scale,tmp,coeff1/scale,tmp,s,Ls-1);
|
||||
axpbg5y_ssp(chi,1.0/scale,tmp,coeff2/scale,tmp,s+1,Ls-1);
|
||||
}
|
||||
axpby_ssp (chi, 1.0/scale,tmp,0.0,tmp,Ls-1,Ls-1);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, FermionField &chi,int dag)
|
||||
{
|
||||
FermionField D(psi.Grid());
|
||||
|
||||
int Ls = this->Ls;
|
||||
int sign = dag ? (-1) : 1;
|
||||
|
||||
// For partial frac Hw case (b5=c5=1) chroma quirkily computes
|
||||
//
|
||||
// Conventions for partfrac appear to be a mess.
|
||||
// Tony's Nara lectures have
|
||||
//
|
||||
// BlockDiag( H/p_i 1 | 1 )
|
||||
// ( 1 p_i H / q_i^2 | 0 )
|
||||
// ---------------------------------
|
||||
// ( -1 0 | R +p0 H )
|
||||
//
|
||||
//Chroma ( -2H 2sqrt(q_i) | 0 )
|
||||
// (2 sqrt(q_i) 2H | 2 sqrt(p_i) )
|
||||
// ---------------------------------
|
||||
// ( 0 -2 sqrt(p_i) | 2 R gamma_5 + p0 2H
|
||||
//
|
||||
// Edwards/Joo/Kennedy/Wenger
|
||||
//
|
||||
// Here, the "beta's" selected by chroma to scale the unphysical bulk constraint fields
|
||||
// incorporate the approx scale factor. This is obtained by propagating the
|
||||
// scale on "H" out to the off diagonal elements as follows:
|
||||
//
|
||||
// BlockDiag( H/p_i 1 | 1 )
|
||||
// ( 1 p_i H / q_i^2 | 0 )
|
||||
// ---------------------------------
|
||||
// ( -1 0 | R + p_0 H )
|
||||
//
|
||||
// becomes:
|
||||
// BlockDiag( H/ sp_i 1 | 1 )
|
||||
// ( 1 sp_i H / s^2q_i^2 | 0 )
|
||||
// ---------------------------------
|
||||
// ( -1 0 | R + p_0/s H )
|
||||
//
|
||||
//
|
||||
// This is implemented in Chroma by
|
||||
// p0' = p0/approxMax
|
||||
// p_i' = p_i*approxMax
|
||||
// q_i' = q_i*approxMax*approxMax
|
||||
//
|
||||
// After the equivalence transform is applied the matrix becomes
|
||||
//
|
||||
//Chroma ( -2H sqrt(q'_i) | 0 )
|
||||
// (sqrt(q'_i) 2H | sqrt(p'_i) )
|
||||
// ---------------------------------
|
||||
// ( 0 -sqrt(p'_i) | 2 R gamma_5 + p'0 2H
|
||||
//
|
||||
// = ( -2H sqrt(q_i)amax | 0 )
|
||||
// (sqrt(q_i)amax 2H | sqrt(p_i*amax) )
|
||||
// ---------------------------------
|
||||
// ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H
|
||||
//
|
||||
|
||||
this->DW(psi,D,DaggerNo);
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
|
||||
int s = 2*b;
|
||||
double pp = p[nblock-1-b];
|
||||
double qq = q[nblock-1-b];
|
||||
|
||||
// Do each 2x2 block aligned at s and
|
||||
ag5xpby_ssp(chi,-1.0*scale,D,amax*sqrt(qq)*scale,psi, s ,s+1); // Multiplies Dw by G5 so Hw
|
||||
ag5xpby_ssp(chi, 1.0*scale,D,amax*sqrt(qq)*scale,psi, s+1,s);
|
||||
|
||||
// Pick up last column
|
||||
axpby_ssp (chi, 1.0, chi,sqrt(amax*pp)*scale*sign,psi,s+1,Ls-1);
|
||||
}
|
||||
|
||||
{
|
||||
double R=(1+this->mass)/(1-this->mass);
|
||||
//R g5 psi[Ls] + p[0] H
|
||||
ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
|
||||
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b+1;
|
||||
double pp = p[nblock-1-b];
|
||||
axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
RealD PartialFractionFermion5D<Impl>::M (const FermionField &in, FermionField &out)
|
||||
{
|
||||
M_internal(in,out,DaggerNo);
|
||||
return norm2(out);
|
||||
}
|
||||
template<class Impl>
|
||||
RealD PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
|
||||
{
|
||||
M_internal(in,out,DaggerYes);
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::Meooe (const FermionField &in, FermionField &out)
|
||||
{
|
||||
Meooe_internal(in,out,DaggerNo);
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MeooeDag (const FermionField &in, FermionField &out)
|
||||
{
|
||||
Meooe_internal(in,out,DaggerYes);
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::Mooee (const FermionField &in, FermionField &out)
|
||||
{
|
||||
Mooee_internal(in,out,DaggerNo);
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MooeeDag (const FermionField &in, FermionField &out)
|
||||
{
|
||||
Mooee_internal(in,out,DaggerYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MooeeInv (const FermionField &in, FermionField &out)
|
||||
{
|
||||
MooeeInv_internal(in,out,DaggerNo);
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MooeeInvDag (const FermionField &in, FermionField &out)
|
||||
{
|
||||
MooeeInv_internal(in,out,DaggerYes);
|
||||
}
|
||||
|
||||
|
||||
// force terms; five routines; default to Dhop on diagonal
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField D(V.Grid());
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
ag5xpby_ssp(D,-scale,U,0.0,U,s,s);
|
||||
ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
|
||||
}
|
||||
ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
|
||||
|
||||
this->DhopDeriv(mat,D,V,DaggerNo);
|
||||
};
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField D(V.Grid());
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
ag5xpby_ssp(D,-scale,U,0.0,U,s,s);
|
||||
ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
|
||||
}
|
||||
ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
|
||||
|
||||
this->DhopDerivOE(mat,D,V,DaggerNo);
|
||||
};
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
FermionField D(V.Grid());
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b;
|
||||
ag5xpby_ssp(D,-scale,U,0.0,U,s,s);
|
||||
ag5xpby_ssp(D, scale,U,0.0,U,s+1,s+1);
|
||||
}
|
||||
ag5xpby_ssp(D,p[nblock]*scale/amax,U,0.0,U,Ls-1,Ls-1);
|
||||
|
||||
this->DhopDerivEO(mat,D,V,DaggerNo);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale){
|
||||
SetCoefficientsZolotarev(1.0/scale,zdata);
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata){
|
||||
|
||||
// check on degree matching
|
||||
// std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
|
||||
int Ls = this->Ls;
|
||||
|
||||
assert(Ls == (2*zdata->da -1) );
|
||||
|
||||
// Part frac
|
||||
// RealD R;
|
||||
R=(1+mass)/(1-mass);
|
||||
dw_diag = (4.0-this->M5);
|
||||
|
||||
// std::vector<RealD> p;
|
||||
// std::vector<RealD> q;
|
||||
p.resize(zdata->da);
|
||||
q.resize(zdata->dd);
|
||||
|
||||
for(int n=0;n<zdata->da;n++){
|
||||
p[n] = zdata -> alpha[n];
|
||||
}
|
||||
for(int n=0;n<zdata->dd;n++){
|
||||
q[n] = -zdata -> ap[n];
|
||||
}
|
||||
|
||||
scale= part_frac_chroma_convention ? 2.0 : 1.0; // Chroma conventions annoy me
|
||||
|
||||
amax=zolo_hi;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
conformable(imported5d.Grid(),this->FermionGrid());
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp=Zero();
|
||||
InsertSlice(input4d, tmp, Ls-1, Ls-1);
|
||||
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
|
||||
this->Dminus(tmp,imported5d);
|
||||
}
|
||||
|
||||
// Constructors
|
||||
template<class Impl>
|
||||
PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,
|
||||
const ImplParams &p) :
|
||||
WilsonFermion5D<Impl>(_Umu,
|
||||
FiveDimGrid, FiveDimRedBlackGrid,
|
||||
FourDimGrid, FourDimRedBlackGrid,M5,p),
|
||||
mass(_mass)
|
||||
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
assert((Ls&0x1)==1); // Odd Ls required
|
||||
int nrational=Ls-1;
|
||||
|
||||
|
||||
Approx::zolotarev_data *zdata = Approx::higham(1.0,nrational);
|
||||
|
||||
// NB: chroma uses a cast to "float" for the zolotarev range(!?).
|
||||
// this creates a real difference in the operator which I do not like but we can replicate here
|
||||
// to demonstrate compatibility
|
||||
// RealD eps = (zolo_lo / zolo_hi);
|
||||
// zdata = bfm_zolotarev(eps,nrational,0);
|
||||
|
||||
SetCoefficientsTanh(zdata,1.0);
|
||||
|
||||
Approx::zolotarev_free(zdata);
|
||||
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(PartialFractionFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -1,294 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
||||
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
|
||||
|
||||
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
|
||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||
if (SE->_is_local ) { \
|
||||
if (SE->_permute) { \
|
||||
chi_p = χ \
|
||||
permute(chi, in[SE->_offset], ptype); \
|
||||
} else { \
|
||||
chi_p = &in[SE->_offset]; \
|
||||
} \
|
||||
} else { \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
} \
|
||||
multLink(Uchi, U[sU], *chi_p, Dir);
|
||||
|
||||
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
|
||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||
if (SE->_is_local ) { \
|
||||
if (SE->_permute) { \
|
||||
chi_p = χ \
|
||||
permute(chi, in[SE->_offset], ptype); \
|
||||
} else { \
|
||||
chi_p = &in[SE->_offset]; \
|
||||
} \
|
||||
} else if ( st.same_node[Dir] ) { \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
} \
|
||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||
multLink(Uchi, U[sU], *chi_p, Dir); \
|
||||
}
|
||||
|
||||
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
|
||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||
nmu++; \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
multLink(Uchi, U[sU], *chi_p, Dir); \
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Generic implementation; move to different file?
|
||||
// Int, Ext, Int+Ext cases for comms overlap
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
SiteSpinor chi;
|
||||
SiteSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
skew = 0;
|
||||
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
if ( dag ) {
|
||||
Uchi = - Uchi;
|
||||
}
|
||||
vstream(out[sF], Uchi);
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Only contributions from interior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
SiteSpinor chi;
|
||||
SiteSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int skew ;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
skew = 0;
|
||||
Uchi=Zero();
|
||||
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
if ( dag ) {
|
||||
Uchi = - Uchi;
|
||||
}
|
||||
vstream(out[sF], Uchi);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Only contributions from exterior of our node
|
||||
///////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||
const SiteSpinor *chi_p;
|
||||
// SiteSpinor chi;
|
||||
SiteSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
int skew ;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=LLs*sU+s;
|
||||
skew = 0;
|
||||
Uchi=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||
skew=8;
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||
|
||||
if ( nmu ) {
|
||||
if ( dag ) {
|
||||
out[sF] = out[sF] - Uchi;
|
||||
} else {
|
||||
out[sF] = out[sF] + Uchi;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Driving / wrapping routine to select right kernel
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
int dag=1;
|
||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
int dag=0;
|
||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,
|
||||
int dag,int interior,int exterior)
|
||||
{
|
||||
switch(Opt) {
|
||||
#ifdef AVX512
|
||||
case OptInlineAsm:
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else {
|
||||
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
|
||||
assert(0);
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
case OptHandUnroll:
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( interior ) {
|
||||
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( exterior ) {
|
||||
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
}
|
||||
break;
|
||||
case OptGeneric:
|
||||
if ( interior && exterior ) {
|
||||
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( interior ) {
|
||||
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
} else if ( exterior ) {
|
||||
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
|
||||
{
|
||||
// Disp should be either +1,-1,+3,-3
|
||||
// What about "dag" ?
|
||||
// Because we work out pU . dS/dU
|
||||
// U
|
||||
assert(0);
|
||||
}
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
|
||||
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@ -1,972 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
|
||||
#ifdef AVX512
|
||||
#include <simd/Intel512common.h>
|
||||
#include <simd/Intel512avx.h>
|
||||
#endif
|
||||
|
||||
// Interleave operations from two directions
|
||||
// This looks just like a 2 spin multiply and reuse same sequence from the Wilson
|
||||
// Kernel. But the spin index becomes a mu index instead.
|
||||
#define Chi_00 %zmm0
|
||||
#define Chi_01 %zmm1
|
||||
#define Chi_02 %zmm2
|
||||
#define Chi_10 %zmm3
|
||||
#define Chi_11 %zmm4
|
||||
#define Chi_12 %zmm5
|
||||
#define Chi_20 %zmm6
|
||||
#define Chi_21 %zmm7
|
||||
#define Chi_22 %zmm8
|
||||
#define Chi_30 %zmm9
|
||||
#define Chi_31 %zmm10
|
||||
#define Chi_32 %zmm11
|
||||
|
||||
#define UChi_00 %zmm12
|
||||
#define UChi_01 %zmm13
|
||||
#define UChi_02 %zmm14
|
||||
#define UChi_10 %zmm15
|
||||
#define UChi_11 %zmm16
|
||||
#define UChi_12 %zmm17
|
||||
#define UChi_20 %zmm18
|
||||
#define UChi_21 %zmm19
|
||||
#define UChi_22 %zmm20
|
||||
#define UChi_30 %zmm21
|
||||
#define UChi_31 %zmm22
|
||||
#define UChi_32 %zmm23
|
||||
|
||||
#define pChi_00 %%zmm0
|
||||
#define pChi_01 %%zmm1
|
||||
#define pChi_02 %%zmm2
|
||||
#define pChi_10 %%zmm3
|
||||
#define pChi_11 %%zmm4
|
||||
#define pChi_12 %%zmm5
|
||||
#define pChi_20 %%zmm6
|
||||
#define pChi_21 %%zmm7
|
||||
#define pChi_22 %%zmm8
|
||||
#define pChi_30 %%zmm9
|
||||
#define pChi_31 %%zmm10
|
||||
#define pChi_32 %%zmm11
|
||||
|
||||
#define pUChi_00 %%zmm12
|
||||
#define pUChi_01 %%zmm13
|
||||
#define pUChi_02 %%zmm14
|
||||
#define pUChi_10 %%zmm15
|
||||
#define pUChi_11 %%zmm16
|
||||
#define pUChi_12 %%zmm17
|
||||
#define pUChi_20 %%zmm18
|
||||
#define pUChi_21 %%zmm19
|
||||
#define pUChi_22 %%zmm20
|
||||
#define pUChi_30 %%zmm21
|
||||
#define pUChi_31 %%zmm22
|
||||
#define pUChi_32 %%zmm23
|
||||
|
||||
#define T0 %zmm24
|
||||
#define T1 %zmm25
|
||||
#define T2 %zmm26
|
||||
#define T3 %zmm27
|
||||
|
||||
#define Z00 %zmm26
|
||||
#define Z10 %zmm27
|
||||
#define Z0 Z00
|
||||
#define Z1 %zmm28
|
||||
#define Z2 %zmm29
|
||||
|
||||
#define Z3 %zmm30
|
||||
#define Z4 %zmm31
|
||||
#define Z5 Chi_31
|
||||
#define Z6 Chi_32
|
||||
|
||||
#define MULT_ADD_LS(g0,g1,g2,g3) \
|
||||
asm ( "movq %0, %%r8 \n\t" \
|
||||
"movq %1, %%r9 \n\t" \
|
||||
"movq %2, %%r10 \n\t" \
|
||||
"movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
|
||||
asm ( \
|
||||
VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
|
||||
VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
|
||||
VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10) \
|
||||
VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11) \
|
||||
VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12) \
|
||||
VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30) \
|
||||
VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31) \
|
||||
VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32) \
|
||||
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
|
||||
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
|
||||
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
|
||||
VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
|
||||
VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
|
||||
VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
|
||||
VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
|
||||
VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
|
||||
VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
|
||||
VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
|
||||
VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
|
||||
VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
|
||||
VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
|
||||
VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
|
||||
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
|
||||
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
|
||||
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
|
||||
VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
|
||||
VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
|
||||
VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
|
||||
VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
|
||||
VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
|
||||
VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
|
||||
VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
|
||||
VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
|
||||
VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
|
||||
VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
|
||||
VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
|
||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
|
||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
|
||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
|
||||
VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
|
||||
VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
|
||||
VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
|
||||
|
||||
#define MULT_LS(g0,g1,g2,g3) \
|
||||
asm ( "movq %0, %%r8 \n\t" \
|
||||
"movq %1, %%r9 \n\t" \
|
||||
"movq %2, %%r10 \n\t" \
|
||||
"movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
|
||||
asm ( \
|
||||
VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
|
||||
VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
|
||||
VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10) \
|
||||
VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11) \
|
||||
VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12) \
|
||||
VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30) \
|
||||
VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31) \
|
||||
VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32) \
|
||||
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
|
||||
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
|
||||
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
|
||||
VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
|
||||
VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
|
||||
VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
|
||||
VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
|
||||
VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
|
||||
VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
|
||||
VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
|
||||
VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
|
||||
VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
|
||||
VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
|
||||
VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
|
||||
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
|
||||
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
|
||||
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
|
||||
VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
|
||||
VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
|
||||
VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
|
||||
VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
|
||||
VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
|
||||
VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
|
||||
VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
|
||||
VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
|
||||
VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
|
||||
VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
|
||||
VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
|
||||
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
|
||||
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
|
||||
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
|
||||
VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
|
||||
VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
|
||||
VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
|
||||
|
||||
#define MULT_ADD_XYZTa(g0,g1) \
|
||||
asm ( "movq %0, %%r8 \n\t" \
|
||||
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
|
||||
__asm__ ( \
|
||||
VSHUF(Chi_00,T0) \
|
||||
VSHUF(Chi_10,T1) \
|
||||
VMOVIDUP(0,%r8,Z0 ) \
|
||||
VMOVIDUP(3,%r8,Z1 ) \
|
||||
VMOVIDUP(6,%r8,Z2 ) \
|
||||
VMADDSUB(Z0,T0,UChi_00) \
|
||||
VMADDSUB(Z1,T0,UChi_01) \
|
||||
VMADDSUB(Z2,T0,UChi_02) \
|
||||
\
|
||||
VMOVIDUP(0,%r9,Z0 ) \
|
||||
VMOVIDUP(3,%r9,Z1 ) \
|
||||
VMOVIDUP(6,%r9,Z2 ) \
|
||||
VMADDSUB(Z0,T1,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_12) \
|
||||
\
|
||||
\
|
||||
VMOVRDUP(0,%r8,Z3 ) \
|
||||
VMOVRDUP(3,%r8,Z4 ) \
|
||||
VMOVRDUP(6,%r8,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_00,UChi_00)/*rr * ir = ri rr*/ \
|
||||
VMADDSUB(Z4,Chi_00,UChi_01) \
|
||||
VMADDSUB(Z5,Chi_00,UChi_02) \
|
||||
\
|
||||
VMOVRDUP(0,%r9,Z3 ) \
|
||||
VMOVRDUP(3,%r9,Z4 ) \
|
||||
VMOVRDUP(6,%r9,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_10,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_10,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_10,UChi_12) \
|
||||
\
|
||||
\
|
||||
VMOVIDUP(1,%r8,Z0 ) \
|
||||
VMOVIDUP(4,%r8,Z1 ) \
|
||||
VMOVIDUP(7,%r8,Z2 ) \
|
||||
VSHUF(Chi_01,T0) \
|
||||
VMADDSUB(Z0,T0,UChi_00) \
|
||||
VMADDSUB(Z1,T0,UChi_01) \
|
||||
VMADDSUB(Z2,T0,UChi_02) \
|
||||
\
|
||||
VMOVIDUP(1,%r9,Z0 ) \
|
||||
VMOVIDUP(4,%r9,Z1 ) \
|
||||
VMOVIDUP(7,%r9,Z2 ) \
|
||||
VSHUF(Chi_11,T1) \
|
||||
VMADDSUB(Z0,T1,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_12) \
|
||||
\
|
||||
VMOVRDUP(1,%r8,Z3 ) \
|
||||
VMOVRDUP(4,%r8,Z4 ) \
|
||||
VMOVRDUP(7,%r8,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_01,UChi_00) \
|
||||
VMADDSUB(Z4,Chi_01,UChi_01) \
|
||||
VMADDSUB(Z5,Chi_01,UChi_02) \
|
||||
\
|
||||
VMOVRDUP(1,%r9,Z3 ) \
|
||||
VMOVRDUP(4,%r9,Z4 ) \
|
||||
VMOVRDUP(7,%r9,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_11,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_11,UChi_11) \
|
||||
VMADDSUB(Z5,Chi_11,UChi_12) \
|
||||
\
|
||||
VSHUF(Chi_02,T0) \
|
||||
VSHUF(Chi_12,T1) \
|
||||
VMOVIDUP(2,%r8,Z0 ) \
|
||||
VMOVIDUP(5,%r8,Z1 ) \
|
||||
VMOVIDUP(8,%r8,Z2 ) \
|
||||
VMADDSUB(Z0,T0,UChi_00) \
|
||||
VMADDSUB(Z1,T0,UChi_01) \
|
||||
VMADDSUB(Z2,T0,UChi_02) \
|
||||
VMOVIDUP(2,%r9,Z0 ) \
|
||||
VMOVIDUP(5,%r9,Z1 ) \
|
||||
VMOVIDUP(8,%r9,Z2 ) \
|
||||
VMADDSUB(Z0,T1,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_12) \
|
||||
/*55*/ \
|
||||
VMOVRDUP(2,%r8,Z3 ) \
|
||||
VMOVRDUP(5,%r8,Z4 ) \
|
||||
VMOVRDUP(8,%r8,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_02,UChi_00) \
|
||||
VMADDSUB(Z4,Chi_02,UChi_01) \
|
||||
VMADDSUB(Z5,Chi_02,UChi_02) \
|
||||
VMOVRDUP(2,%r9,Z3 ) \
|
||||
VMOVRDUP(5,%r9,Z4 ) \
|
||||
VMOVRDUP(8,%r9,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_12,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_12,UChi_11) \
|
||||
VMADDSUB(Z5,Chi_12,UChi_12) \
|
||||
/*61 insns*/ );
|
||||
|
||||
#define MULT_ADD_XYZT(g0,g1) \
|
||||
asm ( "movq %0, %%r8 \n\t" \
|
||||
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
|
||||
__asm__ ( \
|
||||
VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
|
||||
VRDUP(Chi_00,T0) VIDUP(Chi_00,Chi_00) \
|
||||
VRDUP(Chi_10,T1) VIDUP(Chi_10,Chi_10) \
|
||||
VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
|
||||
VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
|
||||
VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
|
||||
VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
|
||||
VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
|
||||
VMADDMEM(0,%r8,T0,UChi_00) VMADDMEM(0,%r9,T1,UChi_10) \
|
||||
VMADDMEM(3,%r8,T0,UChi_01) VMADDMEM(3,%r9,T1,UChi_11) \
|
||||
VMADDMEM(6,%r8,T0,UChi_02) VMADDMEM(6,%r9,T1,UChi_12) \
|
||||
VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
|
||||
VRDUP(Chi_01,T0) VIDUP(Chi_01,Chi_01) \
|
||||
VRDUP(Chi_11,T1) VIDUP(Chi_11,Chi_11) \
|
||||
VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
|
||||
VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
|
||||
VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
|
||||
VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
|
||||
VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
|
||||
VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
|
||||
VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
|
||||
VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
|
||||
VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
|
||||
VRDUP(Chi_02,T0) VIDUP(Chi_02,Chi_02) \
|
||||
VRDUP(Chi_12,T1) VIDUP(Chi_12,Chi_12) \
|
||||
VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
|
||||
VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
|
||||
VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
|
||||
VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
|
||||
VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
|
||||
VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
|
||||
VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
|
||||
VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
|
||||
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
|
||||
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
|
||||
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
|
||||
|
||||
#define MULT_XYZT(g0,g1) \
|
||||
asm ( "movq %0, %%r8 \n\t" \
|
||||
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
|
||||
__asm__ ( \
|
||||
VSHUF(Chi_00,T0) \
|
||||
VSHUF(Chi_10,T1) \
|
||||
VMOVIDUP(0,%r8,Z0 ) \
|
||||
VMOVIDUP(3,%r8,Z1 ) \
|
||||
VMOVIDUP(6,%r8,Z2 ) \
|
||||
/*6*/ \
|
||||
VMUL(Z0,T0,UChi_00) \
|
||||
VMUL(Z1,T0,UChi_01) \
|
||||
VMUL(Z2,T0,UChi_02) \
|
||||
VMOVIDUP(0,%r9,Z0 ) \
|
||||
VMOVIDUP(3,%r9,Z1 ) \
|
||||
VMOVIDUP(6,%r9,Z2 ) \
|
||||
VMUL(Z0,T1,UChi_10) \
|
||||
VMUL(Z1,T1,UChi_11) \
|
||||
VMUL(Z2,T1,UChi_12) \
|
||||
VMOVRDUP(0,%r8,Z3 ) \
|
||||
VMOVRDUP(3,%r8,Z4 ) \
|
||||
VMOVRDUP(6,%r8,Z5 ) \
|
||||
/*18*/ \
|
||||
VMADDSUB(Z3,Chi_00,UChi_00) \
|
||||
VMADDSUB(Z4,Chi_00,UChi_01)\
|
||||
VMADDSUB(Z5,Chi_00,UChi_02) \
|
||||
VMOVRDUP(0,%r9,Z3 ) \
|
||||
VMOVRDUP(3,%r9,Z4 ) \
|
||||
VMOVRDUP(6,%r9,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_10,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_10,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_10,UChi_12) \
|
||||
VMOVIDUP(1,%r8,Z0 ) \
|
||||
VMOVIDUP(4,%r8,Z1 ) \
|
||||
VMOVIDUP(7,%r8,Z2 ) \
|
||||
/*28*/ \
|
||||
VSHUF(Chi_01,T0) \
|
||||
VMADDSUB(Z0,T0,UChi_00) \
|
||||
VMADDSUB(Z1,T0,UChi_01) \
|
||||
VMADDSUB(Z2,T0,UChi_02) \
|
||||
VMOVIDUP(1,%r9,Z0 ) \
|
||||
VMOVIDUP(4,%r9,Z1 ) \
|
||||
VMOVIDUP(7,%r9,Z2 ) \
|
||||
VSHUF(Chi_11,T1) \
|
||||
VMADDSUB(Z0,T1,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_12) \
|
||||
VMOVRDUP(1,%r8,Z3 ) \
|
||||
VMOVRDUP(4,%r8,Z4 ) \
|
||||
VMOVRDUP(7,%r8,Z5 ) \
|
||||
/*38*/ \
|
||||
VMADDSUB(Z3,Chi_01,UChi_00) \
|
||||
VMADDSUB(Z4,Chi_01,UChi_01) \
|
||||
VMADDSUB(Z5,Chi_01,UChi_02) \
|
||||
VMOVRDUP(1,%r9,Z3 ) \
|
||||
VMOVRDUP(4,%r9,Z4 ) \
|
||||
VMOVRDUP(7,%r9,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_11,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_11,UChi_11) \
|
||||
VMADDSUB(Z5,Chi_11,UChi_12) \
|
||||
/*48*/ \
|
||||
VSHUF(Chi_02,T0) \
|
||||
VSHUF(Chi_12,T1) \
|
||||
VMOVIDUP(2,%r8,Z0 ) \
|
||||
VMOVIDUP(5,%r8,Z1 ) \
|
||||
VMOVIDUP(8,%r8,Z2 ) \
|
||||
VMADDSUB(Z0,T0,UChi_00) \
|
||||
VMADDSUB(Z1,T0,UChi_01) \
|
||||
VMADDSUB(Z2,T0,UChi_02) \
|
||||
VMOVIDUP(2,%r9,Z0 ) \
|
||||
VMOVIDUP(5,%r9,Z1 ) \
|
||||
VMOVIDUP(8,%r9,Z2 ) \
|
||||
VMADDSUB(Z0,T1,UChi_10) \
|
||||
VMADDSUB(Z1,T1,UChi_11) \
|
||||
VMADDSUB(Z2,T1,UChi_12) \
|
||||
/*55*/ \
|
||||
VMOVRDUP(2,%r8,Z3 ) \
|
||||
VMOVRDUP(5,%r8,Z4 ) \
|
||||
VMOVRDUP(8,%r8,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_02,UChi_00) \
|
||||
VMADDSUB(Z4,Chi_02,UChi_01) \
|
||||
VMADDSUB(Z5,Chi_02,UChi_02) \
|
||||
VMOVRDUP(2,%r9,Z3 ) \
|
||||
VMOVRDUP(5,%r9,Z4 ) \
|
||||
VMOVRDUP(8,%r9,Z5 ) \
|
||||
VMADDSUB(Z3,Chi_12,UChi_10) \
|
||||
VMADDSUB(Z4,Chi_12,UChi_11) \
|
||||
VMADDSUB(Z5,Chi_12,UChi_12) \
|
||||
/*61 insns*/ );
|
||||
|
||||
#define MULT_XYZTa(g0,g1) \
|
||||
asm ( "movq %0, %%r8 \n\t" \
|
||||
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
|
||||
__asm__ ( \
|
||||
VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
|
||||
VRDUP(Chi_00,T0) VIDUP(Chi_00,Chi_00) \
|
||||
VRDUP(Chi_10,T1) VIDUP(Chi_10,Chi_10) \
|
||||
VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
|
||||
VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
|
||||
VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
|
||||
VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
|
||||
VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
|
||||
VMULMEM(0,%r8,T0,UChi_00) VMULMEM(0,%r9,T1,UChi_10) \
|
||||
VMULMEM(3,%r8,T0,UChi_01) VMULMEM(3,%r9,T1,UChi_11) \
|
||||
VMULMEM(6,%r8,T0,UChi_02) VMULMEM(6,%r9,T1,UChi_12) \
|
||||
VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
|
||||
VRDUP(Chi_01,T0) VIDUP(Chi_01,Chi_01) \
|
||||
VRDUP(Chi_11,T1) VIDUP(Chi_11,Chi_11) \
|
||||
VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
|
||||
VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
|
||||
VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
|
||||
VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
|
||||
VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
|
||||
VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
|
||||
VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
|
||||
VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
|
||||
VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
|
||||
VRDUP(Chi_02,T0) VIDUP(Chi_02,Chi_02) \
|
||||
VRDUP(Chi_12,T1) VIDUP(Chi_12,Chi_12) \
|
||||
VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
|
||||
VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
|
||||
VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
|
||||
VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
|
||||
VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
|
||||
VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
|
||||
VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
|
||||
VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
|
||||
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
|
||||
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
|
||||
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
|
||||
|
||||
|
||||
#define LOAD_CHI(a0,a1,a2,a3) \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VLOAD(0,%%r8,pChi_00) \
|
||||
VLOAD(1,%%r8,pChi_01) \
|
||||
VLOAD(2,%%r8,pChi_02) \
|
||||
: : "r" (a0) : "%r8" ); \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VLOAD(0,%%r8,pChi_10) \
|
||||
VLOAD(1,%%r8,pChi_11) \
|
||||
VLOAD(2,%%r8,pChi_12) \
|
||||
: : "r" (a1) : "%r8" ); \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VLOAD(0,%%r8,pChi_20) \
|
||||
VLOAD(1,%%r8,pChi_21) \
|
||||
VLOAD(2,%%r8,pChi_22) \
|
||||
: : "r" (a2) : "%r8" ); \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VLOAD(0,%%r8,pChi_30) \
|
||||
VLOAD(1,%%r8,pChi_31) \
|
||||
VLOAD(2,%%r8,pChi_32) \
|
||||
: : "r" (a3) : "%r8" );
|
||||
|
||||
#define LOAD_CHIa(a0,a1) \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VLOAD(0,%%r8,pChi_00) \
|
||||
VLOAD(1,%%r8,pChi_01) \
|
||||
VLOAD(2,%%r8,pChi_02) \
|
||||
: : "r" (a0) : "%r8" ); \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VLOAD(0,%%r8,pChi_10) \
|
||||
VLOAD(1,%%r8,pChi_11) \
|
||||
VLOAD(2,%%r8,pChi_12) \
|
||||
: : "r" (a1) : "%r8" );
|
||||
|
||||
#define PF_CHI(a0)
|
||||
#define PF_CHIa(a0) \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VPREFETCH1(0,%%r8) \
|
||||
VPREFETCH1(1,%%r8) \
|
||||
VPREFETCH1(2,%%r8) \
|
||||
: : "r" (a0) : "%r8" ); \
|
||||
|
||||
#define PF_GAUGE_XYZT(a0)
|
||||
#define PF_GAUGE_XYZTa(a0) \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VPREFETCH1(0,%%r8) \
|
||||
VPREFETCH1(1,%%r8) \
|
||||
VPREFETCH1(2,%%r8) \
|
||||
VPREFETCH1(3,%%r8) \
|
||||
VPREFETCH1(4,%%r8) \
|
||||
VPREFETCH1(5,%%r8) \
|
||||
VPREFETCH1(6,%%r8) \
|
||||
VPREFETCH1(7,%%r8) \
|
||||
VPREFETCH1(8,%%r8) \
|
||||
: : "r" (a0) : "%r8" ); \
|
||||
|
||||
#define PF_GAUGE_LS(a0)
|
||||
#define PF_GAUGE_LSa(a0) \
|
||||
asm ( \
|
||||
"movq %0, %%r8 \n\t" \
|
||||
VPREFETCH1(0,%%r8) \
|
||||
VPREFETCH1(1,%%r8) \
|
||||
: : "r" (a0) : "%r8" ); \
|
||||
|
||||
|
||||
#define REDUCE(out) \
|
||||
asm ( \
|
||||
VADD(UChi_00,UChi_10,UChi_00) \
|
||||
VADD(UChi_01,UChi_11,UChi_01) \
|
||||
VADD(UChi_02,UChi_12,UChi_02) \
|
||||
VADD(UChi_30,UChi_20,UChi_30) \
|
||||
VADD(UChi_31,UChi_21,UChi_31) \
|
||||
VADD(UChi_32,UChi_22,UChi_32) \
|
||||
VADD(UChi_00,UChi_30,UChi_00) \
|
||||
VADD(UChi_01,UChi_31,UChi_01) \
|
||||
VADD(UChi_02,UChi_32,UChi_02) ); \
|
||||
asm ( \
|
||||
VSTORE(0,%0,pUChi_00) \
|
||||
VSTORE(1,%0,pUChi_01) \
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
#define nREDUCE(out) \
|
||||
asm ( \
|
||||
VADD(UChi_00,UChi_10,UChi_00) \
|
||||
VADD(UChi_01,UChi_11,UChi_01) \
|
||||
VADD(UChi_02,UChi_12,UChi_02) \
|
||||
VADD(UChi_30,UChi_20,UChi_30) \
|
||||
VADD(UChi_31,UChi_21,UChi_31) \
|
||||
VADD(UChi_32,UChi_22,UChi_32) \
|
||||
VADD(UChi_00,UChi_30,UChi_00) \
|
||||
VADD(UChi_01,UChi_31,UChi_01) \
|
||||
VADD(UChi_02,UChi_32,UChi_02) ); \
|
||||
asm (VZERO(Chi_00) \
|
||||
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||
asm ( \
|
||||
VSTORE(0,%0,pUChi_00) \
|
||||
VSTORE(1,%0,pUChi_01) \
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
#define REDUCEa(out) \
|
||||
asm ( \
|
||||
VADD(UChi_00,UChi_10,UChi_00) \
|
||||
VADD(UChi_01,UChi_11,UChi_01) \
|
||||
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||
asm ( \
|
||||
VSTORE(0,%0,pUChi_00) \
|
||||
VSTORE(1,%0,pUChi_01) \
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
// FIXME is sign right in the VSUB ?
|
||||
#define nREDUCEa(out) \
|
||||
asm ( \
|
||||
VADD(UChi_00,UChi_10,UChi_00) \
|
||||
VADD(UChi_01,UChi_11,UChi_01) \
|
||||
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||
asm (VZERO(Chi_00) \
|
||||
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||
asm ( \
|
||||
VSTORE(0,%0,pUChi_00) \
|
||||
VSTORE(1,%0,pUChi_01) \
|
||||
VSTORE(2,%0,pUChi_02) \
|
||||
: : "r" (out) : "memory" );
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_0,Chi_0);\
|
||||
permute##dir(Chi_1,Chi_1);\
|
||||
permute##dir(Chi_2,Chi_2);
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
assert(0);
|
||||
};
|
||||
|
||||
|
||||
//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in[o] ; } else { out =(uint64_t) &buf[o]; }
|
||||
|
||||
#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }
|
||||
|
||||
#define PREPARE_XYZT(X,Y,Z,T,skew,UU) \
|
||||
PREPARE(X,Y,Z,T,skew,UU); \
|
||||
PF_GAUGE_XYZT(gauge0); \
|
||||
PF_GAUGE_XYZT(gauge1); \
|
||||
PF_GAUGE_XYZT(gauge2); \
|
||||
PF_GAUGE_XYZT(gauge3);
|
||||
|
||||
#define PREPARE_LS(X,Y,Z,T,skew,UU) \
|
||||
PREPARE(X,Y,Z,T,skew,UU); \
|
||||
PF_GAUGE_LS(gauge0); \
|
||||
PF_GAUGE_LS(gauge1); \
|
||||
PF_GAUGE_LS(gauge2); \
|
||||
PF_GAUGE_LS(gauge3);
|
||||
|
||||
#define PREPARE(X,Y,Z,T,skew,UU) \
|
||||
SE0=st.GetEntry(ptype,X+skew,sF); \
|
||||
o0 = SE0->_offset; \
|
||||
l0 = SE0->_is_local; \
|
||||
p0 = SE0->_permute; \
|
||||
CONDITIONAL_MOVE(l0,o0,addr0); \
|
||||
PF_CHI(addr0); \
|
||||
\
|
||||
SE1=st.GetEntry(ptype,Y+skew,sF); \
|
||||
o1 = SE1->_offset; \
|
||||
l1 = SE1->_is_local; \
|
||||
p1 = SE1->_permute; \
|
||||
CONDITIONAL_MOVE(l1,o1,addr1); \
|
||||
PF_CHI(addr1); \
|
||||
\
|
||||
SE2=st.GetEntry(ptype,Z+skew,sF); \
|
||||
o2 = SE2->_offset; \
|
||||
l2 = SE2->_is_local; \
|
||||
p2 = SE2->_permute; \
|
||||
CONDITIONAL_MOVE(l2,o2,addr2); \
|
||||
PF_CHI(addr2); \
|
||||
\
|
||||
SE3=st.GetEntry(ptype,T+skew,sF); \
|
||||
o3 = SE3->_offset; \
|
||||
l3 = SE3->_is_local; \
|
||||
p3 = SE3->_permute; \
|
||||
CONDITIONAL_MOVE(l3,o3,addr3); \
|
||||
PF_CHI(addr3); \
|
||||
\
|
||||
gauge0 =(uint64_t)&UU[sU]( X ); \
|
||||
gauge1 =(uint64_t)&UU[sU]( Y ); \
|
||||
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
||||
gauge3 =(uint64_t)&UU[sU]( T );
|
||||
|
||||
// This is the single precision 5th direction vectorised kernel
|
||||
#include <simd/Intel512single.h>
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
uint64_t addr0,addr1,addr2,addr3;
|
||||
const SiteSpinor *in_p; in_p = &in[0];
|
||||
|
||||
int o0,o1,o2,o3; // offsets
|
||||
int l0,l1,l2,l3; // local
|
||||
int p0,p1,p2,p3; // perm
|
||||
int ptype;
|
||||
StencilEntry *SE0;
|
||||
StencilEntry *SE1;
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
|
||||
int sF=s+LLs*sU;
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out[sF];
|
||||
if ( dag ) {
|
||||
nREDUCE(addr0);
|
||||
} else {
|
||||
REDUCE(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
uint64_t addr0,addr1,addr2,addr3;
|
||||
const SiteSpinor *in_p; in_p = &in[0];
|
||||
|
||||
int o0,o1,o2,o3; // offsets
|
||||
int l0,l1,l2,l3; // local
|
||||
int p0,p1,p2,p3; // perm
|
||||
int ptype;
|
||||
StencilEntry *SE0;
|
||||
StencilEntry *SE1;
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out[sF];
|
||||
if ( dag ) {
|
||||
nREDUCE(addr0);
|
||||
} else {
|
||||
REDUCE(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#define PERMUTE_DIR3 __asm__ ( \
|
||||
VPERM3(Chi_00,Chi_00) \
|
||||
VPERM3(Chi_01,Chi_01) \
|
||||
VPERM3(Chi_02,Chi_02) );
|
||||
|
||||
#define PERMUTE_DIR2 __asm__ ( \
|
||||
VPERM2(Chi_10,Chi_10) \
|
||||
VPERM2(Chi_11,Chi_11) \
|
||||
VPERM2(Chi_12,Chi_12) );
|
||||
|
||||
#define PERMUTE_DIR1 __asm__ ( \
|
||||
VPERM1(Chi_00,Chi_00) \
|
||||
VPERM1(Chi_01,Chi_01) \
|
||||
VPERM1(Chi_02,Chi_02) );
|
||||
|
||||
#define PERMUTE_DIR0 __asm__ ( \
|
||||
VPERM0(Chi_10,Chi_10) \
|
||||
VPERM0(Chi_11,Chi_11) \
|
||||
VPERM0(Chi_12,Chi_12) );
|
||||
|
||||
#define PERMUTE01 \
|
||||
if ( p0 ) { PERMUTE_DIR3; }\
|
||||
if ( p1 ) { PERMUTE_DIR2; }
|
||||
|
||||
#define PERMUTE23 \
|
||||
if ( p2 ) { PERMUTE_DIR1; }\
|
||||
if ( p3 ) { PERMUTE_DIR0; }
|
||||
|
||||
// This is the single precision 5th direction vectorised kernel
|
||||
|
||||
#include <simd/Intel512single.h>
|
||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
uint64_t addr0,addr1,addr2,addr3;
|
||||
const SiteSpinor *in_p; in_p = &in[0];
|
||||
|
||||
int o0,o1,o2,o3; // offsets
|
||||
int l0,l1,l2,l3; // local
|
||||
int p0,p1,p2,p3; // perm
|
||||
int ptype;
|
||||
StencilEntry *SE0;
|
||||
StencilEntry *SE1;
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
|
||||
int sF=s+LLs*sU;
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_ADD_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_ADD_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_ADD_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out[sF];
|
||||
if ( dag ) {
|
||||
nREDUCEa(addr0);
|
||||
} else {
|
||||
REDUCEa(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U,
|
||||
DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
#ifdef AVX512
|
||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||
uint64_t addr0,addr1,addr2,addr3;
|
||||
const SiteSpinor *in_p; in_p = &in[0];
|
||||
|
||||
int o0,o1,o2,o3; // offsets
|
||||
int l0,l1,l2,l3; // local
|
||||
int p0,p1,p2,p3; // perm
|
||||
int ptype;
|
||||
StencilEntry *SE0;
|
||||
StencilEntry *SE1;
|
||||
StencilEntry *SE2;
|
||||
StencilEntry *SE3;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
|
||||
int sF=s+LLs*sU;
|
||||
// Xp, Yp, Zp, Tp
|
||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,0,U);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_ADD_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_ADD_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
|
||||
LOAD_CHIa(addr0,addr1);
|
||||
PERMUTE01;
|
||||
MULT_ADD_XYZT(gauge0,gauge1);
|
||||
LOAD_CHIa(addr2,addr3);
|
||||
PERMUTE23;
|
||||
MULT_ADD_XYZT(gauge2,gauge3);
|
||||
|
||||
addr0 = (uint64_t) &out[sF];
|
||||
if ( dag ) {
|
||||
nREDUCEa(addr0);
|
||||
} else {
|
||||
REDUCEa(addr0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL) \
|
||||
template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U, \
|
||||
DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, \
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||
|
||||
//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
||||
//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
||||
//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
|
||||
//KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -1,396 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define LOAD_CHI(b) \
|
||||
const SiteSpinor & ref (b[offset]); \
|
||||
Chi_0=ref()()(0);\
|
||||
Chi_1=ref()()(1);\
|
||||
Chi_2=ref()()(2);
|
||||
|
||||
|
||||
// To splat or not to splat depends on the implementation
|
||||
#define MULT(A,UChi) \
|
||||
auto & ref(U[sU](A)); \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
Impl::loadLinkElement(U_02,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_12,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_22,ref()(2,2)); \
|
||||
UChi ## _0 = U_00*Chi_0; \
|
||||
UChi ## _1 = U_10*Chi_0;\
|
||||
UChi ## _2 = U_20*Chi_0;\
|
||||
UChi ## _0 += U_01*Chi_1;\
|
||||
UChi ## _1 += U_11*Chi_1;\
|
||||
UChi ## _2 += U_21*Chi_1;\
|
||||
UChi ## _0 += U_02*Chi_2;\
|
||||
UChi ## _1 += U_12*Chi_2;\
|
||||
UChi ## _2 += U_22*Chi_2;
|
||||
|
||||
#define MULT_ADD(U,A,UChi) \
|
||||
auto & ref(U[sU](A)); \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
Impl::loadLinkElement(U_02,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_12,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_22,ref()(2,2)); \
|
||||
UChi ## _0 += U_00*Chi_0; \
|
||||
UChi ## _1 += U_10*Chi_0;\
|
||||
UChi ## _2 += U_20*Chi_0;\
|
||||
UChi ## _0 += U_01*Chi_1;\
|
||||
UChi ## _1 += U_11*Chi_1;\
|
||||
UChi ## _2 += U_21*Chi_1;\
|
||||
UChi ## _0 += U_02*Chi_2;\
|
||||
UChi ## _1 += U_12*Chi_2;\
|
||||
UChi ## _2 += U_22*Chi_2;
|
||||
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_0,Chi_0); \
|
||||
permute##dir(Chi_1,Chi_1); \
|
||||
permute##dir(Chi_2,Chi_2);
|
||||
|
||||
|
||||
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHI(in); \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(Perm); \
|
||||
} \
|
||||
} else { \
|
||||
LOAD_CHI(buf); \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
||||
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||
{ \
|
||||
MULT(Dir,even); \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
|
||||
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||
{ \
|
||||
MULT_ADD(U,Dir,even); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHI(in); \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(Perm); \
|
||||
} \
|
||||
} else if ( st.same_node[Dir] ) { \
|
||||
LOAD_CHI(buf); \
|
||||
} \
|
||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||
MULT_ADD(U,Dir,even); \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
|
||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||
nmu++; \
|
||||
{ LOAD_CHI(buf); } \
|
||||
{ MULT_ADD(U,Dir,even); } \
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG (U,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG (U,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
result()()(1) = - even_1 - odd_1;
|
||||
result()()(2) = - even_2 - odd_2;
|
||||
} else {
|
||||
result()()(0) = even_0 + odd_0;
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
vstream(out[sF],result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
int offset, ptype, local, perm;
|
||||
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
|
||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||
|
||||
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
result()()(1) = - even_1 - odd_1;
|
||||
result()()(2) = - even_2 - odd_2;
|
||||
} else {
|
||||
result()()(0) = even_0 + odd_0;
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
vstream(out[sF],result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||
SiteSpinor *buf, int LLs, int sU,
|
||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
Simd even_0; // 12 regs on knc
|
||||
Simd even_1;
|
||||
Simd even_2;
|
||||
Simd odd_0; // 12 regs on knc
|
||||
Simd odd_1;
|
||||
Simd odd_2;
|
||||
|
||||
Simd Chi_0; // two spinor; 6 regs
|
||||
Simd Chi_1;
|
||||
Simd Chi_2;
|
||||
|
||||
Simd U_00; // two rows of U matrix
|
||||
Simd U_10;
|
||||
Simd U_20;
|
||||
Simd U_01;
|
||||
Simd U_11;
|
||||
Simd U_21; // 2 reg left.
|
||||
Simd U_02;
|
||||
Simd U_12;
|
||||
Simd U_22;
|
||||
|
||||
SiteSpinor result;
|
||||
int offset, ptype, local, perm;
|
||||
|
||||
StencilEntry *SE;
|
||||
int skew;
|
||||
|
||||
for(int s=0;s<LLs;s++){
|
||||
int sF=s+LLs*sU;
|
||||
|
||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||
int nmu=0;
|
||||
skew = 0;
|
||||
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||
skew = 8;
|
||||
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||
|
||||
// Add sum of all exterior connected stencil legs
|
||||
if ( nmu ) {
|
||||
if ( dag ) {
|
||||
result()()(0) = - even_0 - odd_0;
|
||||
result()()(1) = - even_1 - odd_1;
|
||||
result()()(2) = - even_2 - odd_2;
|
||||
} else {
|
||||
result()()(0) = even_0 + odd_0;
|
||||
result()()(1) = even_1 + odd_1;
|
||||
result()()(2) = even_2 + odd_2;
|
||||
}
|
||||
out[sF] = out[sF] + result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
\
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
\
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
203
Grid/qcd/action/fermion/StaggeredVec5dImpl.h
Normal file
203
Grid/qcd/action/fermion/StaggeredVec5dImpl.h
Normal file
@ -0,0 +1,203 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template <class S, class Representation = FundamentalRepresentation >
|
||||
class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
||||
|
||||
public:
|
||||
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
static const bool LsVectorised=true;
|
||||
typedef RealD Coeff_t ;
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||
|
||||
//Necessary?
|
||||
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
||||
|
||||
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
|
||||
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
|
||||
|
||||
// Make the doubled gauge field a *scalar*
|
||||
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
|
||||
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
|
||||
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
typedef Lattice<SitePropagator> PropagatorField;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
|
||||
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
|
||||
typedef StaggeredImplParams ImplParams;
|
||||
typedef SimpleCompressor<SiteSpinor> Compressor;
|
||||
typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
|
||||
typedef typename StencilImpl::View_type StencilView;
|
||||
|
||||
ImplParams Params;
|
||||
|
||||
StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
|
||||
|
||||
template <class ref>
|
||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||
{
|
||||
vsplat(reg, memory);
|
||||
}
|
||||
|
||||
static accelerator_inline void multLink(SiteHalfSpinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SiteHalfSpinor &chi,
|
||||
int mu)
|
||||
{
|
||||
SiteGaugeLink UU;
|
||||
for (int i = 0; i < Dimension; i++) {
|
||||
for (int j = 0; j < Dimension; j++) {
|
||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
||||
}
|
||||
}
|
||||
mult(&phi(), &UU(), &chi());
|
||||
}
|
||||
static accelerator_inline void multLinkAdd(SiteHalfSpinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SiteHalfSpinor &chi,
|
||||
int mu)
|
||||
{
|
||||
SiteGaugeLink UU;
|
||||
for (int i = 0; i < Dimension; i++) {
|
||||
for (int j = 0; j < Dimension; j++) {
|
||||
vsplat(UU()()(i, j), U(mu)()(i, j));
|
||||
}
|
||||
}
|
||||
mac(&phi(), &UU(), &chi());
|
||||
}
|
||||
|
||||
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
||||
{
|
||||
GridBase *GaugeGrid = U_ds.Grid();
|
||||
thread_loop( (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++), {
|
||||
|
||||
SiteScalarGaugeLink ScalarU;
|
||||
SiteDoubledGaugeField ScalarUds;
|
||||
|
||||
Coordinate lcoor;
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
peekLocalSite(ScalarUds, U_ds, lcoor);
|
||||
|
||||
peekLocalSite(ScalarU, U, lcoor);
|
||||
ScalarUds(mu) = ScalarU();
|
||||
|
||||
});
|
||||
}
|
||||
inline void DoubleStore(GridBase *GaugeGrid,
|
||||
DoubledGaugeField &UUUds, // for Naik term
|
||||
DoubledGaugeField &Uds,
|
||||
const GaugeField &Uthin,
|
||||
const GaugeField &Ufat)
|
||||
{
|
||||
|
||||
GridBase * InputGrid = Uthin.Grid();
|
||||
conformable(InputGrid,Ufat.Grid());
|
||||
|
||||
GaugeLinkField U(InputGrid);
|
||||
GaugeLinkField UU(InputGrid);
|
||||
GaugeLinkField UUU(InputGrid);
|
||||
GaugeLinkField Udag(InputGrid);
|
||||
GaugeLinkField UUUdag(InputGrid);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
// Staggered Phase.
|
||||
Lattice<iScalar<vInteger> > coor(InputGrid);
|
||||
Lattice<iScalar<vInteger> > x(InputGrid); LatticeCoordinate(x,0);
|
||||
Lattice<iScalar<vInteger> > y(InputGrid); LatticeCoordinate(y,1);
|
||||
Lattice<iScalar<vInteger> > z(InputGrid); LatticeCoordinate(z,2);
|
||||
Lattice<iScalar<vInteger> > t(InputGrid); LatticeCoordinate(t,3);
|
||||
|
||||
Lattice<iScalar<vInteger> > lin_z(InputGrid); lin_z=x+y;
|
||||
Lattice<iScalar<vInteger> > lin_t(InputGrid); lin_t=x+y+z;
|
||||
|
||||
ComplexField phases(InputGrid); phases=1.0;
|
||||
|
||||
if ( mu == 1 ) phases = where( mod(x ,2)==(Integer)0, phases,-phases);
|
||||
if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
|
||||
if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
|
||||
|
||||
// 1 hop based on fat links
|
||||
U = PeekIndex<LorentzIndex>(Ufat, mu);
|
||||
Udag = adj( Cshift(U, mu, -1));
|
||||
|
||||
U = U *phases;
|
||||
Udag = Udag *phases;
|
||||
|
||||
InsertGaugeField(Uds,U,mu);
|
||||
InsertGaugeField(Uds,Udag,mu+4);
|
||||
|
||||
// 3 hop based on thin links. Crazy huh ?
|
||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||
UU = Gimpl::CovShiftForward(U,mu,U);
|
||||
UUU= Gimpl::CovShiftForward(U,mu,UU);
|
||||
|
||||
UUUdag = adj( Cshift(UUU, mu, -3));
|
||||
|
||||
UUU = UUU *phases;
|
||||
UUUdag = UUUdag *phases;
|
||||
|
||||
InsertGaugeField(UUUds,UUU,mu);
|
||||
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
|
||||
assert(0);
|
||||
}
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){
|
||||
assert (0);
|
||||
}
|
||||
};
|
||||
typedef StaggeredVec5dImpl<vComplex, FundamentalRepresentation > StaggeredVec5dImplR; // Real.. whichever prec
|
||||
typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF; // Float
|
||||
typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD; // Double
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,242 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// *NOT* EO
|
||||
template <class Impl>
|
||||
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
|
||||
{
|
||||
FermionField temp(out.Grid());
|
||||
|
||||
// Wilson term
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
this->Dhop(in, out, DaggerNo);
|
||||
|
||||
// Clover term
|
||||
Mooee(in, temp);
|
||||
|
||||
out += temp;
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
FermionField temp(out.Grid());
|
||||
|
||||
// Wilson term
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
this->Dhop(in, out, DaggerYes);
|
||||
|
||||
// Clover term
|
||||
MooeeDag(in, temp);
|
||||
|
||||
out += temp;
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
{
|
||||
WilsonFermion<Impl>::ImportGauge(_Umu);
|
||||
GridBase *grid = _Umu.Grid();
|
||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||
|
||||
// Compute the field strength terms mu>nu
|
||||
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
|
||||
|
||||
// Compute the Clover Operator acting on Colour and Spin
|
||||
// multiply here by the clover coefficients for the anisotropy
|
||||
CloverTerm = fillCloverYZ(Bx) * csw_r;
|
||||
CloverTerm += fillCloverXZ(By) * csw_r;
|
||||
CloverTerm += fillCloverXY(Bz) * csw_r;
|
||||
CloverTerm += fillCloverXT(Ex) * csw_t;
|
||||
CloverTerm += fillCloverYT(Ey) * csw_t;
|
||||
CloverTerm += fillCloverZT(Ez) * csw_t;
|
||||
CloverTerm += diag_mass;
|
||||
|
||||
int lvol = _Umu.Grid()->lSites();
|
||||
int DimRep = Impl::Dimension;
|
||||
|
||||
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
|
||||
Coordinate lcoor;
|
||||
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
||||
|
||||
for (int site = 0; site < lvol; site++)
|
||||
{
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
peekLocalSite(Qx, CloverTerm, lcoor);
|
||||
Qxinv = Zero();
|
||||
//if (csw!=0){
|
||||
for (int j = 0; j < Ns; j++)
|
||||
for (int k = 0; k < Ns; k++)
|
||||
for (int a = 0; a < DimRep; a++)
|
||||
for (int b = 0; b < DimRep; b++){
|
||||
auto zz = Qx()(j, k)(a, b);
|
||||
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
|
||||
}
|
||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
|
||||
|
||||
EigenInvCloverOp = EigenCloverOp.inverse();
|
||||
//std::cout << EigenInvCloverOp << std::endl;
|
||||
for (int j = 0; j < Ns; j++)
|
||||
for (int k = 0; k < Ns; k++)
|
||||
for (int a = 0; a < DimRep; a++)
|
||||
for (int b = 0; b < DimRep; b++)
|
||||
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||
// }
|
||||
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
|
||||
}
|
||||
|
||||
// Separate the even and odd parts
|
||||
pickCheckerboard(Even, CloverTermEven, CloverTerm);
|
||||
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
|
||||
|
||||
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
|
||||
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
|
||||
|
||||
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
|
||||
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
|
||||
|
||||
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
|
||||
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerNo, InverseNo);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerYes, InverseNo);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
CloverFieldType *Clover;
|
||||
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||
|
||||
if (dag)
|
||||
{
|
||||
if (in.Grid()->_isCheckerBoarded)
|
||||
{
|
||||
if (in.Checkerboard() == Odd)
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
|
||||
}
|
||||
out = *Clover * in;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInv : &CloverTerm;
|
||||
out = adj(*Clover) * in;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (in.Grid()->_isCheckerBoarded)
|
||||
{
|
||||
|
||||
if (in.Checkerboard() == Odd)
|
||||
{
|
||||
// std::cout << "Calling clover term Odd" << std::endl;
|
||||
Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
|
||||
}
|
||||
else
|
||||
{
|
||||
// std::cout << "Calling clover term Even" << std::endl;
|
||||
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
|
||||
}
|
||||
out = *Clover * in;
|
||||
// std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInv : &CloverTerm;
|
||||
out = *Clover * in;
|
||||
}
|
||||
}
|
||||
|
||||
} // MooeeInternal
|
||||
|
||||
|
||||
// Derivative parts
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
// Derivative parts
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||
{
|
||||
assert(0); // not implemented yet
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,596 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
|
||||
int WilsonFermionStatic::HandOptDslash;
|
||||
|
||||
/////////////////////////////////
|
||||
// Constructor and gauge import
|
||||
/////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||
const ImplParams &p,
|
||||
const WilsonAnisotropyCoefficients &anis)
|
||||
:
|
||||
Kernels(p),
|
||||
_grid(&Fgrid),
|
||||
_cbgrid(&Hgrid),
|
||||
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
|
||||
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
|
||||
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
|
||||
mass(_mass),
|
||||
Lebesgue(_grid),
|
||||
LebesgueEvenOdd(_cbgrid),
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd(&Hgrid),
|
||||
_tmp(&Hgrid),
|
||||
anisotropyCoeff(anis)
|
||||
{
|
||||
// Allocate the required comms buffer
|
||||
ImportGauge(_Umu);
|
||||
if (anisotropyCoeff.isAnisotropic){
|
||||
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
|
||||
} else {
|
||||
diag_mass = 4.0 + mass;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
{
|
||||
GaugeField HUmu(_Umu.Grid());
|
||||
|
||||
//Here multiply the anisotropy coefficients
|
||||
if (anisotropyCoeff.isAnisotropic)
|
||||
{
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
|
||||
if (mu != anisotropyCoeff.t_direction)
|
||||
U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
|
||||
|
||||
PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
HUmu = _Umu * (-0.5);
|
||||
}
|
||||
Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
|
||||
pickCheckerboard(Even, UmuEven, Umu);
|
||||
pickCheckerboard(Odd, UmuOdd, Umu);
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Implement the interface
|
||||
/////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerNo);
|
||||
return axpy_norm(out, diag_mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Dhop(in, out, DaggerYes);
|
||||
return axpy_norm(out, diag_mass, in, out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(diag_mass);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0/(diag_mass))*in;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in,out);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
|
||||
{
|
||||
typedef typename FermionField::vector_type vector_type;
|
||||
typedef typename FermionField::scalar_type ScalComplex;
|
||||
typedef Lattice<iSinglet<vector_type> > LatComplex;
|
||||
|
||||
// what type LatticeComplex
|
||||
conformable(_grid,out.Grid());
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
Coordinate latt_size = _grid->_fdimensions;
|
||||
|
||||
FermionField num (_grid); num = Zero();
|
||||
LatComplex wilson(_grid); wilson= Zero();
|
||||
LatComplex one (_grid); one = ScalComplex(1.0,0.0);
|
||||
|
||||
LatComplex denom(_grid); denom= Zero();
|
||||
LatComplex kmu(_grid);
|
||||
ScalComplex ci(0.0,1.0);
|
||||
// momphase = n * 2pi / L
|
||||
for(int mu=0;mu<Nd;mu++) {
|
||||
|
||||
LatticeCoordinate(kmu,mu);
|
||||
|
||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||
|
||||
kmu = TwoPiL * kmu;
|
||||
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
|
||||
|
||||
wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
|
||||
|
||||
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); // derivative term
|
||||
|
||||
denom=denom + sin(kmu)*sin(kmu);
|
||||
}
|
||||
|
||||
wilson = wilson + _m; // 2 sin^2 k/2 + m
|
||||
|
||||
num = num + wilson*in; // -i gmu sin k + 2 sin^2 k/2 + m
|
||||
|
||||
denom= denom+wilson*wilson; // sin^2 k + (2 sin^2 k/2 + m)^2
|
||||
|
||||
denom= one/denom;
|
||||
|
||||
out = num*denom; // [ -i gmu sin k + 2 sin^2 k/2 + m] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ]
|
||||
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////
|
||||
// Internal
|
||||
///////////////////////////////////
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
GaugeField &mat, const FermionField &A,
|
||||
const FermionField &B, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
FermionField Btilde(B.Grid());
|
||||
FermionField Atilde(B.Grid());
|
||||
Atilde = A;
|
||||
|
||||
st.HaloExchange(B, compressor);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Flip gamma (1+g)<->(1-g) if dag
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
int gamma = mu;
|
||||
if (!dag) gamma += Nd;
|
||||
|
||||
////////////////////////
|
||||
// Call the single hop
|
||||
////////////////////////
|
||||
auto U_v = U.View();
|
||||
auto B_v = B.View();
|
||||
auto Btilde_v = Btilde.View();
|
||||
auto st_v = st.View();
|
||||
thread_loop( (int sss = 0; sss < B.Grid()->oSites(); sss++) ,{
|
||||
Kernels::DhopDirK(st_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu, gamma);
|
||||
});
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// spin trace outer product
|
||||
//////////////////////////////////////////////////
|
||||
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
conformable(U.Grid(), _grid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
conformable(U.Grid(), mat.Grid());
|
||||
|
||||
mat.Checkerboard() = U.Checkerboard();
|
||||
|
||||
DerivInternal(Stencil, Umu, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
conformable(U.Grid(), _cbgrid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
//conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
|
||||
// Motivation: look at the SchurDiff operator
|
||||
|
||||
assert(V.Checkerboard() == Even);
|
||||
assert(U.Checkerboard() == Odd);
|
||||
mat.Checkerboard() = Odd;
|
||||
|
||||
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||
conformable(U.Grid(), _cbgrid);
|
||||
conformable(U.Grid(), V.Grid());
|
||||
//conformable(U.Grid(), mat.Grid());
|
||||
|
||||
assert(V.Checkerboard() == Odd);
|
||||
assert(U.Checkerboard() == Even);
|
||||
mat.Checkerboard() = Even;
|
||||
|
||||
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
|
||||
conformable(in.Grid(), _grid); // verifies full grid
|
||||
conformable(in.Grid(), out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
|
||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard() == Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
|
||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||
|
||||
assert(in.Checkerboard() == Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
|
||||
DhopDir(in, out, dir, disp);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
|
||||
int skip = (disp == 1) ? 0 : 1;
|
||||
int dirdisp = dir + skip * 4;
|
||||
int gamma = dir + (1 - skip) * 4;
|
||||
|
||||
DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag)
|
||||
{
|
||||
Compressor compressor(dag);
|
||||
|
||||
Stencil.HaloExchange(in, compressor);
|
||||
auto in_v = in.View();
|
||||
auto out_v = in.View();
|
||||
auto Umu_v = Umu.View();
|
||||
auto Stencil_v = Stencil.View();
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||
Kernels::DhopDirK(Stencil_v, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dirdisp, gamma);
|
||||
});
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
#ifdef GRID_OMP
|
||||
Compressor compressor(dag);
|
||||
int len = U.Grid()->oSites();
|
||||
const int LLs = 1;
|
||||
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
st.CommsMergeSHM(compressor);
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = len;
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
// do the compute
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
|
||||
if (dag == DaggerYes) {
|
||||
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||
Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
|
||||
// Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
|
||||
}
|
||||
} else {
|
||||
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||
Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
|
||||
// Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
st.CommunicateThreaded();
|
||||
}
|
||||
} //pragma
|
||||
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
if (dag == DaggerYes) {
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||
Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||
Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
|
||||
});
|
||||
}
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
Compressor compressor(dag);
|
||||
st.HaloExchange(in, compressor);
|
||||
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v= out.View();
|
||||
auto st_v = st.View();
|
||||
if (dag == DaggerYes) {
|
||||
accelerator_loop( sss,in_v, {
|
||||
Kernels::DhopSiteDag(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
|
||||
});
|
||||
} else {
|
||||
accelerator_loop( sss,in_v, {
|
||||
Kernels::DhopSite(Opt,st_v, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
|
||||
});
|
||||
}
|
||||
};
|
||||
/*Change ends */
|
||||
|
||||
/*******************************************************************************
|
||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||
* to make a conserved current sink or inserting the conserved current
|
||||
* sequentially.
|
||||
******************************************************************************/
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||
PropagatorField &q_in_2,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu)
|
||||
{
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
conformable(_grid, q_in_1.Grid());
|
||||
conformable(_grid, q_in_2.Grid());
|
||||
conformable(_grid, q_out.Grid());
|
||||
PropagatorField tmp1(_grid), tmp2(_grid);
|
||||
q_out = Zero();
|
||||
|
||||
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
|
||||
// Inefficient comms method but not performance critical.
|
||||
tmp1 = Cshift(q_in_1, mu, 1);
|
||||
tmp2 = Cshift(q_in_2, mu, 1);
|
||||
auto tmp1_v = tmp1.View();
|
||||
auto tmp2_v = tmp2.View();
|
||||
auto q_in_1_v=q_in_1.View();
|
||||
auto q_in_2_v=q_in_2.View();
|
||||
auto q_out_v = q_out.View();
|
||||
auto Umu_v = Umu.View();
|
||||
thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
|
||||
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
|
||||
q_in_2_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu);
|
||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
|
||||
tmp2_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||
PropagatorField &q_out,
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax,
|
||||
ComplexField &lattice_cmplx)
|
||||
{
|
||||
conformable(_grid, q_in.Grid());
|
||||
conformable(_grid, q_out.Grid());
|
||||
|
||||
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
|
||||
Complex i(0.0,1.0);
|
||||
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
|
||||
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
||||
|
||||
q_out = Zero();
|
||||
LatticeInteger coords(_grid);
|
||||
LatticeCoordinate(coords, Tp);
|
||||
|
||||
// Need q(x + mu) and q(x - mu).
|
||||
tmp = Cshift(q_in, mu, 1);
|
||||
tmpFwd = tmp*lattice_cmplx;
|
||||
tmp = lattice_cmplx*q_in;
|
||||
tmpBwd = Cshift(tmp, mu, -1);
|
||||
|
||||
auto coords_v = coords.View();
|
||||
auto tmpFwd_v = tmpFwd.View();
|
||||
auto tmpBwd_v = tmpBwd.View();
|
||||
auto Umu_v = Umu.View();
|
||||
auto q_out_v = q_out.View();
|
||||
|
||||
thread_loop( (unsigned int sU = 0; sU < Umu.Grid()->oSites(); ++sU), {
|
||||
|
||||
// Compute the sequential conserved current insertion only if our simd
|
||||
// object contains a timeslice we need.
|
||||
vInteger t_mask = ((coords_v[sU] >= tmin) &&
|
||||
(coords_v[sU] <= tmax));
|
||||
Integer timeSlices = Reduce(t_mask);
|
||||
|
||||
if (timeSlices > 0) {
|
||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu, t_mask);
|
||||
}
|
||||
|
||||
// Repeat for backward direction.
|
||||
t_mask = ((coords_v[sU] >= (tmin + tshift)) &&
|
||||
(coords_v[sU] <= (tmax + tshift)));
|
||||
|
||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
||||
unsigned int t0 = 0;
|
||||
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords_v[sU] == t0 ));
|
||||
|
||||
timeSlices = Reduce(t_mask);
|
||||
|
||||
if (timeSlices > 0) {
|
||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
|
||||
q_out_v[sU],
|
||||
Umu_v, sU, mu, t_mask);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonFermion);
|
||||
AdjointFermOpTemplateInstantiate(WilsonFermion);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonFermion);
|
||||
GparityFermOpTemplateInstantiate(WilsonFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
File diff suppressed because it is too large
Load Diff
@ -227,8 +227,8 @@ public:
|
||||
Current curr_type,
|
||||
unsigned int mu,
|
||||
unsigned int tmin,
|
||||
unsigned int tmax,
|
||||
ComplexField &lattice_cmplx);
|
||||
unsigned int tmax,
|
||||
ComplexField &lattice_cmplx);
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -90,15 +90,7 @@ public:
|
||||
auto UU = coalescedRead(U(mu));
|
||||
mult(&phi(), &UU, &chi());
|
||||
}
|
||||
|
||||
#ifdef GPU_VEC
|
||||
static accelerator_inline void copyLinkGpu(int lane,
|
||||
SiteDoubledGaugeField & UU,
|
||||
const SiteDoubledGaugeField &U)
|
||||
{
|
||||
auto U_l = extractLane(lane,U);
|
||||
insertLane(lane,UU,U_l);
|
||||
}
|
||||
|
||||
static accelerator_inline void multLinkGpu(int lane,
|
||||
typename SiteHalfSpinor::scalar_object &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
@ -108,17 +100,6 @@ public:
|
||||
auto U_l = extractLane(lane,U(mu));
|
||||
phi() = U_l * chi();
|
||||
}
|
||||
#else
|
||||
static accelerator_inline void multLinkGpu(int lane,
|
||||
SiteHalfSpinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const SiteHalfSpinor &chi,
|
||||
int mu)
|
||||
{
|
||||
auto U_l = U(mu);
|
||||
phi() = U_l * chi();
|
||||
}
|
||||
#endif
|
||||
|
||||
static accelerator_inline void multLinkProp(SitePropagator &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
|
@ -1,445 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric;
|
||||
int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
|
||||
|
||||
////////////////////////////////////////////
|
||||
// Generic implementation; move to different file?
|
||||
////////////////////////////////////////////
|
||||
|
||||
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
if (SE->_is_local) { \
|
||||
chi_p = χ \
|
||||
if (SE->_permute) { \
|
||||
spProj(tmp, in[SE->_offset]); \
|
||||
permute(chi, tmp, ptype); \
|
||||
} else { \
|
||||
spProj(chi, in[SE->_offset]); \
|
||||
} \
|
||||
} else { \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
} \
|
||||
Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
if (SE->_is_local) { \
|
||||
chi_p = χ \
|
||||
if (SE->_permute) { \
|
||||
spProj(tmp, in[SE->_offset]); \
|
||||
permute(chi, tmp, ptype); \
|
||||
} else { \
|
||||
spProj(chi, in[SE->_offset]); \
|
||||
} \
|
||||
} else if ( st.same_node[Dir] ) { \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
} \
|
||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||
Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st); \
|
||||
Recon(result, Uchi); \
|
||||
}
|
||||
|
||||
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
|
||||
SE = st.GetEntry(ptype, Dir, sF); \
|
||||
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||
chi_p = &buf[SE->_offset]; \
|
||||
Impl::multLink(Uchi, U[sU], *chi_p, Dir, SE, st); \
|
||||
Recon(result, Uchi); \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
|
||||
if (gamma == Dir) { \
|
||||
if (SE->_is_local && SE->_permute) { \
|
||||
spProj(tmp, in[SE->_offset]); \
|
||||
permute(chi, tmp, ptype); \
|
||||
} else if (SE->_is_local) { \
|
||||
spProj(chi, in[SE->_offset]); \
|
||||
} else { \
|
||||
chi = buf[SE->_offset]; \
|
||||
} \
|
||||
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
|
||||
Recon(result, Uchi); \
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// All legs kernels ; comms then compute
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
|
||||
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
|
||||
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
|
||||
GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
|
||||
GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
|
||||
GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
|
||||
GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
|
||||
GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
|
||||
vstream(out[sF], result);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
|
||||
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
|
||||
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
|
||||
GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
|
||||
GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
|
||||
GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
|
||||
GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
|
||||
GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
|
||||
vstream(out[sF], result);
|
||||
};
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Interior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
|
||||
GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
|
||||
GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
|
||||
GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
|
||||
GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
|
||||
GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
|
||||
GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
|
||||
vstream(out[sF], result);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
|
||||
GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
|
||||
GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
|
||||
GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
|
||||
GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
|
||||
GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
|
||||
GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
|
||||
vstream(out[sF], result);
|
||||
};
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Exterior kernels
|
||||
////////////////////////////////////////////////////////////////////
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
// SiteHalfSpinor tmp;
|
||||
// SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
|
||||
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
|
||||
GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
|
||||
GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
|
||||
GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
|
||||
GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
|
||||
GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
|
||||
if ( nmu ) {
|
||||
out[sF] = out[sF] + result;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
// SiteHalfSpinor tmp;
|
||||
// SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
int nmu=0;
|
||||
result=Zero();
|
||||
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
|
||||
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
|
||||
GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
|
||||
GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
|
||||
GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
|
||||
GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
|
||||
GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
|
||||
GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
|
||||
if ( nmu ) {
|
||||
out[sF] = out[sF] + result;
|
||||
}
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||
{
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteSpinor result;
|
||||
SiteHalfSpinor Uchi;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
|
||||
SE = st.GetEntry(ptype, dir, sF);
|
||||
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
|
||||
GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
|
||||
GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
|
||||
GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
|
||||
GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
|
||||
GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
|
||||
GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
|
||||
GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
|
||||
vstream(out[sF], result);
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||
* to make a conserved current sink or inserting the conserved current
|
||||
* sequentially. Common to both 4D and 5D.
|
||||
******************************************************************************/
|
||||
// N.B. Functions below assume a -1/2 factor within U.
|
||||
#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
|
||||
#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: ContractConservedCurrentSiteFwd
|
||||
* Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in_1 shifted in +ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
|
||||
const SitePropagator &q_in_1,
|
||||
const SitePropagator &q_in_2,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeFieldView &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result, tmp;
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
|
||||
Impl::multLinkProp(tmp, U[sU], q_in_1, mu);
|
||||
|
||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
|
||||
|
||||
if (switch_sign) {
|
||||
q_out -= result;
|
||||
} else {
|
||||
q_out += result;
|
||||
}
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: ContractConservedCurrentSiteBwd
|
||||
* Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in_2 shifted in +ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
|
||||
const SitePropagator &q_in_1,
|
||||
const SitePropagator &q_in_2,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeFieldView &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result, tmp;
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
|
||||
Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd);
|
||||
|
||||
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
|
||||
if (switch_sign) {
|
||||
q_out += result;
|
||||
} else {
|
||||
q_out -= result;
|
||||
}
|
||||
}
|
||||
|
||||
// G-parity requires more specialised implementation.
|
||||
#define NO_CURR_SITE(Impl) \
|
||||
template <> \
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
|
||||
const SitePropagator &q_in_1, \
|
||||
const SitePropagator &q_in_2, \
|
||||
SitePropagator &q_out, \
|
||||
DoubledGaugeFieldView &U, \
|
||||
unsigned int sU, \
|
||||
unsigned int mu, \
|
||||
bool switch_sign) \
|
||||
{ \
|
||||
assert(0); \
|
||||
} \
|
||||
template <> \
|
||||
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
|
||||
const SitePropagator &q_in_1, \
|
||||
const SitePropagator &q_in_2, \
|
||||
SitePropagator &q_out, \
|
||||
DoubledGaugeFieldView &U, \
|
||||
unsigned int mu, \
|
||||
unsigned int sU, \
|
||||
bool switch_sign) \
|
||||
{ \
|
||||
assert(0); \
|
||||
}
|
||||
|
||||
NO_CURR_SITE(GparityWilsonImplF);
|
||||
NO_CURR_SITE(GparityWilsonImplD);
|
||||
NO_CURR_SITE(GparityWilsonImplFH);
|
||||
NO_CURR_SITE(GparityWilsonImplDF);
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: SeqConservedCurrentSiteFwd
|
||||
* Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in shifted in +ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeFieldView &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
vInteger t_mask,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result;
|
||||
|
||||
Impl::multLinkProp(result, U[sU], q_in, mu);
|
||||
result = WilsonCurrentFwd(result, mu);
|
||||
|
||||
// Zero any unwanted timeslice entries.
|
||||
result = predicatedWhere(t_mask, result, 0.*result);
|
||||
|
||||
if (switch_sign) {
|
||||
q_out -= result;
|
||||
} else {
|
||||
q_out += result;
|
||||
}
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
* Name: SeqConservedCurrentSiteFwd
|
||||
* Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
|
||||
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
|
||||
* - Pass in q_in shifted in -ve mu direction.
|
||||
******************************************************************************/
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
|
||||
SitePropagator &q_out,
|
||||
DoubledGaugeFieldView &U,
|
||||
unsigned int sU,
|
||||
unsigned int mu,
|
||||
vInteger t_mask,
|
||||
bool switch_sign)
|
||||
{
|
||||
SitePropagator result;
|
||||
Impl::multLinkProp(result, U[sU], q_in, mu + Nd);
|
||||
result = WilsonCurrentBwd(result, mu);
|
||||
|
||||
// Zero any unwanted timeslice entries.
|
||||
result = predicatedWhere(t_mask, result, 0.*result);
|
||||
|
||||
if (switch_sign) {
|
||||
q_out += result;
|
||||
} else {
|
||||
q_out -= result;
|
||||
}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonKernels);
|
||||
AdjointFermOpTemplateInstantiate(WilsonKernels);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -107,7 +107,7 @@ private:
|
||||
int Ls,int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||
|
||||
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||
|
||||
static accelerator void GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor * buf,
|
||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out);
|
||||
|
@ -1,125 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Default to no assembler implementation
|
||||
///////////////////////////////////////////////////////////
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
|
||||
|
||||
#define INSTANTIATE_ASM(A) \
|
||||
template void WilsonKernels<A>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
|
||||
\
|
||||
template void WilsonKernels<A>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
|
||||
template void WilsonKernels<A>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
|
||||
\
|
||||
template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
|
||||
template void WilsonKernels<A>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
|
||||
\
|
||||
template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,\
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out);\
|
||||
|
||||
//INSTANTIATE_ASM(WilsonImplF);
|
||||
//INSTANTIATE_ASM(WilsonImplD);
|
||||
INSTANTIATE_ASM(GparityWilsonImplF);
|
||||
INSTANTIATE_ASM(GparityWilsonImplD);
|
||||
//INSTANTIATE_ASM(ZWilsonImplF);
|
||||
//INSTANTIATE_ASM(ZWilsonImplD);
|
||||
//INSTANTIATE_ASM(DomainWallVec5dImplF);
|
||||
//INSTANTIATE_ASM(DomainWallVec5dImplD);
|
||||
//INSTANTIATE_ASM(ZDomainWallVec5dImplF);
|
||||
//INSTANTIATE_ASM(ZDomainWallVec5dImplD);
|
||||
|
||||
//INSTANTIATE_ASM(WilsonImplFH);
|
||||
//INSTANTIATE_ASM(WilsonImplDF);
|
||||
//INSTANTIATE_ASM(ZWilsonImplFH);
|
||||
//INSTANTIATE_ASM(ZWilsonImplDF);
|
||||
INSTANTIATE_ASM(GparityWilsonImplFH);
|
||||
INSTANTIATE_ASM(GparityWilsonImplDF);
|
||||
//INSTANTIATE_ASM(DomainWallVec5dImplFH);
|
||||
//INSTANTIATE_ASM(DomainWallVec5dImplDF);
|
||||
//INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
|
||||
//INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -1,650 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
|
||||
#if defined(AVX512)
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are AVX512 specialise the single precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
#include <simd/Intel512wilson.h>
|
||||
#include <simd/Intel512single.h>
|
||||
|
||||
static Vector<vComplexF> signsF;
|
||||
|
||||
template<typename vtype>
|
||||
int setupSigns(Vector<vtype>& signs ){
|
||||
Vector<vtype> bother(2);
|
||||
signs = bother;
|
||||
vrsign(signs[0]);
|
||||
visign(signs[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int signInitF = setupSigns(signsF);
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
#undef MULT_2SPIN
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef COMPLEX_SIGNS
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are AVX512 specialise the double precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
|
||||
static Vector<vComplexD> signsD;
|
||||
static int signInitD = setupSigns(signsD);
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||
#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
|
||||
|
||||
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
#undef MULT_2SPIN
|
||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#define INTERIOR
|
||||
#undef EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#define EXTERIOR
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
template<> void
|
||||
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef COMPLEX_SIGNS
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
#endif //AVX512
|
@ -1,198 +0,0 @@
|
||||
#ifdef KERNEL_DAG
|
||||
#define DIR0_PROJMEM(base) XP_PROJMEM(base);
|
||||
#define DIR1_PROJMEM(base) YP_PROJMEM(base);
|
||||
#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
|
||||
#define DIR3_PROJMEM(base) TP_PROJMEM(base);
|
||||
#define DIR4_PROJMEM(base) XM_PROJMEM(base);
|
||||
#define DIR5_PROJMEM(base) YM_PROJMEM(base);
|
||||
#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
|
||||
#define DIR7_PROJMEM(base) TM_PROJMEM(base);
|
||||
#define DIR0_RECON XP_RECON
|
||||
#define DIR1_RECON YP_RECON_ACCUM
|
||||
#define DIR2_RECON ZP_RECON_ACCUM
|
||||
#define DIR3_RECON TP_RECON_ACCUM
|
||||
#define DIR4_RECON XM_RECON_ACCUM
|
||||
#define DIR5_RECON YM_RECON_ACCUM
|
||||
#define DIR6_RECON ZM_RECON_ACCUM
|
||||
#define DIR7_RECON TM_RECON_ACCUM
|
||||
#else
|
||||
#define DIR0_PROJMEM(base) XM_PROJMEM(base);
|
||||
#define DIR1_PROJMEM(base) YM_PROJMEM(base);
|
||||
#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
|
||||
#define DIR3_PROJMEM(base) TM_PROJMEM(base);
|
||||
#define DIR4_PROJMEM(base) XP_PROJMEM(base);
|
||||
#define DIR5_PROJMEM(base) YP_PROJMEM(base);
|
||||
#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
|
||||
#define DIR7_PROJMEM(base) TP_PROJMEM(base);
|
||||
#define DIR0_RECON XM_RECON
|
||||
#define DIR1_RECON YM_RECON_ACCUM
|
||||
#define DIR2_RECON ZM_RECON_ACCUM
|
||||
#define DIR3_RECON TM_RECON_ACCUM
|
||||
#define DIR4_RECON XP_RECON_ACCUM
|
||||
#define DIR5_RECON YP_RECON_ACCUM
|
||||
#define DIR6_RECON ZP_RECON_ACCUM
|
||||
#define DIR7_RECON TP_RECON_ACCUM
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Comms then compute kernel
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef INTERIOR_AND_EXTERIOR
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
LOAD64(%r10,isigns); \
|
||||
PROJ(base); \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
} else { \
|
||||
LOAD_CHI(base); \
|
||||
} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
PREFETCH_CHIMU(base); \
|
||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
||||
LOAD64(%r10,isigns); \
|
||||
RECON; \
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PF_GAUGE(Xp); \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Pre comms kernel -- prefetch like normal because it is mostly right
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef INTERIOR
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||
if ( local ) { \
|
||||
LOAD64(%r10,isigns); \
|
||||
PROJ(base); \
|
||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||
if ( local || st.same_node[Dir] ) { \
|
||||
MULT_2SPIN_DIR_PF(Dir,basep); \
|
||||
LOAD64(%r10,isigns); \
|
||||
RECON; \
|
||||
} \
|
||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||
PREFETCH_CHIMU(base); \
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
PF_GAUGE(Xp); \
|
||||
PREFETCH1_CHIMU(base); \
|
||||
{ ZERO_PSI; } \
|
||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||
|
||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||
|
||||
#endif
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Post comms kernel
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef EXTERIOR
|
||||
|
||||
|
||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_DIR_PF(Dir,base); \
|
||||
LOAD64(%r10,isigns); \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||
nmu=0; \
|
||||
{ ZERO_PSI;} \
|
||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||
LOAD_CHI(base); \
|
||||
MULT_2SPIN_DIR_PF(Dir,base); \
|
||||
LOAD64(%r10,isigns); \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
||||
|
||||
#endif
|
||||
{
|
||||
int nmu;
|
||||
int local,perm, ptype;
|
||||
uint64_t base;
|
||||
uint64_t basep;
|
||||
const uint64_t plocal =(uint64_t) & in[0];
|
||||
|
||||
COMPLEX_SIGNS(isigns);
|
||||
MASK_REGS;
|
||||
int nmax=U.oSites();
|
||||
for(int site=0;site<Ns;site++) {
|
||||
#ifndef EXTERIOR
|
||||
// int sU =lo.Reorder(ssU);
|
||||
int sU =ssU;
|
||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||
// int sUn=lo.Reorder(ssn);
|
||||
int sUn=ssn;
|
||||
LOCK_GAUGE(0);
|
||||
#else
|
||||
int sU =ssU;
|
||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||
int sUn=ssn;
|
||||
#endif
|
||||
for(int s=0;s<Ls;s++) {
|
||||
ss =sU*Ls+s;
|
||||
ssn=sUn*Ls+s;
|
||||
int ent=ss*8;// 2*Ndim
|
||||
int nent=ssn*8;
|
||||
|
||||
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
|
||||
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
|
||||
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
|
||||
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
|
||||
|
||||
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
|
||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
|
||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
|
||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
|
||||
|
||||
#ifdef EXTERIOR
|
||||
if (nmu==0) break;
|
||||
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
|
||||
#endif
|
||||
base = (uint64_t) &out[ss];
|
||||
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||
RESULT(base,basep);
|
||||
}
|
||||
ssU++;
|
||||
UNLOCK_GAUGE(0);
|
||||
}
|
||||
}
|
||||
|
||||
#undef DIR0_PROJMEM
|
||||
#undef DIR1_PROJMEM
|
||||
#undef DIR2_PROJMEM
|
||||
#undef DIR3_PROJMEM
|
||||
#undef DIR4_PROJMEM
|
||||
#undef DIR5_PROJMEM
|
||||
#undef DIR6_PROJMEM
|
||||
#undef DIR7_PROJMEM
|
||||
#undef DIR0_RECON
|
||||
#undef DIR1_RECON
|
||||
#undef DIR2_RECON
|
||||
#undef DIR3_RECON
|
||||
#undef DIR4_RECON
|
||||
#undef DIR5_RECON
|
||||
#undef DIR6_RECON
|
||||
#undef DIR7_RECON
|
||||
#undef ASM_LEG
|
||||
#undef ASM_LEG_XP
|
||||
#undef RESULT
|
@ -1,161 +0,0 @@
|
||||
{
|
||||
int locala,perma, ptypea;
|
||||
int localb,permb, ptypeb;
|
||||
uint64_t basea, baseb;
|
||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||
|
||||
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||
vComplexF *isigns = &signs[0];
|
||||
|
||||
MASK_REGS;
|
||||
|
||||
for(int site=0;site<Ns;site++) {
|
||||
int sU=lo.Reorder(ssU);
|
||||
for(int s=0;s<Ls;s++) {
|
||||
ss=sU*Ls+s;
|
||||
////////////////////////////////
|
||||
// Xp
|
||||
////////////////////////////////
|
||||
int ent=ss*8;// 2*Ndim
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns);
|
||||
XM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns);
|
||||
XM_RECON;
|
||||
|
||||
////////////////////////////////
|
||||
// Yp
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Zp
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Tp
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Xm
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Ym
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Zm
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZP_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Tm
|
||||
////////////////////////////////
|
||||
basea = (uint64_t)&out._odata[ss];
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_RECON_ACCUM;
|
||||
|
||||
SAVE_RESULT(&out._odata[ss],baseb);
|
||||
|
||||
}
|
||||
ssU++;
|
||||
}
|
||||
}
|
@ -1,187 +0,0 @@
|
||||
{
|
||||
int locala,perma, ptypea;
|
||||
int localb,permb, ptypeb;
|
||||
int localc,permc, ptypec;
|
||||
uint64_t basea, baseb, basec;
|
||||
uint64_t basex;
|
||||
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||
|
||||
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||
vComplexF *isigns = &signs[0];
|
||||
|
||||
MASK_REGS;
|
||||
|
||||
for(int site=0;site<Ns;site++) {
|
||||
int sU=lo.Reorder(ssU);
|
||||
|
||||
for(int s=0;s<Ls;s++) {
|
||||
ss =sU*Ls+s;
|
||||
|
||||
////////////////////////////////
|
||||
// Xp
|
||||
////////////////////////////////
|
||||
int ent=ss*8;// 2*Ndim
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basea);
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(baseb);
|
||||
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basec);
|
||||
|
||||
basex = basea;
|
||||
|
||||
label(FX(XP) );
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns);
|
||||
XM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns);
|
||||
XM_RECON;
|
||||
|
||||
////////////////////////////////
|
||||
// Yp
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basea);
|
||||
label(FX(YP) );
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYP(Yp,basec);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Zp
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(baseb);
|
||||
label(FX(ZP) );
|
||||
if ( localc ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_PROJMEM(basec);
|
||||
MAYBEPERM(PERMUTE_DIR1,permc);
|
||||
} else {
|
||||
LOAD_CHI(basec);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZP(Zp,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Tp
|
||||
////////////////////////////////
|
||||
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basec);
|
||||
label(FX(TP) );
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR0,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTP(Tp,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TM_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Xm
|
||||
////////////////////////////////
|
||||
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basea);
|
||||
label(FX(XM) );
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR3,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXM(Xm,basec);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
XP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Ym
|
||||
////////////////////////////////
|
||||
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(baseb);
|
||||
label(FX(YM) );
|
||||
if ( localc ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YP_PROJMEM(basec);
|
||||
MAYBEPERM(PERMUTE_DIR2,permc);
|
||||
} else {
|
||||
LOAD_CHI(basec);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
YP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Zm
|
||||
////////////////////////////////
|
||||
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
|
||||
PREFETCH_CHIMU(basec);
|
||||
label(FX(ZM) );
|
||||
if ( locala ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZP_PROJMEM(basea);
|
||||
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||
} else {
|
||||
LOAD_CHI(basea);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
ZP_RECON_ACCUM;
|
||||
|
||||
////////////////////////////////
|
||||
// Tm
|
||||
////////////////////////////////
|
||||
basea = (uint64_t)&out._odata[ss];
|
||||
PREFETCH_CHIMU(basea);
|
||||
label(FX(TM) );
|
||||
if ( localb ) {
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_PROJMEM(baseb);
|
||||
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||
} else {
|
||||
LOAD_CHI(baseb);
|
||||
}
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTM(Tm,basec);
|
||||
}
|
||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||
TP_RECON_ACCUM;
|
||||
|
||||
// PREFETCH_CHIMU(basex);
|
||||
label(FX(SAV) );
|
||||
SAVE_RESULT(&out._odata[ss]);
|
||||
|
||||
}
|
||||
ssU++;
|
||||
}
|
||||
}
|
@ -1,150 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
|
||||
#if defined(QPX)
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// If we are QPX specialise the single precision routine
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/IBM_qpx.h>
|
||||
#include <simd/IBM_qpx_single.h>
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
|
||||
#define COMPLEX_SIGNS(isigns)
|
||||
|
||||
#define INTERIOR_AND_EXTERIOR
|
||||
#undef INTERIOR
|
||||
#undef EXTERIOR
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, single
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// DP routines
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
#include <simd/IBM_qpx_double.h>
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT Vectorised, undag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// XYZT Vectorised, dag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
#define MAYBEPERM(A,B)
|
||||
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, undag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#undef KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeField &U, SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Ls vectorised, dag Kernel, double
|
||||
/////////////////////////////////////////////////////////////////
|
||||
#define KERNEL_DAG
|
||||
template<> void
|
||||
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
#undef MAYBEPERM
|
||||
#undef MULT_2SPIN
|
||||
|
||||
#endif
|
@ -1,378 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc
|
||||
|
||||
Copyright (C) 2018
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
// Gpu implementation; thread loop is implicit ; move to header
|
||||
//////////////////////////////////////////////////////////////
|
||||
accelerator_inline void synchronise(void)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
__syncthreads();
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
accelerator_inline int get_my_lanes(int Nsimd)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
return 1;
|
||||
#else
|
||||
return Nsimd;
|
||||
#endif
|
||||
}
|
||||
accelerator_inline int get_my_lane_offset(int Nsimd)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
return ( (threadIdx.x) % Nsimd);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
||||
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
|
||||
uint4 * chip_pun = (uint4 *)&chip;
|
||||
* chip_pun = * mem_pun;
|
||||
#else
|
||||
chip = *mem;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef GPU_VEC
|
||||
#if 1
|
||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||
if (SE._is_local) { \
|
||||
int mask = Nsimd >> (ptype + 1); \
|
||||
int plane= SE._permute ? (lane ^ mask) : lane; \
|
||||
auto in_l = extractLane(plane,in[SE._offset+s]); \
|
||||
spProj(chi,in_l); \
|
||||
} else { \
|
||||
chi = extractLane(lane,buf[SE._offset+s]); \
|
||||
} \
|
||||
synchronise();
|
||||
#else
|
||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||
{ int mask = Nsimd >> (ptype + 1); \
|
||||
int plane= SE._permute ? (lane ^ mask) : lane; \
|
||||
auto in_l = extractLane(plane,in[SE._offset+s]); \
|
||||
spProj(chi,in_l); }
|
||||
#endif
|
||||
#else
|
||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||
if (SE._is_local) { \
|
||||
auto in_t = in[SE._offset+s]; \
|
||||
if (SE._permute) { \
|
||||
spProj(tmp, in_t); \
|
||||
permute(chi, tmp, ptype); \
|
||||
} else { \
|
||||
spProj(chi, in_t); \
|
||||
} \
|
||||
} else { \
|
||||
chi = buf[SE._offset+s]; \
|
||||
} \
|
||||
synchronise();
|
||||
#endif
|
||||
|
||||
template <class Impl>
|
||||
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||
SiteHalfSpinor *buf, int Ls, int s,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifdef GPU_VEC
|
||||
typename SiteHalfSpinor::scalar_object chi;
|
||||
typename SiteHalfSpinor::scalar_object Uchi;
|
||||
typename SiteSpinor::scalar_object result;
|
||||
#else
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteHalfSpinor tmp;
|
||||
SiteSpinor result;
|
||||
#endif
|
||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
||||
typedef typename SiteSpinor::vector_type vector_type;
|
||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
||||
|
||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
||||
uint64_t lanes = get_my_lanes(Nsimd);
|
||||
|
||||
StencilEntry *SE_mem;
|
||||
StencilEntry SE;
|
||||
|
||||
int ptype;
|
||||
uint64_t ssF = Ls * sU;
|
||||
uint64_t sF = ssF + s;
|
||||
#ifndef __CUDA_ARCH__
|
||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
||||
#else
|
||||
int lane = lane_offset; {
|
||||
#endif
|
||||
SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
|
||||
spReconXp(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
|
||||
accumReconYp(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
|
||||
accumReconZp(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
|
||||
accumReconTp(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
|
||||
accumReconXm(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
|
||||
accumReconYm(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
|
||||
accumReconZm(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
|
||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
|
||||
accumReconTm(result, Uchi);
|
||||
|
||||
#ifdef GPU_VEC
|
||||
insertLane (lane,out[sF],result);
|
||||
#else
|
||||
vstream(out[sF], result);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
|
||||
SiteHalfSpinor *buf, int Ls, int s,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifdef GPU_VEC
|
||||
typename SiteHalfSpinor::scalar_object chi;
|
||||
typename SiteHalfSpinor::scalar_object Uchi;
|
||||
typename SiteSpinor::scalar_object result;
|
||||
#else
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteHalfSpinor tmp;
|
||||
SiteSpinor result;
|
||||
#endif
|
||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
||||
typedef typename SiteSpinor::vector_type vector_type;
|
||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
||||
|
||||
uint64_t lane_offset= get_my_lane_offset(Nsimd);
|
||||
uint64_t lanes = get_my_lanes(Nsimd);
|
||||
|
||||
// printf (" sU %d s %d Nsimd %d lanes %ld lane_off %ld\n",sU, s, Nsimd, lanes, lane_offset);
|
||||
|
||||
StencilEntry *SE_mem;
|
||||
StencilEntry SE;
|
||||
int ptype;
|
||||
// Forces some degree of coalesce on the table look ups
|
||||
// Could also use wide load instructions on the data structure
|
||||
uint64_t ssF = Ls * sU;
|
||||
uint64_t sF = ssF + s;
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
|
||||
#else
|
||||
int lane = lane_offset; {
|
||||
#endif
|
||||
SE_mem = st.GetEntry(ptype, Xp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
|
||||
spReconXm(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Yp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
|
||||
accumReconYm(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Zp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
|
||||
accumReconZm(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Tp, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
|
||||
accumReconTm(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Xm, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
|
||||
accumReconXp(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Ym, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
|
||||
accumReconYp(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Zm, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
|
||||
accumReconZp(result, Uchi);
|
||||
|
||||
SE_mem = st.GetEntry(ptype, Tm, ssF); get_stencil(SE_mem,SE);
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
||||
accumReconTp(result, Uchi);
|
||||
|
||||
#ifdef GPU_VEC
|
||||
insertLane (lane,out[sF],result);
|
||||
#else
|
||||
vstream(out[sF], result);
|
||||
#endif
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// Template specialise Gparity to empty for now
|
||||
#define GPU_EMPTY(A) \
|
||||
template <> \
|
||||
accelerator_inline void \
|
||||
WilsonKernels<A>::GpuDhopSite(StencilView &st, \
|
||||
SiteDoubledGaugeField &U, \
|
||||
SiteHalfSpinor *buf, int Ls, int sF, \
|
||||
int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out) { assert(0);}; \
|
||||
template <> \
|
||||
accelerator_inline void \
|
||||
WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \
|
||||
DoubledGaugeFieldView &U, \
|
||||
SiteHalfSpinor *buf, int Ls,int sF, \
|
||||
int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out) { assert(0);};
|
||||
|
||||
GPU_EMPTY(GparityWilsonImplF);
|
||||
GPU_EMPTY(GparityWilsonImplFH);
|
||||
GPU_EMPTY(GparityWilsonImplD);
|
||||
GPU_EMPTY(GparityWilsonImplDF);
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
||||
const uint64_t nsimd = Simd::Nsimd();
|
||||
const uint64_t NN = Nsite*Ls*nsimd;
|
||||
accelerator_loopN( sss, NN, {
|
||||
uint64_t cur = sss;
|
||||
// uint64_t lane = cur % nsimd;
|
||||
cur = cur / nsimd;
|
||||
uint64_t s = cur%Ls;
|
||||
// uint64_t sF = cur;
|
||||
cur = cur / Ls;
|
||||
uint64_t sU = cur;
|
||||
WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
|
||||
});
|
||||
} else {
|
||||
accelerator_loop( ss, U_v, {
|
||||
int sU = ss;
|
||||
int sF = Ls * sU;
|
||||
WilsonKernels<Impl>::DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
||||
});
|
||||
}
|
||||
}
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior,int exterior)
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
|
||||
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
||||
const uint64_t nsimd = Simd::Nsimd();
|
||||
const uint64_t NN = Nsite*Ls*nsimd;
|
||||
accelerator_loopN( sss, NN, {
|
||||
uint64_t cur = sss;
|
||||
// uint64_t lane = cur % nsimd;
|
||||
cur = cur / nsimd;
|
||||
uint64_t s = cur%Ls;
|
||||
//uint64_t sF = cur;
|
||||
cur = cur / Ls;
|
||||
uint64_t sU = cur;
|
||||
WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
|
||||
});
|
||||
} else {
|
||||
accelerator_loop( ss, U_v, {
|
||||
int sU = ss;
|
||||
int sF = Ls * sU;
|
||||
WilsonKernels<Impl>::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
GPU_EMPTY(DomainWallVec5dImplF);
|
||||
GPU_EMPTY(DomainWallVec5dImplFH);
|
||||
GPU_EMPTY(DomainWallVec5dImplD);
|
||||
GPU_EMPTY(DomainWallVec5dImplDF);
|
||||
GPU_EMPTY(ZDomainWallVec5dImplF);
|
||||
GPU_EMPTY(ZDomainWallVec5dImplFH);
|
||||
GPU_EMPTY(ZDomainWallVec5dImplD);
|
||||
GPU_EMPTY(ZDomainWallVec5dImplDF);
|
||||
*/
|
||||
|
||||
FermOpTemplateInstantiate(WilsonKernels);
|
||||
AdjointFermOpTemplateInstantiate(WilsonKernels);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -1,654 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
#define REGISTER
|
||||
|
||||
#define LOAD_CHIMU \
|
||||
{const SiteSpinor & ref (in[offset]); \
|
||||
Chimu_00=ref()(0)(0);\
|
||||
Chimu_01=ref()(0)(1);\
|
||||
Chimu_02=ref()(0)(2);\
|
||||
Chimu_10=ref()(1)(0);\
|
||||
Chimu_11=ref()(1)(1);\
|
||||
Chimu_12=ref()(1)(2);\
|
||||
Chimu_20=ref()(2)(0);\
|
||||
Chimu_21=ref()(2)(1);\
|
||||
Chimu_22=ref()(2)(2);\
|
||||
Chimu_30=ref()(3)(0);\
|
||||
Chimu_31=ref()(3)(1);\
|
||||
Chimu_32=ref()(3)(2);}
|
||||
|
||||
#define LOAD_CHI\
|
||||
{const SiteHalfSpinor &ref(buf[offset]); \
|
||||
Chi_00 = ref()(0)(0);\
|
||||
Chi_01 = ref()(0)(1);\
|
||||
Chi_02 = ref()(0)(2);\
|
||||
Chi_10 = ref()(1)(0);\
|
||||
Chi_11 = ref()(1)(1);\
|
||||
Chi_12 = ref()(1)(2);}
|
||||
|
||||
// To splat or not to splat depends on the implementation
|
||||
#define MULT_2SPIN(A)\
|
||||
{auto & ref(U[sU](A)); \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
UChi_00 = U_00*Chi_00;\
|
||||
UChi_10 = U_00*Chi_10;\
|
||||
UChi_01 = U_10*Chi_00;\
|
||||
UChi_11 = U_10*Chi_10;\
|
||||
UChi_02 = U_20*Chi_00;\
|
||||
UChi_12 = U_20*Chi_10;\
|
||||
UChi_00+= U_01*Chi_01;\
|
||||
UChi_10+= U_01*Chi_11;\
|
||||
UChi_01+= U_11*Chi_01;\
|
||||
UChi_11+= U_11*Chi_11;\
|
||||
UChi_02+= U_21*Chi_01;\
|
||||
UChi_12+= U_21*Chi_11;\
|
||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||
UChi_00+= U_00*Chi_02;\
|
||||
UChi_10+= U_00*Chi_12;\
|
||||
UChi_01+= U_10*Chi_02;\
|
||||
UChi_11+= U_10*Chi_12;\
|
||||
UChi_02+= U_20*Chi_02;\
|
||||
UChi_12+= U_20*Chi_12;}
|
||||
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_00,Chi_00);\
|
||||
permute##dir(Chi_01,Chi_01);\
|
||||
permute##dir(Chi_02,Chi_02);\
|
||||
permute##dir(Chi_10,Chi_10);\
|
||||
permute##dir(Chi_11,Chi_11);\
|
||||
permute##dir(Chi_12,Chi_12);
|
||||
|
||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
#define XP_PROJ \
|
||||
Chi_00 = Chimu_00+timesI(Chimu_30);\
|
||||
Chi_01 = Chimu_01+timesI(Chimu_31);\
|
||||
Chi_02 = Chimu_02+timesI(Chimu_32);\
|
||||
Chi_10 = Chimu_10+timesI(Chimu_20);\
|
||||
Chi_11 = Chimu_11+timesI(Chimu_21);\
|
||||
Chi_12 = Chimu_12+timesI(Chimu_22);
|
||||
|
||||
#define YP_PROJ \
|
||||
Chi_00 = Chimu_00-Chimu_30;\
|
||||
Chi_01 = Chimu_01-Chimu_31;\
|
||||
Chi_02 = Chimu_02-Chimu_32;\
|
||||
Chi_10 = Chimu_10+Chimu_20;\
|
||||
Chi_11 = Chimu_11+Chimu_21;\
|
||||
Chi_12 = Chimu_12+Chimu_22;
|
||||
|
||||
#define ZP_PROJ \
|
||||
Chi_00 = Chimu_00+timesI(Chimu_20); \
|
||||
Chi_01 = Chimu_01+timesI(Chimu_21); \
|
||||
Chi_02 = Chimu_02+timesI(Chimu_22); \
|
||||
Chi_10 = Chimu_10-timesI(Chimu_30); \
|
||||
Chi_11 = Chimu_11-timesI(Chimu_31); \
|
||||
Chi_12 = Chimu_12-timesI(Chimu_32);
|
||||
|
||||
#define TP_PROJ \
|
||||
Chi_00 = Chimu_00+Chimu_20; \
|
||||
Chi_01 = Chimu_01+Chimu_21; \
|
||||
Chi_02 = Chimu_02+Chimu_22; \
|
||||
Chi_10 = Chimu_10+Chimu_30; \
|
||||
Chi_11 = Chimu_11+Chimu_31; \
|
||||
Chi_12 = Chimu_12+Chimu_32;
|
||||
|
||||
|
||||
// hspin(0)=fspin(0)-timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)-timesI(fspin(2));
|
||||
#define XM_PROJ \
|
||||
Chi_00 = Chimu_00-timesI(Chimu_30);\
|
||||
Chi_01 = Chimu_01-timesI(Chimu_31);\
|
||||
Chi_02 = Chimu_02-timesI(Chimu_32);\
|
||||
Chi_10 = Chimu_10-timesI(Chimu_20);\
|
||||
Chi_11 = Chimu_11-timesI(Chimu_21);\
|
||||
Chi_12 = Chimu_12-timesI(Chimu_22);
|
||||
|
||||
#define YM_PROJ \
|
||||
Chi_00 = Chimu_00+Chimu_30;\
|
||||
Chi_01 = Chimu_01+Chimu_31;\
|
||||
Chi_02 = Chimu_02+Chimu_32;\
|
||||
Chi_10 = Chimu_10-Chimu_20;\
|
||||
Chi_11 = Chimu_11-Chimu_21;\
|
||||
Chi_12 = Chimu_12-Chimu_22;
|
||||
|
||||
#define ZM_PROJ \
|
||||
Chi_00 = Chimu_00-timesI(Chimu_20); \
|
||||
Chi_01 = Chimu_01-timesI(Chimu_21); \
|
||||
Chi_02 = Chimu_02-timesI(Chimu_22); \
|
||||
Chi_10 = Chimu_10+timesI(Chimu_30); \
|
||||
Chi_11 = Chimu_11+timesI(Chimu_31); \
|
||||
Chi_12 = Chimu_12+timesI(Chimu_32);
|
||||
|
||||
#define TM_PROJ \
|
||||
Chi_00 = Chimu_00-Chimu_20; \
|
||||
Chi_01 = Chimu_01-Chimu_21; \
|
||||
Chi_02 = Chimu_02-Chimu_22; \
|
||||
Chi_10 = Chimu_10-Chimu_30; \
|
||||
Chi_11 = Chimu_11-Chimu_31; \
|
||||
Chi_12 = Chimu_12-Chimu_32;
|
||||
|
||||
// fspin(0)=hspin(0);
|
||||
// fspin(1)=hspin(1);
|
||||
// fspin(2)=timesMinusI(hspin(1));
|
||||
// fspin(3)=timesMinusI(hspin(0));
|
||||
#define XP_RECON\
|
||||
result_00 = UChi_00;\
|
||||
result_01 = UChi_01;\
|
||||
result_02 = UChi_02;\
|
||||
result_10 = UChi_10;\
|
||||
result_11 = UChi_11;\
|
||||
result_12 = UChi_12;\
|
||||
result_20 = timesMinusI(UChi_10);\
|
||||
result_21 = timesMinusI(UChi_11);\
|
||||
result_22 = timesMinusI(UChi_12);\
|
||||
result_30 = timesMinusI(UChi_00);\
|
||||
result_31 = timesMinusI(UChi_01);\
|
||||
result_32 = timesMinusI(UChi_02);
|
||||
|
||||
#define XP_RECON_ACCUM\
|
||||
result_00+=UChi_00;\
|
||||
result_01+=UChi_01;\
|
||||
result_02+=UChi_02;\
|
||||
result_10+=UChi_10;\
|
||||
result_11+=UChi_11;\
|
||||
result_12+=UChi_12;\
|
||||
result_20-=timesI(UChi_10);\
|
||||
result_21-=timesI(UChi_11);\
|
||||
result_22-=timesI(UChi_12);\
|
||||
result_30-=timesI(UChi_00);\
|
||||
result_31-=timesI(UChi_01);\
|
||||
result_32-=timesI(UChi_02);
|
||||
|
||||
#define XM_RECON\
|
||||
result_00 = UChi_00;\
|
||||
result_01 = UChi_01;\
|
||||
result_02 = UChi_02;\
|
||||
result_10 = UChi_10;\
|
||||
result_11 = UChi_11;\
|
||||
result_12 = UChi_12;\
|
||||
result_20 = timesI(UChi_10);\
|
||||
result_21 = timesI(UChi_11);\
|
||||
result_22 = timesI(UChi_12);\
|
||||
result_30 = timesI(UChi_00);\
|
||||
result_31 = timesI(UChi_01);\
|
||||
result_32 = timesI(UChi_02);
|
||||
|
||||
#define XM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= timesI(UChi_10);\
|
||||
result_21+= timesI(UChi_11);\
|
||||
result_22+= timesI(UChi_12);\
|
||||
result_30+= timesI(UChi_00);\
|
||||
result_31+= timesI(UChi_01);\
|
||||
result_32+= timesI(UChi_02);
|
||||
|
||||
#define YP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= UChi_10;\
|
||||
result_21+= UChi_11;\
|
||||
result_22+= UChi_12;\
|
||||
result_30-= UChi_00;\
|
||||
result_31-= UChi_01;\
|
||||
result_32-= UChi_02;
|
||||
|
||||
#define YM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= UChi_10;\
|
||||
result_21-= UChi_11;\
|
||||
result_22-= UChi_12;\
|
||||
result_30+= UChi_00;\
|
||||
result_31+= UChi_01;\
|
||||
result_32+= UChi_02;
|
||||
|
||||
#define ZP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= timesI(UChi_00); \
|
||||
result_21-= timesI(UChi_01); \
|
||||
result_22-= timesI(UChi_02); \
|
||||
result_30+= timesI(UChi_10); \
|
||||
result_31+= timesI(UChi_11); \
|
||||
result_32+= timesI(UChi_12);
|
||||
|
||||
#define ZM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= timesI(UChi_00); \
|
||||
result_21+= timesI(UChi_01); \
|
||||
result_22+= timesI(UChi_02); \
|
||||
result_30-= timesI(UChi_10); \
|
||||
result_31-= timesI(UChi_11); \
|
||||
result_32-= timesI(UChi_12);
|
||||
|
||||
#define TP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= UChi_00; \
|
||||
result_21+= UChi_01; \
|
||||
result_22+= UChi_02; \
|
||||
result_30+= UChi_10; \
|
||||
result_31+= UChi_11; \
|
||||
result_32+= UChi_12;
|
||||
|
||||
#define TM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= UChi_00; \
|
||||
result_21-= UChi_01; \
|
||||
result_22-= UChi_02; \
|
||||
result_30-= UChi_10; \
|
||||
result_31-= UChi_11; \
|
||||
result_32-= UChi_12;
|
||||
|
||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU; \
|
||||
PROJ; \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(PERM); \
|
||||
} \
|
||||
} else { \
|
||||
LOAD_CHI; \
|
||||
} \
|
||||
MULT_2SPIN(DIR); \
|
||||
RECON;
|
||||
|
||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU; \
|
||||
PROJ; \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(PERM); \
|
||||
} \
|
||||
} else if ( st.same_node[DIR] ) { \
|
||||
LOAD_CHI; \
|
||||
} \
|
||||
if (local || st.same_node[DIR] ) { \
|
||||
MULT_2SPIN(DIR); \
|
||||
RECON; \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
||||
LOAD_CHI; \
|
||||
MULT_2SPIN(DIR); \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define HAND_RESULT(ss) \
|
||||
{ \
|
||||
SiteSpinor & ref (out[ss]); \
|
||||
vstream(ref()(0)(0),result_00); \
|
||||
vstream(ref()(0)(1),result_01); \
|
||||
vstream(ref()(0)(2),result_02); \
|
||||
vstream(ref()(1)(0),result_10); \
|
||||
vstream(ref()(1)(1),result_11); \
|
||||
vstream(ref()(1)(2),result_12); \
|
||||
vstream(ref()(2)(0),result_20); \
|
||||
vstream(ref()(2)(1),result_21); \
|
||||
vstream(ref()(2)(2),result_22); \
|
||||
vstream(ref()(3)(0),result_30); \
|
||||
vstream(ref()(3)(1),result_31); \
|
||||
vstream(ref()(3)(2),result_32); \
|
||||
}
|
||||
|
||||
#define HAND_RESULT_EXT(ss) \
|
||||
if (nmu){ \
|
||||
SiteSpinor & ref (out[ss]); \
|
||||
ref()(0)(0)+=result_00; \
|
||||
ref()(0)(1)+=result_01; \
|
||||
ref()(0)(2)+=result_02; \
|
||||
ref()(1)(0)+=result_10; \
|
||||
ref()(1)(1)+=result_11; \
|
||||
ref()(1)(2)+=result_12; \
|
||||
ref()(2)(0)+=result_20; \
|
||||
ref()(2)(1)+=result_21; \
|
||||
ref()(2)(2)+=result_22; \
|
||||
ref()(3)(0)+=result_30; \
|
||||
ref()(3)(1)+=result_31; \
|
||||
ref()(3)(2)+=result_32; \
|
||||
}
|
||||
|
||||
|
||||
#define HAND_DECLARATIONS(a) \
|
||||
Simd result_00; \
|
||||
Simd result_01; \
|
||||
Simd result_02; \
|
||||
Simd result_10; \
|
||||
Simd result_11; \
|
||||
Simd result_12; \
|
||||
Simd result_20; \
|
||||
Simd result_21; \
|
||||
Simd result_22; \
|
||||
Simd result_30; \
|
||||
Simd result_31; \
|
||||
Simd result_32; \
|
||||
Simd Chi_00; \
|
||||
Simd Chi_01; \
|
||||
Simd Chi_02; \
|
||||
Simd Chi_10; \
|
||||
Simd Chi_11; \
|
||||
Simd Chi_12; \
|
||||
Simd UChi_00; \
|
||||
Simd UChi_01; \
|
||||
Simd UChi_02; \
|
||||
Simd UChi_10; \
|
||||
Simd UChi_11; \
|
||||
Simd UChi_12; \
|
||||
Simd U_00; \
|
||||
Simd U_10; \
|
||||
Simd U_20; \
|
||||
Simd U_01; \
|
||||
Simd U_11; \
|
||||
Simd U_21;
|
||||
|
||||
#define ZERO_RESULT \
|
||||
result_00=Zero(); \
|
||||
result_01=Zero(); \
|
||||
result_02=Zero(); \
|
||||
result_10=Zero(); \
|
||||
result_11=Zero(); \
|
||||
result_12=Zero(); \
|
||||
result_20=Zero(); \
|
||||
result_21=Zero(); \
|
||||
result_22=Zero(); \
|
||||
result_30=Zero(); \
|
||||
result_31=Zero(); \
|
||||
result_32=Zero();
|
||||
|
||||
#define Chimu_00 Chi_00
|
||||
#define Chimu_01 Chi_01
|
||||
#define Chimu_02 Chi_02
|
||||
#define Chimu_10 Chi_10
|
||||
#define Chimu_11 Chi_11
|
||||
#define Chimu_12 Chi_12
|
||||
#define Chimu_20 UChi_00
|
||||
#define Chimu_21 UChi_01
|
||||
#define Chimu_22 UChi_02
|
||||
#define Chimu_30 UChi_10
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset,local,perm, ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
|
||||
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
|
||||
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset,local,perm, ptype;
|
||||
StencilEntry *SE;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||
HAND_RESULT(ss);
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl> void
|
||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset,local,perm, ptype;
|
||||
StencilEntry *SE;
|
||||
int nmu=0;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||
HAND_RESULT_EXT(ss);
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
int nmu=0;
|
||||
ZERO_RESULT;
|
||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||
HAND_RESULT_EXT(ss);
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////// Wilson ; uses this implementation /////////////////////
|
||||
|
||||
#define INSTANTIATE_THEM(A) \
|
||||
template void WilsonKernels<A>::HandDhopSite(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out);\
|
||||
template void WilsonKernels<A>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out);
|
||||
|
||||
INSTANTIATE_THEM(WilsonImplF);
|
||||
INSTANTIATE_THEM(WilsonImplD);
|
||||
INSTANTIATE_THEM(ZWilsonImplF);
|
||||
INSTANTIATE_THEM(ZWilsonImplD);
|
||||
INSTANTIATE_THEM(DomainWallVec5dImplF);
|
||||
INSTANTIATE_THEM(DomainWallVec5dImplD);
|
||||
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
|
||||
INSTANTIATE_THEM(ZDomainWallVec5dImplD);
|
||||
INSTANTIATE_THEM(WilsonImplFH);
|
||||
INSTANTIATE_THEM(WilsonImplDF);
|
||||
INSTANTIATE_THEM(ZWilsonImplFH);
|
||||
INSTANTIATE_THEM(ZWilsonImplDF);
|
||||
INSTANTIATE_THEM(DomainWallVec5dImplFH);
|
||||
INSTANTIATE_THEM(DomainWallVec5dImplDF);
|
||||
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
|
||||
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
|
||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
|
||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,943 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
|
||||
#define REGISTER
|
||||
|
||||
#define LOAD_CHIMU_BODY(F) \
|
||||
Chimu_00=ref(F)(0)(0); \
|
||||
Chimu_01=ref(F)(0)(1); \
|
||||
Chimu_02=ref(F)(0)(2); \
|
||||
Chimu_10=ref(F)(1)(0); \
|
||||
Chimu_11=ref(F)(1)(1); \
|
||||
Chimu_12=ref(F)(1)(2); \
|
||||
Chimu_20=ref(F)(2)(0); \
|
||||
Chimu_21=ref(F)(2)(1); \
|
||||
Chimu_22=ref(F)(2)(2); \
|
||||
Chimu_30=ref(F)(3)(0); \
|
||||
Chimu_31=ref(F)(3)(1); \
|
||||
Chimu_32=ref(F)(3)(2)
|
||||
|
||||
#define LOAD_CHIMU(DIR,F,PERM) \
|
||||
{ const SiteSpinor & ref (in[offset]); LOAD_CHIMU_BODY(F); }
|
||||
|
||||
#define LOAD_CHI_BODY(F) \
|
||||
Chi_00 = ref(F)(0)(0);\
|
||||
Chi_01 = ref(F)(0)(1);\
|
||||
Chi_02 = ref(F)(0)(2);\
|
||||
Chi_10 = ref(F)(1)(0);\
|
||||
Chi_11 = ref(F)(1)(1);\
|
||||
Chi_12 = ref(F)(1)(2)
|
||||
|
||||
#define LOAD_CHI(DIR,F,PERM) \
|
||||
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
|
||||
|
||||
|
||||
//G-parity implementations using in-place intrinsic ops
|
||||
|
||||
//1l 1h -> 1h 1l
|
||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
||||
//0h,1l -> 1l,0h
|
||||
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
|
||||
//Pulled fermion through forwards face, GPBC on upper component
|
||||
//Need 0= 0l 1h 1= 1l 0h
|
||||
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
|
||||
//Pulled fermion through backwards face, GPBC on lower component
|
||||
//Need 0= 1l 0h 1= 0l 1h
|
||||
|
||||
//1l 1h -> 1h 1l
|
||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
||||
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
||||
permute##PERM(tmp1, ref(1)(S)(C)); \
|
||||
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
|
||||
INTO = tmp2;
|
||||
|
||||
//0l 0h -> 0h 0l
|
||||
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
|
||||
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
||||
permute##PERM(tmp1, ref(0)(S)(C)); \
|
||||
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
|
||||
INTO = tmp2;
|
||||
|
||||
|
||||
|
||||
|
||||
#define LOAD_CHI_SETUP(DIR,F) \
|
||||
g = F; \
|
||||
direction = st._directions[DIR]; \
|
||||
distance = st._distances[DIR]; \
|
||||
sl = st._simd_layout[direction]; \
|
||||
inplace_twist = 0; \
|
||||
if(SE->_around_the_world && st.parameters.twists[DIR % 4]){ \
|
||||
if(sl == 1){ \
|
||||
g = (F+1) % 2; \
|
||||
}else{ \
|
||||
inplace_twist = 1; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
||||
{ const SiteSpinor &ref(in[offset]); \
|
||||
LOAD_CHI_SETUP(DIR,F); \
|
||||
if(!inplace_twist){ \
|
||||
LOAD_CHIMU_BODY(g); \
|
||||
}else{ \
|
||||
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
||||
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
||||
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
||||
}else{ \
|
||||
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
||||
{ const SiteHalfSpinor &ref(buf[offset]); \
|
||||
LOAD_CHI_SETUP(DIR,F); \
|
||||
if(!inplace_twist){ \
|
||||
LOAD_CHI_BODY(g); \
|
||||
}else{ \
|
||||
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
||||
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
||||
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
||||
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
||||
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||
}else{ \
|
||||
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
||||
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
||||
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
||||
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
||||
|
||||
// To splat or not to splat depends on the implementation
|
||||
#define MULT_2SPIN_BODY \
|
||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||
UChi_00 = U_00*Chi_00; \
|
||||
UChi_10 = U_00*Chi_10; \
|
||||
UChi_01 = U_10*Chi_00; \
|
||||
UChi_11 = U_10*Chi_10; \
|
||||
UChi_02 = U_20*Chi_00; \
|
||||
UChi_12 = U_20*Chi_10; \
|
||||
UChi_00+= U_01*Chi_01; \
|
||||
UChi_10+= U_01*Chi_11; \
|
||||
UChi_01+= U_11*Chi_01; \
|
||||
UChi_11+= U_11*Chi_11; \
|
||||
UChi_02+= U_21*Chi_01; \
|
||||
UChi_12+= U_21*Chi_11; \
|
||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||
UChi_00+= U_00*Chi_02; \
|
||||
UChi_10+= U_00*Chi_12; \
|
||||
UChi_01+= U_10*Chi_02; \
|
||||
UChi_11+= U_10*Chi_12; \
|
||||
UChi_02+= U_20*Chi_02; \
|
||||
UChi_12+= U_20*Chi_12
|
||||
|
||||
|
||||
#define MULT_2SPIN(A,F) \
|
||||
{auto & ref(U[sU](A)); MULT_2SPIN_BODY; }
|
||||
|
||||
#define MULT_2SPIN_GPARITY(A,F) \
|
||||
{auto & ref(U[sU](F)(A)); MULT_2SPIN_BODY; }
|
||||
|
||||
|
||||
#define PERMUTE_DIR(dir) \
|
||||
permute##dir(Chi_00,Chi_00);\
|
||||
permute##dir(Chi_01,Chi_01);\
|
||||
permute##dir(Chi_02,Chi_02);\
|
||||
permute##dir(Chi_10,Chi_10);\
|
||||
permute##dir(Chi_11,Chi_11);\
|
||||
permute##dir(Chi_12,Chi_12);
|
||||
|
||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
#define XP_PROJ \
|
||||
Chi_00 = Chimu_00+timesI(Chimu_30);\
|
||||
Chi_01 = Chimu_01+timesI(Chimu_31);\
|
||||
Chi_02 = Chimu_02+timesI(Chimu_32);\
|
||||
Chi_10 = Chimu_10+timesI(Chimu_20);\
|
||||
Chi_11 = Chimu_11+timesI(Chimu_21);\
|
||||
Chi_12 = Chimu_12+timesI(Chimu_22);
|
||||
|
||||
#define YP_PROJ \
|
||||
Chi_00 = Chimu_00-Chimu_30;\
|
||||
Chi_01 = Chimu_01-Chimu_31;\
|
||||
Chi_02 = Chimu_02-Chimu_32;\
|
||||
Chi_10 = Chimu_10+Chimu_20;\
|
||||
Chi_11 = Chimu_11+Chimu_21;\
|
||||
Chi_12 = Chimu_12+Chimu_22;
|
||||
|
||||
#define ZP_PROJ \
|
||||
Chi_00 = Chimu_00+timesI(Chimu_20); \
|
||||
Chi_01 = Chimu_01+timesI(Chimu_21); \
|
||||
Chi_02 = Chimu_02+timesI(Chimu_22); \
|
||||
Chi_10 = Chimu_10-timesI(Chimu_30); \
|
||||
Chi_11 = Chimu_11-timesI(Chimu_31); \
|
||||
Chi_12 = Chimu_12-timesI(Chimu_32);
|
||||
|
||||
#define TP_PROJ \
|
||||
Chi_00 = Chimu_00+Chimu_20; \
|
||||
Chi_01 = Chimu_01+Chimu_21; \
|
||||
Chi_02 = Chimu_02+Chimu_22; \
|
||||
Chi_10 = Chimu_10+Chimu_30; \
|
||||
Chi_11 = Chimu_11+Chimu_31; \
|
||||
Chi_12 = Chimu_12+Chimu_32;
|
||||
|
||||
|
||||
// hspin(0)=fspin(0)-timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)-timesI(fspin(2));
|
||||
#define XM_PROJ \
|
||||
Chi_00 = Chimu_00-timesI(Chimu_30);\
|
||||
Chi_01 = Chimu_01-timesI(Chimu_31);\
|
||||
Chi_02 = Chimu_02-timesI(Chimu_32);\
|
||||
Chi_10 = Chimu_10-timesI(Chimu_20);\
|
||||
Chi_11 = Chimu_11-timesI(Chimu_21);\
|
||||
Chi_12 = Chimu_12-timesI(Chimu_22);
|
||||
|
||||
#define YM_PROJ \
|
||||
Chi_00 = Chimu_00+Chimu_30;\
|
||||
Chi_01 = Chimu_01+Chimu_31;\
|
||||
Chi_02 = Chimu_02+Chimu_32;\
|
||||
Chi_10 = Chimu_10-Chimu_20;\
|
||||
Chi_11 = Chimu_11-Chimu_21;\
|
||||
Chi_12 = Chimu_12-Chimu_22;
|
||||
|
||||
#define ZM_PROJ \
|
||||
Chi_00 = Chimu_00-timesI(Chimu_20); \
|
||||
Chi_01 = Chimu_01-timesI(Chimu_21); \
|
||||
Chi_02 = Chimu_02-timesI(Chimu_22); \
|
||||
Chi_10 = Chimu_10+timesI(Chimu_30); \
|
||||
Chi_11 = Chimu_11+timesI(Chimu_31); \
|
||||
Chi_12 = Chimu_12+timesI(Chimu_32);
|
||||
|
||||
#define TM_PROJ \
|
||||
Chi_00 = Chimu_00-Chimu_20; \
|
||||
Chi_01 = Chimu_01-Chimu_21; \
|
||||
Chi_02 = Chimu_02-Chimu_22; \
|
||||
Chi_10 = Chimu_10-Chimu_30; \
|
||||
Chi_11 = Chimu_11-Chimu_31; \
|
||||
Chi_12 = Chimu_12-Chimu_32;
|
||||
|
||||
// fspin(0)=hspin(0);
|
||||
// fspin(1)=hspin(1);
|
||||
// fspin(2)=timesMinusI(hspin(1));
|
||||
// fspin(3)=timesMinusI(hspin(0));
|
||||
#define XP_RECON\
|
||||
result_00 = UChi_00;\
|
||||
result_01 = UChi_01;\
|
||||
result_02 = UChi_02;\
|
||||
result_10 = UChi_10;\
|
||||
result_11 = UChi_11;\
|
||||
result_12 = UChi_12;\
|
||||
result_20 = timesMinusI(UChi_10);\
|
||||
result_21 = timesMinusI(UChi_11);\
|
||||
result_22 = timesMinusI(UChi_12);\
|
||||
result_30 = timesMinusI(UChi_00);\
|
||||
result_31 = timesMinusI(UChi_01);\
|
||||
result_32 = timesMinusI(UChi_02);
|
||||
|
||||
#define XP_RECON_ACCUM\
|
||||
result_00+=UChi_00;\
|
||||
result_01+=UChi_01;\
|
||||
result_02+=UChi_02;\
|
||||
result_10+=UChi_10;\
|
||||
result_11+=UChi_11;\
|
||||
result_12+=UChi_12;\
|
||||
result_20-=timesI(UChi_10);\
|
||||
result_21-=timesI(UChi_11);\
|
||||
result_22-=timesI(UChi_12);\
|
||||
result_30-=timesI(UChi_00);\
|
||||
result_31-=timesI(UChi_01);\
|
||||
result_32-=timesI(UChi_02);
|
||||
|
||||
#define XM_RECON\
|
||||
result_00 = UChi_00;\
|
||||
result_01 = UChi_01;\
|
||||
result_02 = UChi_02;\
|
||||
result_10 = UChi_10;\
|
||||
result_11 = UChi_11;\
|
||||
result_12 = UChi_12;\
|
||||
result_20 = timesI(UChi_10);\
|
||||
result_21 = timesI(UChi_11);\
|
||||
result_22 = timesI(UChi_12);\
|
||||
result_30 = timesI(UChi_00);\
|
||||
result_31 = timesI(UChi_01);\
|
||||
result_32 = timesI(UChi_02);
|
||||
|
||||
#define XM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= timesI(UChi_10);\
|
||||
result_21+= timesI(UChi_11);\
|
||||
result_22+= timesI(UChi_12);\
|
||||
result_30+= timesI(UChi_00);\
|
||||
result_31+= timesI(UChi_01);\
|
||||
result_32+= timesI(UChi_02);
|
||||
|
||||
#define YP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= UChi_10;\
|
||||
result_21+= UChi_11;\
|
||||
result_22+= UChi_12;\
|
||||
result_30-= UChi_00;\
|
||||
result_31-= UChi_01;\
|
||||
result_32-= UChi_02;
|
||||
|
||||
#define YM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= UChi_10;\
|
||||
result_21-= UChi_11;\
|
||||
result_22-= UChi_12;\
|
||||
result_30+= UChi_00;\
|
||||
result_31+= UChi_01;\
|
||||
result_32+= UChi_02;
|
||||
|
||||
#define ZP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= timesI(UChi_00); \
|
||||
result_21-= timesI(UChi_01); \
|
||||
result_22-= timesI(UChi_02); \
|
||||
result_30+= timesI(UChi_10); \
|
||||
result_31+= timesI(UChi_11); \
|
||||
result_32+= timesI(UChi_12);
|
||||
|
||||
#define ZM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= timesI(UChi_00); \
|
||||
result_21+= timesI(UChi_01); \
|
||||
result_22+= timesI(UChi_02); \
|
||||
result_30-= timesI(UChi_10); \
|
||||
result_31-= timesI(UChi_11); \
|
||||
result_32-= timesI(UChi_12);
|
||||
|
||||
#define TP_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20+= UChi_00; \
|
||||
result_21+= UChi_01; \
|
||||
result_22+= UChi_02; \
|
||||
result_30+= UChi_10; \
|
||||
result_31+= UChi_11; \
|
||||
result_32+= UChi_12;
|
||||
|
||||
#define TM_RECON_ACCUM\
|
||||
result_00+= UChi_00;\
|
||||
result_01+= UChi_01;\
|
||||
result_02+= UChi_02;\
|
||||
result_10+= UChi_10;\
|
||||
result_11+= UChi_11;\
|
||||
result_12+= UChi_12;\
|
||||
result_20-= UChi_00; \
|
||||
result_21-= UChi_01; \
|
||||
result_22-= UChi_02; \
|
||||
result_30-= UChi_10; \
|
||||
result_31-= UChi_11; \
|
||||
result_32-= UChi_12;
|
||||
|
||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
||||
PROJ; \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(PERM); \
|
||||
} \
|
||||
} else { \
|
||||
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||
} \
|
||||
MULT_2SPIN_IMPL(DIR,F); \
|
||||
RECON;
|
||||
|
||||
|
||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
local = SE->_is_local; \
|
||||
perm = SE->_permute; \
|
||||
if ( local ) { \
|
||||
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
||||
PROJ; \
|
||||
if ( perm) { \
|
||||
PERMUTE_DIR(PERM); \
|
||||
} \
|
||||
} else if ( st.same_node[DIR] ) { \
|
||||
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||
} \
|
||||
if (local || st.same_node[DIR] ) { \
|
||||
MULT_2SPIN_IMPL(DIR,F); \
|
||||
RECON; \
|
||||
}
|
||||
|
||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
SE=st.GetEntry(ptype,DIR,ss); \
|
||||
offset = SE->_offset; \
|
||||
perm = SE->_permute; \
|
||||
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
||||
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||
MULT_2SPIN_IMPL(DIR,F); \
|
||||
RECON; \
|
||||
nmu++; \
|
||||
}
|
||||
|
||||
#define HAND_RESULT(ss,F) \
|
||||
{ \
|
||||
SiteSpinor & ref (out[ss]); \
|
||||
vstream(ref(F)(0)(0),result_00); \
|
||||
vstream(ref(F)(0)(1),result_01); \
|
||||
vstream(ref(F)(0)(2),result_02); \
|
||||
vstream(ref(F)(1)(0),result_10); \
|
||||
vstream(ref(F)(1)(1),result_11); \
|
||||
vstream(ref(F)(1)(2),result_12); \
|
||||
vstream(ref(F)(2)(0),result_20); \
|
||||
vstream(ref(F)(2)(1),result_21); \
|
||||
vstream(ref(F)(2)(2),result_22); \
|
||||
vstream(ref(F)(3)(0),result_30); \
|
||||
vstream(ref(F)(3)(1),result_31); \
|
||||
vstream(ref(F)(3)(2),result_32); \
|
||||
}
|
||||
|
||||
#define HAND_RESULT_EXT(ss,F) \
|
||||
if (nmu){ \
|
||||
SiteSpinor & ref (out[ss]); \
|
||||
ref(F)(0)(0)+=result_00; \
|
||||
ref(F)(0)(1)+=result_01; \
|
||||
ref(F)(0)(2)+=result_02; \
|
||||
ref(F)(1)(0)+=result_10; \
|
||||
ref(F)(1)(1)+=result_11; \
|
||||
ref(F)(1)(2)+=result_12; \
|
||||
ref(F)(2)(0)+=result_20; \
|
||||
ref(F)(2)(1)+=result_21; \
|
||||
ref(F)(2)(2)+=result_22; \
|
||||
ref(F)(3)(0)+=result_30; \
|
||||
ref(F)(3)(1)+=result_31; \
|
||||
ref(F)(3)(2)+=result_32; \
|
||||
}
|
||||
|
||||
|
||||
#define HAND_DECLARATIONS(a) \
|
||||
Simd result_00; \
|
||||
Simd result_01; \
|
||||
Simd result_02; \
|
||||
Simd result_10; \
|
||||
Simd result_11; \
|
||||
Simd result_12; \
|
||||
Simd result_20; \
|
||||
Simd result_21; \
|
||||
Simd result_22; \
|
||||
Simd result_30; \
|
||||
Simd result_31; \
|
||||
Simd result_32; \
|
||||
Simd Chi_00; \
|
||||
Simd Chi_01; \
|
||||
Simd Chi_02; \
|
||||
Simd Chi_10; \
|
||||
Simd Chi_11; \
|
||||
Simd Chi_12; \
|
||||
Simd UChi_00; \
|
||||
Simd UChi_01; \
|
||||
Simd UChi_02; \
|
||||
Simd UChi_10; \
|
||||
Simd UChi_11; \
|
||||
Simd UChi_12; \
|
||||
Simd U_00; \
|
||||
Simd U_10; \
|
||||
Simd U_20; \
|
||||
Simd U_01; \
|
||||
Simd U_11; \
|
||||
Simd U_21;
|
||||
|
||||
#define ZERO_RESULT \
|
||||
result_00=Zero(); \
|
||||
result_01=Zero(); \
|
||||
result_02=Zero(); \
|
||||
result_10=Zero(); \
|
||||
result_11=Zero(); \
|
||||
result_12=Zero(); \
|
||||
result_20=Zero(); \
|
||||
result_21=Zero(); \
|
||||
result_22=Zero(); \
|
||||
result_30=Zero(); \
|
||||
result_31=Zero(); \
|
||||
result_32=Zero();
|
||||
|
||||
#define Chimu_00 Chi_00
|
||||
#define Chimu_01 Chi_01
|
||||
#define Chimu_02 Chi_02
|
||||
#define Chimu_10 Chi_10
|
||||
#define Chimu_11 Chi_11
|
||||
#define Chimu_12 Chi_12
|
||||
#define Chimu_20 UChi_00
|
||||
#define Chimu_21 UChi_01
|
||||
#define Chimu_22 UChi_02
|
||||
#define Chimu_30 UChi_10
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl> void accelerator
|
||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset,local,perm, ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_RESULT(ss,F)
|
||||
|
||||
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl> accelerator
|
||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_RESULT(ss,F)
|
||||
|
||||
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl> void accelerator
|
||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset,local,perm, ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
ZERO_RESULT; \
|
||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_RESULT(ss,F)
|
||||
|
||||
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl> accelerator
|
||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
ZERO_RESULT; \
|
||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_RESULT(ss,F)
|
||||
|
||||
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl> void accelerator
|
||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
int offset, perm, ptype;
|
||||
StencilEntry *SE;
|
||||
int nmu=0;
|
||||
|
||||
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
ZERO_RESULT; \
|
||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_RESULT_EXT(ss,F)
|
||||
|
||||
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||
perm++;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
accelerator void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifndef GRID_NVCC
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
HAND_DECLARATIONS(ignore);
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset, perm, ptype;
|
||||
int nmu=0;
|
||||
|
||||
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||
ZERO_RESULT; \
|
||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||
HAND_RESULT_EXT(ss,F)
|
||||
|
||||
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||
perm++;
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Specialise Gparity to simple implementation
|
||||
////////////////////////////////////////////////
|
||||
#define HAND_SPECIALISE_EMPTY(IMPL) \
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, \
|
||||
\
|
||||
DoubledGaugeFieldView &U, \
|
||||
SiteHalfSpinor *buf, \
|
||||
int sF,int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out){ assert(0); } \
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, \
|
||||
\
|
||||
DoubledGaugeFieldView &U, \
|
||||
SiteHalfSpinor *buf, \
|
||||
int sF,int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out){ assert(0); } \
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, \
|
||||
\
|
||||
DoubledGaugeFieldView &U, \
|
||||
SiteHalfSpinor *buf, \
|
||||
int sF,int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out){ assert(0); } \
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, \
|
||||
\
|
||||
DoubledGaugeFieldView &U, \
|
||||
SiteHalfSpinor *buf, \
|
||||
int sF,int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out){ assert(0); } \
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, \
|
||||
\
|
||||
DoubledGaugeFieldView &U, \
|
||||
SiteHalfSpinor *buf, \
|
||||
int sF,int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out){ assert(0); } \
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, \
|
||||
\
|
||||
DoubledGaugeFieldView &U, \
|
||||
SiteHalfSpinor *buf, \
|
||||
int sF,int sU, \
|
||||
const FermionFieldView &in, \
|
||||
FermionFieldView &out){ assert(0); } \
|
||||
|
||||
|
||||
#ifdef GRID_NVCC
|
||||
#define HAND_SPECIALISE_GPARITY(IMPL) HAND_SPECIALISE_EMPTY(IMPL)
|
||||
#else
|
||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
typedef IMPL Impl; \
|
||||
typedef typename Simd::scalar_type S; \
|
||||
typedef typename Simd::vector_type V; \
|
||||
\
|
||||
HAND_DECLARATIONS(ignore); \
|
||||
\
|
||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||
StencilEntry *SE; \
|
||||
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> \
|
||||
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
typedef IMPL Impl; \
|
||||
typedef typename Simd::scalar_type S; \
|
||||
typedef typename Simd::vector_type V; \
|
||||
\
|
||||
HAND_DECLARATIONS(ignore); \
|
||||
\
|
||||
StencilEntry *SE; \
|
||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
typedef IMPL Impl; \
|
||||
typedef typename Simd::scalar_type S; \
|
||||
typedef typename Simd::vector_type V; \
|
||||
\
|
||||
HAND_DECLARATIONS(ignore); \
|
||||
\
|
||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||
StencilEntry *SE; \
|
||||
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> \
|
||||
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
typedef IMPL Impl; \
|
||||
typedef typename Simd::scalar_type S; \
|
||||
typedef typename Simd::vector_type V; \
|
||||
\
|
||||
HAND_DECLARATIONS(ignore); \
|
||||
\
|
||||
StencilEntry *SE; \
|
||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
\
|
||||
template<> void \
|
||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
typedef IMPL Impl; \
|
||||
typedef typename Simd::scalar_type S; \
|
||||
typedef typename Simd::vector_type V; \
|
||||
\
|
||||
HAND_DECLARATIONS(ignore); \
|
||||
\
|
||||
int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||
StencilEntry *SE; \
|
||||
int nmu=0; \
|
||||
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
nmu = 0; \
|
||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
} \
|
||||
template<> \
|
||||
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||
{ \
|
||||
typedef IMPL Impl; \
|
||||
typedef typename Simd::scalar_type S; \
|
||||
typedef typename Simd::vector_type V; \
|
||||
\
|
||||
HAND_DECLARATIONS(ignore); \
|
||||
\
|
||||
StencilEntry *SE; \
|
||||
int offset,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||
int nmu=0; \
|
||||
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
nmu = 0; \
|
||||
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||
}
|
||||
#endif
|
||||
|
||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
|
||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
|
||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
|
||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
|
||||
|
||||
|
||||
////////////// Wilson ; uses this implementation /////////////////////
|
||||
|
||||
#define INSTANTIATE_THEM(A) \
|
||||
template void WilsonKernels<A>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out);\
|
||||
template void WilsonKernels<A>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,\
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out); \
|
||||
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out);
|
||||
|
||||
//INSTANTIATE_THEM(GparityWilsonImplF);
|
||||
//INSTANTIATE_THEM(GparityWilsonImplD);
|
||||
//INSTANTIATE_THEM(GparityWilsonImplFH);
|
||||
//INSTANTIATE_THEM(GparityWilsonImplDF);
|
||||
//INSTANTIATE_THEM(DomainWallVec5dImplFH);
|
||||
//INSTANTIATE_THEM(DomainWallVec5dImplDF);
|
||||
//INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
|
||||
//INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,97 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*
|
||||
* BF sequence
|
||||
*
|
||||
void bfmbase<Float>::MooeeInv(Fermion_t psi,
|
||||
Fermion_t chi,
|
||||
int dag, int cb)
|
||||
|
||||
double m = this->mass;
|
||||
double tm = this->twistedmass;
|
||||
double mtil = 4.0+this->mass;
|
||||
|
||||
double sq = mtil*mtil + tm*tm;
|
||||
|
||||
double a = mtil/sq;
|
||||
double b = -tm /sq;
|
||||
if(dag) b=-b;
|
||||
axpibg5x(chi,psi,a,b);
|
||||
|
||||
void bfmbase<Float>::Mooee(Fermion_t psi,
|
||||
Fermion_t chi,
|
||||
int dag,int cb)
|
||||
double a = 4.0+this->mass;
|
||||
double b = this->twistedmass;
|
||||
if(dag) b=-b;
|
||||
axpibg5x(chi,psi,a,b);
|
||||
*/
|
||||
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||
RealD a = 4.0+this->mass;
|
||||
RealD b = this->mu;
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||
RealD a = 4.0+this->mass;
|
||||
RealD b = -this->mu;
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||
RealD m = this->mass;
|
||||
RealD tm = this->mu;
|
||||
RealD mtil = 4.0+m;
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
RealD a = mtil/sq;
|
||||
RealD b = -tm /sq;
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
|
||||
RealD m = this->mass;
|
||||
RealD tm = this->mu;
|
||||
RealD mtil = 4.0+m;
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
RealD a = mtil/sq;
|
||||
RealD b = tm /sq;
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonTMFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,433 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid_Eigen_Dense.h>
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
|
||||
GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mq1, RealD _mq2, RealD _mq3,
|
||||
RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
|
||||
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
|
||||
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
|
||||
_shift, _pm, _M5, 1.0, 0.0, p)
|
||||
{
|
||||
RealD eps = 1.0;
|
||||
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
|
||||
assert(zdata->n == this->Ls);
|
||||
|
||||
std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
|
||||
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
|
||||
|
||||
Approx::zolotarev_free(zdata);
|
||||
}
|
||||
|
||||
/***************************************************************
|
||||
* Additional EOFA operators only called outside the inverter.
|
||||
* Since speed is not essential, simple axpby-style
|
||||
* implementations should be fine.
|
||||
***************************************************************/
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Din = Zero();
|
||||
if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
|
||||
else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
|
||||
else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
|
||||
else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
|
||||
}
|
||||
|
||||
// This is just the identity for DWF
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
|
||||
|
||||
// This is just the identity for DWF
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
|
||||
|
||||
/*****************************************************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->Meooe5D(psi, Din);
|
||||
this->DW(Din, chi, DaggerNo);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
this->M5D(psi, chi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->DW(psi, Din, DaggerYes);
|
||||
this->MeooeDag5D(Din, chi);
|
||||
this->M5Ddag(psi, chi);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
/********************************************************************
|
||||
* Performance critical fermion operators called inside the inverter
|
||||
********************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD shift = this->shift;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD mq2 = this->mq2;
|
||||
RealD mq3 = this->mq3;
|
||||
|
||||
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
|
||||
Coeff_t shiftp(0.0), shiftm(0.0);
|
||||
if(shift != 0.0){
|
||||
if(pm == 1){ shiftp = shift*(mq3-mq2); }
|
||||
else{ shiftm = -shift*(mq3-mq2); }
|
||||
}
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
||||
|
||||
#if(0)
|
||||
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
||||
for(int i=0; i<diag.size(); ++i){
|
||||
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<upper.size(); ++i){
|
||||
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<lower.size(); ++i){
|
||||
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
this->M5D(psi, chi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD shift = this->shift;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD mq2 = this->mq2;
|
||||
RealD mq3 = this->mq3;
|
||||
|
||||
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
|
||||
Coeff_t shiftp(0.0), shiftm(0.0);
|
||||
if(shift != 0.0){
|
||||
if(pm == 1){ shiftp = shift*(mq3-mq2); }
|
||||
else{ shiftm = -shift*(mq3-mq2); }
|
||||
}
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
||||
|
||||
#if(0)
|
||||
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
|
||||
for(int i=0; i<diag.size(); ++i){
|
||||
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<upper.size(); ++i){
|
||||
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
|
||||
}
|
||||
for(int i=0; i<lower.size(); ++i){
|
||||
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
// half checkerboard operations
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
lower[s] = -this->cee[s];
|
||||
}
|
||||
upper[Ls-1] = this->dm;
|
||||
lower[0] = this->dp;
|
||||
|
||||
this->M5D(psi, psi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
lower[s] = -this->cee[s];
|
||||
}
|
||||
upper[Ls-1] = this->dp;
|
||||
lower[0] = this->dm;
|
||||
|
||||
this->M5Ddag(psi, psi, chi, lower, diag, upper);
|
||||
}
|
||||
|
||||
/****************************************************************************************/
|
||||
|
||||
//Zolo
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD mq2 = this->mq2;
|
||||
RealD mq3 = this->mq3;
|
||||
RealD shift = this->shift;
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
// Constants for the preconditioned matrix Cayley form
|
||||
////////////////////////////////////////////////////////
|
||||
this->bs.resize(Ls);
|
||||
this->cs.resize(Ls);
|
||||
this->aee.resize(Ls);
|
||||
this->aeo.resize(Ls);
|
||||
this->bee.resize(Ls);
|
||||
this->beo.resize(Ls);
|
||||
this->cee.resize(Ls);
|
||||
this->ceo.resize(Ls);
|
||||
|
||||
for(int i=0; i<Ls; ++i){
|
||||
this->bee[i] = 4.0 - this->M5 + 1.0;
|
||||
this->cee[i] = 1.0;
|
||||
}
|
||||
|
||||
for(int i=0; i<Ls; ++i){
|
||||
this->aee[i] = this->cee[i];
|
||||
this->bs[i] = this->beo[i] = 1.0;
|
||||
this->cs[i] = this->ceo[i] = 0.0;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// EOFA shift terms
|
||||
//////////////////////////////////////////
|
||||
if(pm == 1){
|
||||
this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
|
||||
this->dm = mq1*this->cee[Ls-1];
|
||||
} else if(this->pm == -1) {
|
||||
this->dp = mq1*this->cee[0];
|
||||
this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
|
||||
} else {
|
||||
this->dp = mq1*this->cee[0];
|
||||
this->dm = mq1*this->cee[Ls-1];
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// LDU decomposition of eeoo
|
||||
//////////////////////////////////////////
|
||||
this->dee.resize(Ls+1);
|
||||
this->lee.resize(Ls);
|
||||
this->leem.resize(Ls);
|
||||
this->uee.resize(Ls);
|
||||
this->ueem.resize(Ls);
|
||||
|
||||
for(int i=0; i<Ls; ++i){
|
||||
|
||||
if(i < Ls-1){
|
||||
|
||||
this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
|
||||
|
||||
this->leem[i] = this->dm/this->bee[i];
|
||||
for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
|
||||
|
||||
this->dee[i] = this->bee[i];
|
||||
|
||||
this->uee[i] = -this->aee[i]/this->bee[i]; // up-diag entry on the ith row
|
||||
|
||||
this->ueem[i] = this->dp / this->bee[0];
|
||||
for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
|
||||
|
||||
} else {
|
||||
|
||||
this->lee[i] = 0.0;
|
||||
this->leem[i] = 0.0;
|
||||
this->uee[i] = 0.0;
|
||||
this->ueem[i] = 0.0;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
Coeff_t delta_d = 1.0 / this->bee[0];
|
||||
for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
|
||||
this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
|
||||
this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
|
||||
}
|
||||
|
||||
int inv = 1;
|
||||
this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
|
||||
this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
|
||||
}
|
||||
|
||||
// Recompute Cayley-form coefficients for different shift
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
|
||||
{
|
||||
this->shift = new_shift;
|
||||
Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
|
||||
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
|
||||
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
GridBase* grid = this->FermionRedBlackGrid();
|
||||
int LLs = grid->_rdimensions[0];
|
||||
|
||||
if(LLs == Ls){ return; } // Not vectorised in 5th direction
|
||||
|
||||
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
Pplus(s,s) = this->bee[s];
|
||||
Pminus(s,s) = this->bee[s];
|
||||
}
|
||||
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
Pminus(s,s+1) = -this->cee[s];
|
||||
}
|
||||
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
Pplus(s+1,s) = -this->cee[s+1];
|
||||
}
|
||||
|
||||
Pplus (0,Ls-1) = this->dp;
|
||||
Pminus(Ls-1,0) = this->dm;
|
||||
|
||||
Eigen::MatrixXcd PplusMat ;
|
||||
Eigen::MatrixXcd PminusMat;
|
||||
|
||||
#if(0)
|
||||
std::cout << GridLogMessage << "Pplus:" << std::endl;
|
||||
for(int s=0; s<Ls; ++s){
|
||||
for(int ss=0; ss<Ls; ++ss){
|
||||
std::cout << Pplus(s,ss) << "\t";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << GridLogMessage << "Pminus:" << std::endl;
|
||||
for(int s=0; s<Ls; ++s){
|
||||
for(int ss=0; ss<Ls; ++ss){
|
||||
std::cout << Pminus(s,ss) << "\t";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
if(inv) {
|
||||
PplusMat = Pplus.inverse();
|
||||
PminusMat = Pminus.inverse();
|
||||
} else {
|
||||
PplusMat = Pplus;
|
||||
PminusMat = Pminus;
|
||||
}
|
||||
|
||||
if(dag){
|
||||
PplusMat.adjointInPlace();
|
||||
PminusMat.adjointInPlace();
|
||||
}
|
||||
|
||||
typedef typename SiteHalfSpinor::scalar_type scalar_type;
|
||||
const int Nsimd = Simd::Nsimd();
|
||||
Matp.resize(Ls*LLs);
|
||||
Matm.resize(Ls*LLs);
|
||||
|
||||
for(int s2=0; s2<Ls; s2++){
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
int istride = LLs;
|
||||
int ostride = 1;
|
||||
Simd Vp;
|
||||
Simd Vm;
|
||||
scalar_type *sp = (scalar_type*) &Vp;
|
||||
scalar_type *sm = (scalar_type*) &Vm;
|
||||
for(int l=0; l<Nsimd; l++){
|
||||
if(switcheroo<Coeff_t>::iscomplex()) {
|
||||
sp[l] = PplusMat (l*istride+s1*ostride,s2);
|
||||
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
||||
} else {
|
||||
// if real
|
||||
scalar_type tmp;
|
||||
tmp = PplusMat (l*istride+s1*ostride,s2);
|
||||
sp[l] = scalar_type(tmp.real(),tmp.real());
|
||||
tmp = PminusMat(l*istride+s1*ostride,s2);
|
||||
sm[l] = scalar_type(tmp.real(),tmp.real());
|
||||
}
|
||||
}
|
||||
Matp[LLs*s2+s1] = Vp;
|
||||
Matm[LLs*s2+s1] = Vm;
|
||||
}}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||
GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,255 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
|
||||
|
||||
// Pminus fowards
|
||||
// Pplus backwards..
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
int Ls = this->Ls;
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto phi = phi_i.View();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
for(int s=0; s<Ls; s++){
|
||||
auto tmp = psi[0];
|
||||
if(s==0) {
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5m(tmp, psi[ss+0]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
auto tmp = psi[0];
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(s==0) {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5p(tmp, psi[ss+0]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionField& chi_i)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi=psi_i.View();
|
||||
auto chi=chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
|
||||
auto tmp1 = psi[0];
|
||||
auto tmp2 = psi[0];
|
||||
|
||||
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
|
||||
// Apply (L^{\prime})^{-1}
|
||||
chi[ss] = psi[ss]; // chi[0]=psi[0]
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5p(tmp1, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
|
||||
}
|
||||
|
||||
// L_m^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||
spProj5m(tmp1, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
|
||||
}
|
||||
|
||||
// U_m^{-1} D^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
|
||||
spProj5p(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
|
||||
}
|
||||
spProj5m(tmp2, chi[ss+Ls-1]);
|
||||
chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
|
||||
|
||||
// Apply U^{-1}
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
spProj5m(tmp1, chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, FermionField& chi_i)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
|
||||
assert(psi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
Vector<Coeff_t> ueec(Ls);
|
||||
Vector<Coeff_t> deec(Ls+1);
|
||||
Vector<Coeff_t> leec(Ls);
|
||||
Vector<Coeff_t> ueemc(Ls);
|
||||
Vector<Coeff_t> leemc(Ls);
|
||||
|
||||
for(int s=0; s<ueec.size(); s++){
|
||||
ueec[s] = conjugate(this->uee[s]);
|
||||
deec[s] = conjugate(this->dee[s]);
|
||||
leec[s] = conjugate(this->lee[s]);
|
||||
ueemc[s] = conjugate(this->ueem[s]);
|
||||
leemc[s] = conjugate(this->leem[s]);
|
||||
}
|
||||
deec[Ls] = conjugate(this->dee[Ls]);
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=Ls),{ // adds Ls
|
||||
|
||||
auto tmp1 = psi[0];
|
||||
auto tmp2 = psi[0];
|
||||
|
||||
// Apply (U^{\prime})^{-dagger}
|
||||
chi[ss] = psi[ss];
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5m(tmp1, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
|
||||
}
|
||||
|
||||
// U_m^{-\dagger}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5p(tmp1, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
|
||||
}
|
||||
|
||||
// L_m^{-\dagger} D^{-dagger}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5m(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
|
||||
}
|
||||
spProj5p(tmp2, chi[ss+Ls-1]);
|
||||
chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
|
||||
|
||||
// Apply L^{-dagger}
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
spProj5p(tmp1, chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,613 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
/*
|
||||
* Dense matrix versions of routines
|
||||
*/
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd> > u(LLs);
|
||||
Vector<iSinglet<Simd> > l(LLs);
|
||||
Vector<iSinglet<Simd> > d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0;o<LLs;o++){ // outer
|
||||
for(int i=0;i<nsimd;i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
assert(Nc == 3);
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5m(hp, psi[ss+vp]);
|
||||
spProj5p(hm, psi[ss+vm]);
|
||||
|
||||
if (vp <= v){ rotate(hp, hp, 1); }
|
||||
if (vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = 0.5*hp;
|
||||
hm = 0.5*hm;
|
||||
|
||||
spRecon5m(fp, hp);
|
||||
spRecon5p(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] + u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] + l[v]*fm;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v==LLs-1) ? 0 : v+1;
|
||||
int vm = (v==0) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if(vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
|
||||
#endif
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd> > u(LLs);
|
||||
Vector<iSinglet<Simd> > l(LLs);
|
||||
Vector<iSinglet<Simd> > d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop((int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5p(hp, psi[ss+vp]);
|
||||
spProj5m(hm, psi[ss+vm]);
|
||||
|
||||
if(vp <= v){ rotate(hp, hp, 1); }
|
||||
if(vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = hp*0.5;
|
||||
hm = hm*0.5;
|
||||
spRecon5p(fp, hp);
|
||||
spRecon5m(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if (vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
#endif
|
||||
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
#ifdef AVX512
|
||||
#include<simd/Intel512common.h>
|
||||
#include<simd/Intel512avx.h>
|
||||
#include<simd/Intel512single.h>
|
||||
#endif
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
|
||||
|
||||
int s = s2 + l*LLs;
|
||||
int lex = s2 + LLs*site;
|
||||
|
||||
if( s2==0 && l==0 ){
|
||||
SiteChiP=Zero();
|
||||
SiteChiM=Zero();
|
||||
}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
|
||||
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
|
||||
}}
|
||||
}}
|
||||
|
||||
{
|
||||
int lex = s1 + LLs*site;
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#else
|
||||
{
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %%zmm1
|
||||
#define Chi_01 %%zmm2
|
||||
#define Chi_02 %%zmm3
|
||||
#define Chi_10 %%zmm4
|
||||
#define Chi_11 %%zmm5
|
||||
#define Chi_12 %%zmm6
|
||||
#define Chi_20 %%zmm7
|
||||
#define Chi_21 %%zmm8
|
||||
#define Chi_22 %%zmm9
|
||||
#define Chi_30 %%zmm10
|
||||
#define Chi_31 %%zmm11
|
||||
#define Chi_32 %%zmm12
|
||||
|
||||
#define BCAST0 %%zmm13
|
||||
#define BCAST1 %%zmm14
|
||||
#define BCAST2 %%zmm15
|
||||
#define BCAST3 %%zmm16
|
||||
#define BCAST4 %%zmm17
|
||||
#define BCAST5 %%zmm18
|
||||
#define BCAST6 %%zmm19
|
||||
#define BCAST7 %%zmm20
|
||||
#define BCAST8 %%zmm21
|
||||
#define BCAST9 %%zmm22
|
||||
#define BCAST10 %%zmm23
|
||||
#define BCAST11 %%zmm24
|
||||
|
||||
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
|
||||
int lex = s2 + LLs*site;
|
||||
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t) &psi[lex];
|
||||
|
||||
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
|
||||
if((s2+l)==0) {
|
||||
asm(
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0)
|
||||
VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2)
|
||||
VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
|
||||
VMULMEM(0,%1,BCAST8,Chi_22)
|
||||
VMULMEM(0,%1,BCAST9,Chi_30)
|
||||
VMULMEM(0,%1,BCAST10,Chi_31)
|
||||
VMULMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
} else {
|
||||
asm(
|
||||
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
}
|
||||
a0 = a0 + incr;
|
||||
a1 = a1 + incr;
|
||||
a2 = a2 + sizeof(typename Simd::scalar_type);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
asm (
|
||||
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
#endif
|
||||
};
|
||||
|
||||
// Z-mobius version
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
|
||||
exit(-1);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||
{
|
||||
chi.Checkerboard() = psi.Checkerboard();
|
||||
int Ls = this->Ls;
|
||||
int LLs = psi.Grid()->_rdimensions[0];
|
||||
int vol = psi.Grid()->oSites()/LLs;
|
||||
|
||||
Vector<iSinglet<Simd> > Matp;
|
||||
Vector<iSinglet<Simd> > Matm;
|
||||
Vector<iSinglet<Simd> > *_Matp;
|
||||
Vector<iSinglet<Simd> > *_Matm;
|
||||
|
||||
// MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||
if(inv && dag){
|
||||
_Matp = &this->MatpInvDag;
|
||||
_Matm = &this->MatmInvDag;
|
||||
}
|
||||
|
||||
if(inv && (!dag)){
|
||||
_Matp = &this->MatpInv;
|
||||
_Matm = &this->MatmInv;
|
||||
}
|
||||
|
||||
if(!inv){
|
||||
MooeeInternalCompute(dag, inv, Matp, Matm);
|
||||
_Matp = &Matp;
|
||||
_Matm = &Matm;
|
||||
}
|
||||
|
||||
assert(_Matp->size() == Ls*LLs);
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
if(switcheroo<Coeff_t>::iscomplex()){
|
||||
thread_loop((auto site=0; site<vol; site++),{
|
||||
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
} else {
|
||||
thread_loop((auto site=0; site<vol; site++){
|
||||
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
}
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
|
||||
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
|
||||
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,497 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid_Eigen_Dense.h>
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
|
||||
GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mq1, RealD _mq2, RealD _mq3,
|
||||
RealD _shift, int _pm, RealD _M5,
|
||||
RealD _b, RealD _c, const ImplParams &p) :
|
||||
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
|
||||
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
|
||||
_shift, _pm, _M5, _b, _c, p)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
RealD eps = 1.0;
|
||||
Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
|
||||
assert(zdata->n == this->Ls);
|
||||
|
||||
std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
|
||||
",c=" << _c << ") with Ls=" << Ls << std::endl;
|
||||
this->SetCoefficientsTanh(zdata, _b, _c);
|
||||
std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
|
||||
",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
|
||||
",pm=" << _pm << ")" << std::endl;
|
||||
|
||||
Approx::zolotarev_free(zdata);
|
||||
|
||||
if(_shift != 0.0){
|
||||
SetCoefficientsPrecondShiftOps();
|
||||
} else {
|
||||
Mooee_shift.resize(Ls, 0.0);
|
||||
MooeeInv_shift_lc.resize(Ls, 0.0);
|
||||
MooeeInv_shift_norm.resize(Ls, 0.0);
|
||||
MooeeInvDag_shift_lc.resize(Ls, 0.0);
|
||||
MooeeInvDag_shift_norm.resize(Ls, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************
|
||||
* Additional EOFA operators only called outside the inverter.
|
||||
* Since speed is not essential, simple axpby-style
|
||||
* implementations should be fine.
|
||||
***************************************************************/
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
RealD alpha = this->alpha;
|
||||
|
||||
Din = Zero();
|
||||
if((sign == 1) && (dag == 0)) { // \Omega_{+}
|
||||
for(int s=0; s<Ls; ++s){
|
||||
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
|
||||
}
|
||||
} else if((sign == -1) && (dag == 0)) { // \Omega_{-}
|
||||
for(int s=0; s<Ls; ++s){
|
||||
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
|
||||
}
|
||||
} else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
|
||||
for(int sp=0; sp<Ls; ++sp){
|
||||
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
|
||||
}
|
||||
} else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
|
||||
for(int sp=0; sp<Ls; ++sp){
|
||||
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
|
||||
// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
RealD b = 0.5 * ( 1.0 + this->alpha );
|
||||
RealD c = 0.5 * ( 1.0 - this->alpha );
|
||||
RealD mq1 = this->mq1;
|
||||
|
||||
for(int s=0; s<Ls; ++s){
|
||||
if(s == 0) {
|
||||
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
|
||||
axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
|
||||
} else if(s == (Ls-1)) {
|
||||
axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
|
||||
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
|
||||
} else {
|
||||
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
|
||||
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
RealD m = this->mq1;
|
||||
RealD c = 0.5 * this->alpha;
|
||||
RealD d = 0.5;
|
||||
|
||||
RealD DtInv_p(0.0), DtInv_m(0.0);
|
||||
RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
|
||||
FermionField tmp(this->FermionGrid());
|
||||
|
||||
for(int s=0; s<Ls; ++s){
|
||||
for(int sp=0; sp<Ls; ++sp){
|
||||
|
||||
DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
|
||||
DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
|
||||
DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
|
||||
DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
|
||||
|
||||
if(sp == 0){
|
||||
axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
|
||||
axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
|
||||
} else {
|
||||
axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
|
||||
axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
|
||||
}
|
||||
|
||||
}}
|
||||
}
|
||||
|
||||
/*****************************************************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->Meooe5D(psi, Din);
|
||||
this->DW(Din, chi, DaggerNo);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
this->M5D(psi, chi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
FermionField Din(psi.Grid());
|
||||
|
||||
this->DW(psi, Din, DaggerYes);
|
||||
this->MeooeDag5D(Din, chi);
|
||||
this->M5Ddag(psi, chi);
|
||||
axpby(chi, 1.0, 1.0, chi, psi);
|
||||
return(norm2(chi));
|
||||
}
|
||||
|
||||
/********************************************************************
|
||||
* Performance critical fermion operators called inside the inverter
|
||||
********************************************************************/
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
// half checkerboard operations
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
// coefficients of Mooee
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
lower[s] = -this->cee[s];
|
||||
}
|
||||
upper[Ls-1] *= -this->mq1;
|
||||
lower[0] *= -this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
// coefficients of MooeeDag
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(s==0) {
|
||||
upper[s] = -this->cee[s+1];
|
||||
lower[s] = this->mq1*this->cee[Ls-1];
|
||||
} else if(s==(Ls-1)) {
|
||||
upper[s] = this->mq1*this->cee[0];
|
||||
lower[s] = -this->cee[s-1];
|
||||
} else {
|
||||
upper[s] = -this->cee[s+1];
|
||||
lower[s] = -this->cee[s-1];
|
||||
}
|
||||
}
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
|
||||
|
||||
// fused M + shift operation
|
||||
else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
|
||||
}
|
||||
|
||||
/****************************************************************************************/
|
||||
|
||||
// Computes coefficients for applying Cayley preconditioned shift operators
|
||||
// (Mooee + \Delta) --> Mooee_shift
|
||||
// (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
|
||||
// (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
|
||||
// For the latter two cases, the operation takes the form
|
||||
// [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
|
||||
// ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
RealD alpha = this->alpha;
|
||||
RealD k = this->k;
|
||||
RealD mq1 = this->mq1;
|
||||
RealD shift = this->shift;
|
||||
|
||||
// Initialize
|
||||
Mooee_shift.resize(Ls);
|
||||
MooeeInv_shift_lc.resize(Ls);
|
||||
MooeeInv_shift_norm.resize(Ls);
|
||||
MooeeInvDag_shift_lc.resize(Ls);
|
||||
MooeeInvDag_shift_norm.resize(Ls);
|
||||
|
||||
// Construct Mooee_shift
|
||||
int idx(0);
|
||||
Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
|
||||
( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
|
||||
for(int s=0; s<Ls; ++s){
|
||||
idx = (pm == 1) ? (s) : (Ls-1-s);
|
||||
Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
|
||||
}
|
||||
|
||||
// Tridiagonal solve for MooeeInvDag_shift_lc
|
||||
{
|
||||
Coeff_t m(0.0);
|
||||
Vector<Coeff_t> d = Mooee_shift;
|
||||
Vector<Coeff_t> u(Ls,0.0);
|
||||
Vector<Coeff_t> y(Ls,0.0);
|
||||
Vector<Coeff_t> q(Ls,0.0);
|
||||
if(pm == 1){ u[0] = 1.0; }
|
||||
else{ u[Ls-1] = 1.0; }
|
||||
|
||||
// Tridiagonal matrix algorithm + Sherman-Morrison formula
|
||||
//
|
||||
// We solve
|
||||
// ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
|
||||
// where Mooee' is the tridiagonal part of Mooee_{+}, and
|
||||
// u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
|
||||
// so that the outer-product u \otimes v gives the (0,Ls-1)
|
||||
// entry of Mooee_{+}.
|
||||
//
|
||||
// We do this as two solves: Mooee'*y = d and Mooee'*q = u,
|
||||
// and then construct the solution to the original system
|
||||
// MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
|
||||
if(pm == 1){
|
||||
for(int s=1; s<Ls; ++s){
|
||||
m = -this->cee[s] / this->bee[s-1];
|
||||
d[s] -= m*d[s-1];
|
||||
u[s] -= m*u[s-1];
|
||||
}
|
||||
}
|
||||
y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
|
||||
q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
|
||||
for(int s=Ls-2; s>=0; --s){
|
||||
if(pm == 1){
|
||||
y[s] = d[s] / this->bee[s];
|
||||
q[s] = u[s] / this->bee[s];
|
||||
} else {
|
||||
y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
|
||||
q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
|
||||
}
|
||||
}
|
||||
|
||||
// Construct MooeeInvDag_shift_lc
|
||||
for(int s=0; s<Ls; ++s){
|
||||
if(pm == 1){
|
||||
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
|
||||
(1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
|
||||
} else {
|
||||
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
|
||||
(1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
|
||||
}
|
||||
}
|
||||
|
||||
// Compute remaining coefficients
|
||||
N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
|
||||
for(int s=0; s<Ls; ++s){
|
||||
|
||||
// MooeeInv_shift_lc
|
||||
if(pm == 1){ MooeeInv_shift_lc[s] = pow(this->bee[s],s) * pow(this->cee[s],Ls-1-s); }
|
||||
else { MooeeInv_shift_lc[s] = pow(this->bee[s],Ls-1-s) * pow(this->cee[s],s); }
|
||||
|
||||
// MooeeInv_shift_norm
|
||||
MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
|
||||
( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N;
|
||||
|
||||
// MooeeInvDag_shift_norm
|
||||
if(pm == 1){ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],s) * pow(this->cee[s],(Ls-1-s)) /
|
||||
( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
|
||||
else{ MooeeInvDag_shift_norm[s] = -pow(this->bee[s],(Ls-1-s)) * pow(this->cee[s],s) /
|
||||
( pow(this->bee[s],Ls) + mq1*pow(this->cee[s],Ls) ) / N; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recompute coefficients for a different value of shift constant
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
|
||||
{
|
||||
this->shift = new_shift;
|
||||
if(new_shift != 0.0){
|
||||
SetCoefficientsPrecondShiftOps();
|
||||
} else {
|
||||
int Ls = this->Ls;
|
||||
Mooee_shift.resize(Ls,0.0);
|
||||
MooeeInv_shift_lc.resize(Ls,0.0);
|
||||
MooeeInv_shift_norm.resize(Ls,0.0);
|
||||
MooeeInvDag_shift_lc.resize(Ls,0.0);
|
||||
MooeeInvDag_shift_norm.resize(Ls,0.0);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
|
||||
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
GridBase* grid = this->FermionRedBlackGrid();
|
||||
int LLs = grid->_rdimensions[0];
|
||||
|
||||
if(LLs == Ls){ return; } // Not vectorised in 5th direction
|
||||
|
||||
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
Pplus(s,s) = this->bee[s];
|
||||
Pminus(s,s) = this->bee[s];
|
||||
}
|
||||
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
Pminus(s,s+1) = -this->cee[s];
|
||||
Pplus(s+1,s) = -this->cee[s+1];
|
||||
}
|
||||
|
||||
Pplus (0,Ls-1) = this->mq1*this->cee[0];
|
||||
Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
|
||||
|
||||
if(this->shift != 0.0){
|
||||
RealD c = 0.5 * this->alpha;
|
||||
RealD d = 0.5;
|
||||
RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
|
||||
if(this->pm == 1) {
|
||||
for(int s=0; s<Ls; ++s){
|
||||
Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
|
||||
}
|
||||
} else {
|
||||
for(int s=0; s<Ls; ++s){
|
||||
Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Eigen::MatrixXcd PplusMat ;
|
||||
Eigen::MatrixXcd PminusMat;
|
||||
|
||||
if(inv) {
|
||||
PplusMat = Pplus.inverse();
|
||||
PminusMat = Pminus.inverse();
|
||||
} else {
|
||||
PplusMat = Pplus;
|
||||
PminusMat = Pminus;
|
||||
}
|
||||
|
||||
if(dag){
|
||||
PplusMat.adjointInPlace();
|
||||
PminusMat.adjointInPlace();
|
||||
}
|
||||
|
||||
typedef typename SiteHalfSpinor::scalar_type scalar_type;
|
||||
const int Nsimd = Simd::Nsimd();
|
||||
Matp.resize(Ls*LLs);
|
||||
Matm.resize(Ls*LLs);
|
||||
|
||||
for(int s2=0; s2<Ls; s2++){
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
int istride = LLs;
|
||||
int ostride = 1;
|
||||
Simd Vp;
|
||||
Simd Vm;
|
||||
scalar_type *sp = (scalar_type*) &Vp;
|
||||
scalar_type *sm = (scalar_type*) &Vm;
|
||||
for(int l=0; l<Nsimd; l++){
|
||||
if(switcheroo<Coeff_t>::iscomplex()) {
|
||||
sp[l] = PplusMat (l*istride+s1*ostride,s2);
|
||||
sm[l] = PminusMat(l*istride+s1*ostride,s2);
|
||||
} else {
|
||||
// if real
|
||||
scalar_type tmp;
|
||||
tmp = PplusMat (l*istride+s1*ostride,s2);
|
||||
sp[l] = scalar_type(tmp.real(),tmp.real());
|
||||
tmp = PminusMat(l*istride+s1*ostride,s2);
|
||||
sm[l] = scalar_type(tmp.real(),tmp.real());
|
||||
}
|
||||
}
|
||||
Matp[LLs*s2+s1] = Vp;
|
||||
Matm[LLs*s2+s1] = Vm;
|
||||
}}
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(MobiusEOFAFermion);
|
||||
GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,445 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
for(int s=0; s<Ls; s++){
|
||||
auto tmp = psi[0];
|
||||
if(s==0){
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5m(tmp, psi[ss+0]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
||||
Vector<Coeff_t> &shift_coeffs)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
for(int s=0; s<Ls; s++){
|
||||
auto tmp = psi[0];
|
||||
if(s==0){
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5m(tmp, psi[ss+0]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5m(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5p(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
if(this->pm == 1){ spProj5p(tmp, psi[ss+shift_s]); }
|
||||
else{ spProj5m(tmp, psi[ss+shift_s]); }
|
||||
chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
auto tmp = psi[0];
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(s==0) {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5p(tmp, psi[ss+0]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
||||
Vector<Coeff_t> &shift_coeffs)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
chi[ss+Ls-1] = Zero();
|
||||
auto tmp = psi[0];
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(s==0) {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+Ls-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else if(s==(Ls-1)) {
|
||||
spProj5p(tmp, psi[ss+0]);
|
||||
chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
} else {
|
||||
spProj5p(tmp, psi[ss+s+1]);
|
||||
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
|
||||
spProj5m(tmp, psi[ss+s-1]);
|
||||
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
|
||||
}
|
||||
if(this->pm == 1){ spProj5p(tmp, psi[ss+s]); }
|
||||
else{ spProj5m(tmp, psi[ss+s]); }
|
||||
chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
|
||||
auto tmp = psi[0];
|
||||
|
||||
// Apply (L^{\prime})^{-1}
|
||||
chi[ss] = psi[ss]; // chi[0]=psi[0]
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5p(tmp, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
|
||||
}
|
||||
|
||||
// L_m^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||
spProj5m(tmp, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
|
||||
}
|
||||
|
||||
// U_m^{-1} D^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
|
||||
spProj5p(tmp, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
|
||||
}
|
||||
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
|
||||
|
||||
// Apply U^{-1}
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
spProj5m(tmp, chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
|
||||
auto tmp1 = psi[0];
|
||||
auto tmp2 = psi[0];
|
||||
auto tmp2_spProj = psi[0];
|
||||
|
||||
// Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
|
||||
chi[ss] = psi[ss]; // chi[0]=psi[0]
|
||||
tmp2 = MooeeInv_shift_lc[0]*psi[ss];
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5p(tmp1, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
|
||||
tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
|
||||
}
|
||||
if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
|
||||
else{ spProj5m(tmp2_spProj, tmp2); }
|
||||
|
||||
// L_m^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
|
||||
spProj5m(tmp1, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
|
||||
}
|
||||
|
||||
// U_m^{-1} D^{-1}
|
||||
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
|
||||
spProj5p(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
|
||||
}
|
||||
// chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
|
||||
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
|
||||
spProj5m(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
|
||||
|
||||
// Apply U^{-1} and add shift term
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
|
||||
spProj5m(tmp1, chi[ss+s]);
|
||||
chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
if(this->shift != 0.0){ MooeeInvDag_shift(psi_i,chi_i); return; }
|
||||
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
int Ls = this->Ls;
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
|
||||
auto tmp = psi[0];
|
||||
|
||||
// Apply (U^{\prime})^{-dag}
|
||||
chi[ss] = psi[ss];
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5m(tmp, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
|
||||
}
|
||||
|
||||
// U_m^{-\dag}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5p(tmp, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
|
||||
}
|
||||
|
||||
// L_m^{-\dag} D^{-dag}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5m(tmp, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
|
||||
}
|
||||
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
|
||||
|
||||
// Apply L^{-dag}
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
spProj5p(tmp, chi[ss+s+1]);
|
||||
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, FermionField &chi_i)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=Ls),{
|
||||
|
||||
auto tmp1 = psi[0];
|
||||
auto tmp2 = psi[0];
|
||||
auto tmp2_spProj = psi[0];
|
||||
|
||||
// Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
|
||||
chi[ss] = psi[ss];
|
||||
tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
|
||||
for(int s=1; s<Ls; s++){
|
||||
spProj5m(tmp1, chi[ss+s-1]);
|
||||
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
|
||||
tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
|
||||
}
|
||||
if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
|
||||
else{ spProj5m(tmp2_spProj, tmp2); }
|
||||
|
||||
// U_m^{-\dag}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5p(tmp1, chi[ss+s]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
|
||||
}
|
||||
|
||||
// L_m^{-\dag} D^{-dag}
|
||||
for(int s=0; s<Ls-1; s++){
|
||||
spProj5m(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
|
||||
}
|
||||
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
|
||||
spProj5p(tmp1, chi[ss+Ls-1]);
|
||||
chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
|
||||
|
||||
// Apply L^{-dag}
|
||||
for(int s=Ls-2; s>=0; s--){
|
||||
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
|
||||
spProj5p(tmp1, chi[ss+s]);
|
||||
chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
|
||||
}
|
||||
});
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
#ifdef MOBIUS_EOFA_DPERP_CACHE
|
||||
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
|
||||
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,998 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*
|
||||
* Dense matrix versions of routines
|
||||
*/
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
assert(Nc == 3);
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5m(hp, psi[ss+vp]);
|
||||
spProj5p(hm, psi[ss+vm]);
|
||||
|
||||
if (vp <= v){ rotate(hp, hp, 1); }
|
||||
if (vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = 0.5*hp;
|
||||
hm = 0.5*hm;
|
||||
|
||||
spRecon5m(fp, hp);
|
||||
spRecon5p(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v];
|
||||
chi[ss+v] = chi[ss+v] + u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] + l[v]*fm;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if(vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
|
||||
#endif
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi_i, const FermionField& phi_i,
|
||||
FermionField& chi_i, Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs)
|
||||
{
|
||||
#if 0
|
||||
auto & psi = psi_i;
|
||||
auto & phi = phi_i;
|
||||
auto & chi = chi_i;
|
||||
|
||||
this->M5D(psi, phi, chi, lower, diag, upper);
|
||||
|
||||
// FIXME: possible gain from vectorizing shift operation as well?
|
||||
Coeff_t one(1.0);
|
||||
int Ls = this->Ls;
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
|
||||
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
|
||||
}
|
||||
|
||||
#else
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
const int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
Vector<iSinglet<Simd>> s(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
scalar_type* s_p = (scalar_type*) &s[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
s_p[ss] = shift_coeffs[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
assert(Nc == 3);
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
|
||||
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
|
||||
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
|
||||
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
|
||||
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
|
||||
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(2)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(2)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(2)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(3)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(3)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(3)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(0)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(0)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(0)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(1)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(1)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(1)(2);
|
||||
|
||||
if(vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == 1 && vs <= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == -1 && vs >= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
|
||||
}
|
||||
|
||||
// Can force these to real arithmetic and save 2x.
|
||||
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
|
||||
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
|
||||
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
|
||||
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
|
||||
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
|
||||
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
|
||||
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
}
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
#if 0
|
||||
|
||||
alignas(64) SiteHalfSpinor hp;
|
||||
alignas(64) SiteHalfSpinor hm;
|
||||
alignas(64) SiteSpinor fp;
|
||||
alignas(64) SiteSpinor fm;
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
int vp = (v+1)%LLs;
|
||||
int vm = (v+LLs-1)%LLs;
|
||||
|
||||
spProj5p(hp, psi[ss+vp]);
|
||||
spProj5m(hm, psi[ss+vm]);
|
||||
|
||||
if(vp <= v){ rotate(hp, hp, 1); }
|
||||
if(vm >= v){ rotate(hm, hm, nsimd-1); }
|
||||
|
||||
hp = hp*0.5;
|
||||
hm = hm*0.5;
|
||||
spRecon5p(fp, hp);
|
||||
spRecon5m(fm, hm);
|
||||
|
||||
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
|
||||
chi[ss+v] = chi[ss+v] +l[v]*fm;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if (vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
|
||||
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
|
||||
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
|
||||
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
|
||||
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
|
||||
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs)
|
||||
{
|
||||
#if 0
|
||||
auto & psi = psi_i;
|
||||
auto & phi = phi_i;
|
||||
auto & chi = chi_i;
|
||||
this->M5Ddag(psi, phi, chi, lower, diag, upper);
|
||||
|
||||
// FIXME: possible gain from vectorizing shift operation as well?
|
||||
Coeff_t one(1.0);
|
||||
int Ls = this->Ls;
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
|
||||
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
|
||||
}
|
||||
|
||||
#else
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
auto psi = psi_i.View();
|
||||
auto phi = phi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
int Ls = this->Ls;
|
||||
int LLs = grid->_rdimensions[0];
|
||||
int nsimd = Simd::Nsimd();
|
||||
|
||||
Vector<iSinglet<Simd>> u(LLs);
|
||||
Vector<iSinglet<Simd>> l(LLs);
|
||||
Vector<iSinglet<Simd>> d(LLs);
|
||||
Vector<iSinglet<Simd>> s(LLs);
|
||||
|
||||
assert(Ls/LLs == nsimd);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
|
||||
// just directly address via type pun
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
scalar_type* u_p = (scalar_type*) &u[0];
|
||||
scalar_type* l_p = (scalar_type*) &l[0];
|
||||
scalar_type* d_p = (scalar_type*) &d[0];
|
||||
scalar_type* s_p = (scalar_type*) &s[0];
|
||||
|
||||
for(int o=0; o<LLs; o++){ // outer
|
||||
for(int i=0; i<nsimd; i++){ //inner
|
||||
int s = o + i*LLs;
|
||||
int ss = o*nsimd + i;
|
||||
u_p[ss] = upper[s];
|
||||
l_p[ss] = lower[s];
|
||||
d_p[ss] = diag[s];
|
||||
s_p[ss] = shift_coeffs[s];
|
||||
}}
|
||||
|
||||
this->M5Dcalls++;
|
||||
this->M5Dtime -= usecond();
|
||||
|
||||
thread_loop( (int ss=0; ss<grid->oSites(); ss+=LLs),{ // adds LLs
|
||||
|
||||
int vs = (this->pm == 1) ? LLs-1 : 0;
|
||||
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
|
||||
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
|
||||
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
|
||||
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
|
||||
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
|
||||
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
|
||||
|
||||
for(int v=0; v<LLs; v++){
|
||||
|
||||
vprefetch(psi[ss+v+LLs]);
|
||||
|
||||
int vp = (v == LLs-1) ? 0 : v+1;
|
||||
int vm = (v == 0 ) ? LLs-1 : v-1;
|
||||
|
||||
Simd hp_00 = psi[ss+vp]()(0)(0);
|
||||
Simd hp_01 = psi[ss+vp]()(0)(1);
|
||||
Simd hp_02 = psi[ss+vp]()(0)(2);
|
||||
Simd hp_10 = psi[ss+vp]()(1)(0);
|
||||
Simd hp_11 = psi[ss+vp]()(1)(1);
|
||||
Simd hp_12 = psi[ss+vp]()(1)(2);
|
||||
|
||||
Simd hm_00 = psi[ss+vm]()(2)(0);
|
||||
Simd hm_01 = psi[ss+vm]()(2)(1);
|
||||
Simd hm_02 = psi[ss+vm]()(2)(2);
|
||||
Simd hm_10 = psi[ss+vm]()(3)(0);
|
||||
Simd hm_11 = psi[ss+vm]()(3)(1);
|
||||
Simd hm_12 = psi[ss+vm]()(3)(2);
|
||||
|
||||
if (vp <= v){
|
||||
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
|
||||
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
|
||||
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
|
||||
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
|
||||
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
|
||||
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == 1 && vs <= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
|
||||
}
|
||||
|
||||
if(vm >= v){
|
||||
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
|
||||
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
|
||||
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
|
||||
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
|
||||
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
|
||||
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
|
||||
}
|
||||
|
||||
if(this->pm == -1 && vs >= v){
|
||||
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
|
||||
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
|
||||
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
|
||||
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
|
||||
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
|
||||
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
|
||||
}
|
||||
|
||||
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
|
||||
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
|
||||
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
|
||||
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
|
||||
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
|
||||
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
|
||||
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
|
||||
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
|
||||
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
|
||||
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
|
||||
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
|
||||
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
|
||||
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
|
||||
|
||||
vstream(chi[ss+v]()(0)(0), p_00);
|
||||
vstream(chi[ss+v]()(0)(1), p_01);
|
||||
vstream(chi[ss+v]()(0)(2), p_02);
|
||||
vstream(chi[ss+v]()(1)(0), p_10);
|
||||
vstream(chi[ss+v]()(1)(1), p_11);
|
||||
vstream(chi[ss+v]()(1)(2), p_12);
|
||||
vstream(chi[ss+v]()(2)(0), p_20);
|
||||
vstream(chi[ss+v]()(2)(1), p_21);
|
||||
vstream(chi[ss+v]()(2)(2), p_22);
|
||||
vstream(chi[ss+v]()(3)(0), p_30);
|
||||
vstream(chi[ss+v]()(3)(1), p_31);
|
||||
vstream(chi[ss+v]()(3)(2), p_32);
|
||||
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
this->M5Dtime += usecond();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef AVX512
|
||||
#include<simd/Intel512common.h>
|
||||
#include<simd/Intel512avx.h>
|
||||
#include<simd/Intel512single.h>
|
||||
#endif
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi_i, FermionField& chi_i,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
auto psi = psi_i.View();
|
||||
auto chi = chi_i.View();
|
||||
#ifndef AVX512
|
||||
{
|
||||
SiteHalfSpinor BcastP;
|
||||
SiteHalfSpinor BcastM;
|
||||
SiteHalfSpinor SiteChiP;
|
||||
SiteHalfSpinor SiteChiM;
|
||||
|
||||
// Ls*Ls * 2 * 12 * vol flops
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
|
||||
|
||||
int s = s2 + l*LLs;
|
||||
int lex = s2 + LLs*site;
|
||||
|
||||
if( s2==0 && l==0 ){
|
||||
SiteChiP=Zero();
|
||||
SiteChiM=Zero();
|
||||
}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
|
||||
}}
|
||||
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
|
||||
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
|
||||
}}
|
||||
}}
|
||||
|
||||
{
|
||||
int lex = s1 + LLs*site;
|
||||
for(int sp=0; sp<2; sp++){
|
||||
for(int co=0; co<Nc; co++){
|
||||
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
|
||||
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
|
||||
}}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
{
|
||||
// pointers
|
||||
// MASK_REGS;
|
||||
#define Chi_00 %%zmm1
|
||||
#define Chi_01 %%zmm2
|
||||
#define Chi_02 %%zmm3
|
||||
#define Chi_10 %%zmm4
|
||||
#define Chi_11 %%zmm5
|
||||
#define Chi_12 %%zmm6
|
||||
#define Chi_20 %%zmm7
|
||||
#define Chi_21 %%zmm8
|
||||
#define Chi_22 %%zmm9
|
||||
#define Chi_30 %%zmm10
|
||||
#define Chi_31 %%zmm11
|
||||
#define Chi_32 %%zmm12
|
||||
|
||||
#define BCAST0 %%zmm13
|
||||
#define BCAST1 %%zmm14
|
||||
#define BCAST2 %%zmm15
|
||||
#define BCAST3 %%zmm16
|
||||
#define BCAST4 %%zmm17
|
||||
#define BCAST5 %%zmm18
|
||||
#define BCAST6 %%zmm19
|
||||
#define BCAST7 %%zmm20
|
||||
#define BCAST8 %%zmm21
|
||||
#define BCAST9 %%zmm22
|
||||
#define BCAST10 %%zmm23
|
||||
#define BCAST11 %%zmm24
|
||||
|
||||
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
|
||||
|
||||
for(int s1=0; s1<LLs; s1++){
|
||||
|
||||
for(int s2=0; s2<LLs; s2++){
|
||||
|
||||
int lex = s2 + LLs*site;
|
||||
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
|
||||
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
|
||||
uint64_t a2 = (uint64_t) &psi[lex];
|
||||
|
||||
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
|
||||
|
||||
if((s2+l)==0) {
|
||||
asm(
|
||||
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
|
||||
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
|
||||
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
|
||||
VBCASTCDUP(0,%2,BCAST0)
|
||||
VBCASTCDUP(1,%2,BCAST1)
|
||||
VBCASTCDUP(2,%2,BCAST2)
|
||||
VBCASTCDUP(3,%2,BCAST3)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
|
||||
VMULMEM(0,%1,BCAST8,Chi_22)
|
||||
VMULMEM(0,%1,BCAST9,Chi_30)
|
||||
VMULMEM(0,%1,BCAST10,Chi_31)
|
||||
VMULMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
} else {
|
||||
asm(
|
||||
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
|
||||
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
|
||||
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
|
||||
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
|
||||
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
|
||||
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
|
||||
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
|
||||
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
|
||||
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
|
||||
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
|
||||
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
|
||||
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
|
||||
: : "r" (a0), "r" (a1), "r" (a2) );
|
||||
}
|
||||
|
||||
a0 = a0 + incr;
|
||||
a1 = a1 + incr;
|
||||
a2 = a2 + sizeof(typename Simd::scalar_type);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int lexa = s1+LLs*site;
|
||||
asm (
|
||||
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
|
||||
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
|
||||
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
|
||||
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
|
||||
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef Chi_00
|
||||
#undef Chi_01
|
||||
#undef Chi_02
|
||||
#undef Chi_10
|
||||
#undef Chi_11
|
||||
#undef Chi_12
|
||||
#undef Chi_20
|
||||
#undef Chi_21
|
||||
#undef Chi_22
|
||||
#undef Chi_30
|
||||
#undef Chi_31
|
||||
#undef Chi_32
|
||||
|
||||
#undef BCAST0
|
||||
#undef BCAST1
|
||||
#undef BCAST2
|
||||
#undef BCAST3
|
||||
#undef BCAST4
|
||||
#undef BCAST5
|
||||
#undef BCAST6
|
||||
#undef BCAST7
|
||||
#undef BCAST8
|
||||
#undef BCAST9
|
||||
#undef BCAST10
|
||||
#undef BCAST11
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
// Z-mobius version
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
|
||||
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
|
||||
{
|
||||
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
|
||||
exit(-1);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
|
||||
{
|
||||
chi.Checkerboard() = psi.Checkerboard();
|
||||
|
||||
int Ls = this->Ls;
|
||||
int LLs = psi.Grid()->_rdimensions[0];
|
||||
int vol = psi.Grid()->oSites()/LLs;
|
||||
|
||||
Vector<iSinglet<Simd>> Matp;
|
||||
Vector<iSinglet<Simd>> Matm;
|
||||
Vector<iSinglet<Simd>>* _Matp;
|
||||
Vector<iSinglet<Simd>>* _Matm;
|
||||
|
||||
// MooeeInternalCompute(dag,inv,Matp,Matm);
|
||||
if(inv && dag){
|
||||
_Matp = &this->MatpInvDag;
|
||||
_Matm = &this->MatmInvDag;
|
||||
}
|
||||
|
||||
if(inv && (!dag)){
|
||||
_Matp = &this->MatpInv;
|
||||
_Matm = &this->MatmInv;
|
||||
}
|
||||
|
||||
if(!inv){
|
||||
MooeeInternalCompute(dag, inv, Matp, Matm);
|
||||
_Matp = &Matp;
|
||||
_Matm = &Matm;
|
||||
}
|
||||
|
||||
assert(_Matp->size() == Ls*LLs);
|
||||
|
||||
this->MooeeInvCalls++;
|
||||
this->MooeeInvTime -= usecond();
|
||||
|
||||
if(switcheroo<Coeff_t>::iscomplex()){
|
||||
thread_loop( (auto site=0; site<vol; site++),{
|
||||
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (auto site=0; site<vol; site++),{
|
||||
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
|
||||
});
|
||||
}
|
||||
|
||||
this->MooeeInvTime += usecond();
|
||||
}
|
||||
|
||||
#ifdef MOBIUS_EOFA_DPERP_VEC
|
||||
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
|
||||
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
|
||||
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
|
||||
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
|
||||
|
||||
#endif
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -1,242 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// *NOT* EO
|
||||
template <class Impl>
|
||||
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
|
||||
{
|
||||
FermionField temp(out.Grid());
|
||||
|
||||
// Wilson term
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
this->Dhop(in, out, DaggerNo);
|
||||
|
||||
// Clover term
|
||||
Mooee(in, temp);
|
||||
|
||||
out += temp;
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
FermionField temp(out.Grid());
|
||||
|
||||
// Wilson term
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
this->Dhop(in, out, DaggerYes);
|
||||
|
||||
// Clover term
|
||||
MooeeDag(in, temp);
|
||||
|
||||
out += temp;
|
||||
return norm2(out);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
{
|
||||
WilsonFermion<Impl>::ImportGauge(_Umu);
|
||||
GridBase *grid = _Umu.Grid();
|
||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||
|
||||
// Compute the field strength terms mu>nu
|
||||
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
|
||||
|
||||
// Compute the Clover Operator acting on Colour and Spin
|
||||
// multiply here by the clover coefficients for the anisotropy
|
||||
CloverTerm = fillCloverYZ(Bx) * csw_r;
|
||||
CloverTerm += fillCloverXZ(By) * csw_r;
|
||||
CloverTerm += fillCloverXY(Bz) * csw_r;
|
||||
CloverTerm += fillCloverXT(Ex) * csw_t;
|
||||
CloverTerm += fillCloverYT(Ey) * csw_t;
|
||||
CloverTerm += fillCloverZT(Ez) * csw_t;
|
||||
CloverTerm += diag_mass;
|
||||
|
||||
int lvol = _Umu.Grid()->lSites();
|
||||
int DimRep = Impl::Dimension;
|
||||
|
||||
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
|
||||
Coordinate lcoor;
|
||||
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
||||
|
||||
for (int site = 0; site < lvol; site++)
|
||||
{
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||
peekLocalSite(Qx, CloverTerm, lcoor);
|
||||
Qxinv = Zero();
|
||||
//if (csw!=0){
|
||||
for (int j = 0; j < Ns; j++)
|
||||
for (int k = 0; k < Ns; k++)
|
||||
for (int a = 0; a < DimRep; a++)
|
||||
for (int b = 0; b < DimRep; b++){
|
||||
auto zz = Qx()(j, k)(a, b);
|
||||
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
|
||||
}
|
||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
|
||||
|
||||
EigenInvCloverOp = EigenCloverOp.inverse();
|
||||
//std::cout << EigenInvCloverOp << std::endl;
|
||||
for (int j = 0; j < Ns; j++)
|
||||
for (int k = 0; k < Ns; k++)
|
||||
for (int a = 0; a < DimRep; a++)
|
||||
for (int b = 0; b < DimRep; b++)
|
||||
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||
// }
|
||||
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
|
||||
}
|
||||
|
||||
// Separate the even and odd parts
|
||||
pickCheckerboard(Even, CloverTermEven, CloverTerm);
|
||||
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
|
||||
|
||||
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
|
||||
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
|
||||
|
||||
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
|
||||
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
|
||||
|
||||
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
|
||||
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerNo, InverseNo);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerYes, InverseNo);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerNo, InverseYes);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
this->MooeeInternal(in, out, DaggerYes, InverseYes);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
CloverFieldType *Clover;
|
||||
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||
|
||||
if (dag)
|
||||
{
|
||||
if (in.Grid()->_isCheckerBoarded)
|
||||
{
|
||||
if (in.Checkerboard() == Odd)
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
|
||||
}
|
||||
out = *Clover * in;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInv : &CloverTerm;
|
||||
out = adj(*Clover) * in;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (in.Grid()->_isCheckerBoarded)
|
||||
{
|
||||
|
||||
if (in.Checkerboard() == Odd)
|
||||
{
|
||||
// std::cout << "Calling clover term Odd" << std::endl;
|
||||
Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
|
||||
}
|
||||
else
|
||||
{
|
||||
// std::cout << "Calling clover term Even" << std::endl;
|
||||
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
|
||||
}
|
||||
out = *Clover * in;
|
||||
// std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
Clover = (inv) ? &CloverTermInv : &CloverTerm;
|
||||
out = *Clover * in;
|
||||
}
|
||||
}
|
||||
|
||||
} // MooeeInternal
|
||||
|
||||
|
||||
// Derivative parts
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
// Derivative parts
|
||||
template <class Impl>
|
||||
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||
{
|
||||
assert(0); // not implemented yet
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -386,11 +386,9 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
DhopTotalTime-=usecond();
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||
DhopTotalTime+=usecond();
|
||||
}
|
||||
@ -401,111 +399,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
Compressor compressor(dag);
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
int len = U.Grid()->oSites();
|
||||
|
||||
/////////////////////////////
|
||||
// Start comms // Gather intranode and extra node differentiated??
|
||||
/////////////////////////////
|
||||
DhopFaceTime-=usecond();
|
||||
st.HaloExchangeOptGather(in,compressor);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
double ctime=0;
|
||||
double ptime=0;
|
||||
DhopCommTime -=usecond();
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
double start = usecond();
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = U.Grid()->oSites();
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
/////////////////////////////
|
||||
// Overlap with comms
|
||||
/////////////////////////////
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
// do the compute
|
||||
if (dag == DaggerYes) {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
int sF = LLs * sU;
|
||||
Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
|
||||
}
|
||||
} else {
|
||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||
int sU = ss;
|
||||
int sF = LLs * sU;
|
||||
Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,1,0);
|
||||
}
|
||||
}
|
||||
ptime = usecond() - start;
|
||||
}
|
||||
{
|
||||
double start = usecond();
|
||||
st.CommunicateThreaded();
|
||||
ctime = usecond() - start;
|
||||
}
|
||||
/////////////////////////////
|
||||
// do the compute interior
|
||||
/////////////////////////////
|
||||
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
||||
DhopComputeTime-=usecond();
|
||||
if (dag == DaggerYes) {
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||
} else {
|
||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||
}
|
||||
DhopCommTime += ctime;
|
||||
DhopComputeTime+=ptime;
|
||||
DhopComputeTime+=usecond();
|
||||
|
||||
// First to enter, last to leave timing
|
||||
st.CollateThreads();
|
||||
/////////////////////////////
|
||||
// Complete comms
|
||||
/////////////////////////////
|
||||
st.CommunicateComplete(requests);
|
||||
DhopCommTime +=usecond();
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute exterior
|
||||
/////////////////////////////
|
||||
DhopFaceTime-=usecond();
|
||||
st.CommsMerge(compressor);
|
||||
DhopFaceTime+=usecond();
|
||||
|
||||
DhopComputeTime2-=usecond();
|
||||
if (dag == DaggerYes) {
|
||||
int sz=st.surface_list.size();
|
||||
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||
int sU = st.surface_list[ss];
|
||||
int sF = LLs * sU;
|
||||
Kernels::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
|
||||
});
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||
} else {
|
||||
int sz=st.surface_list.size();
|
||||
thread_loop( (int ss = 0; ss < sz; ss++) ,{
|
||||
int sU = st.surface_list[ss];
|
||||
int sF = LLs * sU;
|
||||
Kernels::DhopSite(Opt,st,U_v,st.CommBuf(),sF,sU,LLs,1,in_v,out_v,0,1);
|
||||
});
|
||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||
}
|
||||
DhopComputeTime2+=usecond();
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in,
|
||||
FermionField &out,int dag)
|
||||
{
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
Compressor compressor(dag);
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
@ -515,24 +472,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
||||
DhopCommTime+=usecond();
|
||||
|
||||
DhopComputeTime-=usecond();
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
|
||||
auto U_v = U.View();
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
if (dag == DaggerYes) {
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
|
||||
// parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
|
||||
// int sU = ss;
|
||||
// int sF = LLs * sU;
|
||||
// Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||
// }
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||
} else {
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U_v.size(),in,out);
|
||||
// parallel_for (int ss = 0; ss < U.Grid()->oSites(); ss++) {
|
||||
// int sU = ss;
|
||||
// int sF = LLs * sU;
|
||||
// Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
|
||||
// }
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||
}
|
||||
DhopComputeTime+=usecond();
|
||||
}
|
||||
|
@ -375,78 +375,47 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag) {
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
#ifdef GRID_OMP
|
||||
|
||||
Compressor compressor(dag);
|
||||
int len = U.Grid()->oSites();
|
||||
const int LLs = 1;
|
||||
|
||||
/////////////////////////////
|
||||
// Start comms // Gather intranode and extra node differentiated??
|
||||
/////////////////////////////
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
st.Prepare();
|
||||
st.HaloGather(in,compressor);
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
/////////////////////////////
|
||||
// Overlap with comms
|
||||
/////////////////////////////
|
||||
st.CommsMergeSHM(compressor);
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
int nthreads = omp_get_num_threads();
|
||||
int ncomms = CartesianCommunicator::nCommThreads;
|
||||
if (ncomms == -1) ncomms = 1;
|
||||
assert(nthreads > ncomms);
|
||||
if (tid >= ncomms) {
|
||||
nthreads -= ncomms;
|
||||
int ttid = tid - ncomms;
|
||||
int n = len;
|
||||
int chunk = n / nthreads;
|
||||
int rem = n % nthreads;
|
||||
int myblock, myn;
|
||||
if (ttid < rem) {
|
||||
myblock = ttid * chunk + ttid;
|
||||
myn = chunk+1;
|
||||
} else {
|
||||
myblock = ttid*chunk + rem;
|
||||
myn = chunk;
|
||||
}
|
||||
// do the compute
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
|
||||
if (dag == DaggerYes) {
|
||||
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||
Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
|
||||
// Kernels::DhopSiteDag(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
|
||||
}
|
||||
} else {
|
||||
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||
Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,1,0);
|
||||
// Kernels::DhopSite(st_v, lo, U_v, st.CommBuf(), sss, sss, 1, 1, in_v, out_v);
|
||||
}
|
||||
}
|
||||
/////////////////////////////
|
||||
// do the compute interior
|
||||
/////////////////////////////
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
if (dag == DaggerYes) {
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||
} else {
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||
}
|
||||
|
||||
} else {
|
||||
st.CommunicateThreaded();
|
||||
}
|
||||
} //pragma
|
||||
/////////////////////////////
|
||||
// Complete comms
|
||||
/////////////////////////////
|
||||
st.CommunicateComplete(requests);
|
||||
st.CommsMerge(compressor);
|
||||
|
||||
{
|
||||
auto U_v = U.View();
|
||||
auto in_v = in.View();
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
if (dag == DaggerYes) {
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||
Kernels::DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
|
||||
});
|
||||
} else {
|
||||
thread_loop( (int sss = 0; sss < in.Grid()->oSites(); sss++) ,{
|
||||
Kernels::DhopSite(Opt,st_v,U_v,st.CommBuf(),sss,sss,1,1,in_v,out_v,0,1);
|
||||
});
|
||||
}
|
||||
/////////////////////////////
|
||||
// do the compute exterior
|
||||
/////////////////////////////
|
||||
if (dag == DaggerYes) {
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||
} else {
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||
}
|
||||
#else
|
||||
assert(0);
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
@ -73,7 +73,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef GPU_VEC
|
||||
#if 1
|
||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||
if (SE._is_local) { \
|
||||
int mask = Nsimd >> (ptype + 1); \
|
||||
@ -96,7 +96,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
spProj(chi, in_t); \
|
||||
} \
|
||||
} else { \
|
||||
chi = buf[SE._offset+s]; \
|
||||
chi = (buf[SE._offset+s]; \
|
||||
} \
|
||||
synchronise();
|
||||
#endif
|
||||
@ -106,15 +106,9 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
|
||||
SiteHalfSpinor *buf, int Ls, int s,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
typename SiteHalfSpinor::scalar_object chi;
|
||||
typename SiteHalfSpinor::scalar_object Uchi;
|
||||
typename SiteSpinor::scalar_object result;
|
||||
#else
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
#endif
|
||||
|
||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
||||
typedef typename SiteSpinor::vector_type vector_type;
|
||||
@ -173,11 +167,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Sit
|
||||
GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
||||
accumReconTm(result, Uchi);
|
||||
#ifdef GPU_VEC
|
||||
insertLane (lane,out[sF],result);
|
||||
#else
|
||||
vstream(out[sF], result);
|
||||
#endif
|
||||
insertLane (lane,out[sF],result);
|
||||
}
|
||||
}
|
||||
|
||||
@ -186,15 +176,10 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
|
||||
SiteHalfSpinor *buf, int Ls, int s,
|
||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
typename SiteHalfSpinor::scalar_object chi;
|
||||
typename SiteHalfSpinor::scalar_object Uchi;
|
||||
typename SiteSpinor::scalar_object result;
|
||||
#else
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
#endif
|
||||
|
||||
typedef typename SiteSpinor::scalar_type scalar_type;
|
||||
typedef typename SiteSpinor::vector_type vector_type;
|
||||
constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
|
||||
@ -255,11 +240,7 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
|
||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
||||
accumReconTp(result, Uchi);
|
||||
|
||||
#ifdef GPU_VEC
|
||||
insertLane (lane,out[sF],result);
|
||||
#else
|
||||
vstream(out[sF], result);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@ -287,6 +268,25 @@ GPU_EMPTY(GparityWilsonImplFH);
|
||||
GPU_EMPTY(GparityWilsonImplD);
|
||||
GPU_EMPTY(GparityWilsonImplDF);
|
||||
|
||||
#define KERNEL_CALL(A) \
|
||||
const uint64_t nsimd = Simd::Nsimd(); \
|
||||
const uint64_t NN = Nsite*Ls*nsimd;\
|
||||
accelerator_loopN( sss, NN, { \
|
||||
uint64_t cur = sss; \
|
||||
cur = cur / nsimd; \
|
||||
uint64_t s = cur%Ls; \
|
||||
cur = cur / Ls; \
|
||||
uint64_t sU = cur; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);\
|
||||
});
|
||||
|
||||
#define HOST_CALL(A) \
|
||||
accelerator_loopN( ss, Ls*Nsite, { \
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v); \
|
||||
});
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
@ -297,25 +297,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
|
||||
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
||||
#define KERNEL_CALL(A) \
|
||||
const uint64_t nsimd = Simd::Nsimd(); \
|
||||
const uint64_t NN = Nsite*Ls*nsimd;\
|
||||
accelerator_loopN( sss, NN, { \
|
||||
uint64_t cur = sss; \
|
||||
cur = cur / nsimd; \
|
||||
uint64_t s = cur%Ls; \
|
||||
cur = cur / Ls; \
|
||||
uint64_t sU = cur;
|
||||
WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
|
||||
});
|
||||
} else {
|
||||
accelerator_loop( ss, U_v, {
|
||||
int sU = ss;
|
||||
int sF = Ls * sU;
|
||||
WilsonKernels<Impl>::GenericDhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
||||
});
|
||||
}
|
||||
if( interior && exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGpu) {
|
||||
KERNEL_CALL(GpuDhopSite);
|
||||
} else {
|
||||
HOST_CALL(GenericDhopSite);
|
||||
}
|
||||
} else if( interior ) {
|
||||
HOST_CALL(GenericDhopSiteInt);
|
||||
} else if( exterior ) {
|
||||
HOST_CALL(GenericDhopSiteExt);
|
||||
}
|
||||
|
||||
}
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
@ -327,25 +320,16 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
auto out_v = out.View();
|
||||
auto st_v = st.View();
|
||||
|
||||
if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
|
||||
const uint64_t nsimd = Simd::Nsimd();
|
||||
const uint64_t NN = Nsite*Ls*nsimd;
|
||||
accelerator_loopN( sss, NN, {
|
||||
uint64_t cur = sss;
|
||||
// uint64_t lane = cur % nsimd;
|
||||
cur = cur / nsimd;
|
||||
uint64_t s = cur%Ls;
|
||||
// uint64_t sF = cur;
|
||||
cur = cur / Ls;
|
||||
uint64_t sU = cur;
|
||||
WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
|
||||
});
|
||||
} else {
|
||||
accelerator_loop( ss, U_v, {
|
||||
int sU = ss;
|
||||
int sF = Ls * sU;
|
||||
WilsonKernels<Impl>::GenericDhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
|
||||
});
|
||||
if( interior && exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGpu) {
|
||||
KERNEL_CALL(GpuDhopSiteDag);
|
||||
} else {
|
||||
HOST_CALL(GenericDhopSiteDag);
|
||||
}
|
||||
} else if( interior ) {
|
||||
HOST_CALL(GenericDhopSiteDagInt);
|
||||
} else if( exterior ) {
|
||||
HOST_CALL(GenericDhopSiteDagExt);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -267,7 +267,6 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
|
||||
int ptype;
|
||||
|
||||
SE = st.GetEntry(ptype, dir, sF);
|
||||
// GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
|
||||
if (gamma == Xp) {
|
||||
if (SE->_is_local ) {
|
||||
int perm= SE->_permute;
|
||||
|
@ -1,97 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*
|
||||
* BF sequence
|
||||
*
|
||||
void bfmbase<Float>::MooeeInv(Fermion_t psi,
|
||||
Fermion_t chi,
|
||||
int dag, int cb)
|
||||
|
||||
double m = this->mass;
|
||||
double tm = this->twistedmass;
|
||||
double mtil = 4.0+this->mass;
|
||||
|
||||
double sq = mtil*mtil + tm*tm;
|
||||
|
||||
double a = mtil/sq;
|
||||
double b = -tm /sq;
|
||||
if(dag) b=-b;
|
||||
axpibg5x(chi,psi,a,b);
|
||||
|
||||
void bfmbase<Float>::Mooee(Fermion_t psi,
|
||||
Fermion_t chi,
|
||||
int dag,int cb)
|
||||
double a = 4.0+this->mass;
|
||||
double b = this->twistedmass;
|
||||
if(dag) b=-b;
|
||||
axpibg5x(chi,psi,a,b);
|
||||
*/
|
||||
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||
RealD a = 4.0+this->mass;
|
||||
RealD b = this->mu;
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||
RealD a = 4.0+this->mass;
|
||||
RealD b = -this->mu;
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||
RealD m = this->mass;
|
||||
RealD tm = this->mu;
|
||||
RealD mtil = 4.0+m;
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
RealD a = mtil/sq;
|
||||
RealD b = -tm /sq;
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
|
||||
RealD m = this->mass;
|
||||
RealD tm = this->mu;
|
||||
RealD mtil = 4.0+m;
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
RealD a = mtil/sq;
|
||||
RealD b = tm /sq;
|
||||
axpibg5x(out,in,a,b);
|
||||
}
|
||||
|
||||
FermOpTemplateInstantiate(WilsonTMFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1,45 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h>
|
||||
|
||||
//#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h>
|
||||
//#include <Grid/qcd/action/fermion/implementation/CayleyFermion5Dgpu.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// FIXME: Break these out to parallel make accelerate
|
||||
FermOpTemplateInstantiate(CayleyFermion5D);
|
||||
GparityFermOpTemplateInstantiate(CayleyFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -0,0 +1,37 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ContinuedFractionFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
FermOpTemplateInstantiate(ContinuedFractionFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1,45 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: David Murphy <dmurphy@phys.columbia.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid_Eigen_Dense.h>
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
FermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||
GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1,46 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h>
|
||||
#include <Grid/perfmon/PerfCount.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
|
||||
const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
|
||||
FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
|
@ -0,0 +1,39 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1,38 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
FermOpTemplateInstantiate(PartialFractionFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -0,0 +1,43 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi, Peter Boyle
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
||||
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
|
||||
|
||||
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
|
||||
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -0,0 +1,42 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
|
||||
|
||||
Copyright (C) 2017
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
FermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1,40 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
FermOpTemplateInstantiate(WilsonFermion5D);
|
||||
GparityFermOpTemplateInstantiate(WilsonFermion5D);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -0,0 +1,46 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
|
||||
const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
|
||||
int WilsonFermionStatic::HandOptDslash;
|
||||
|
||||
FermOpTemplateInstantiate(WilsonFermion);
|
||||
AdjointFermOpTemplateInstantiate(WilsonFermion);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonFermion);
|
||||
GparityFermOpTemplateInstantiate(WilsonFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -0,0 +1,51 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// Move these
|
||||
int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric;
|
||||
int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
|
||||
|
||||
// FIXME: Break these out to parallel make
|
||||
FermOpTemplateInstantiate(WilsonKernels);
|
||||
GparityFermOpTemplateInstantiate(WilsonKernels); // Specialisation in Gparity forces instantiation
|
||||
AdjointFermOpTemplateInstantiate(WilsonKernels);
|
||||
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -0,0 +1,36 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonTMFermion.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonTMFermion.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
FermOpTemplateInstantiate(WilsonTMFermion);
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -267,11 +267,12 @@ struct getVectorType{
|
||||
template<typename T>
|
||||
class isSIMDvectorized{
|
||||
template<typename U>
|
||||
static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,
|
||||
typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
|
||||
static typename std::enable_if<
|
||||
!std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,
|
||||
typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value,
|
||||
char>::type test(void *);
|
||||
|
||||
template<typename U>
|
||||
static double test(...);
|
||||
template<typename U> static double test(...);
|
||||
|
||||
public:
|
||||
enum {value = sizeof(test<T>(0)) == sizeof(char) };
|
||||
|
Reference in New Issue
Block a user