1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

Assembler possibly working

This commit is contained in:
azusayamaguchi 2016-12-16 16:55:36 +00:00
parent 426197e446
commit eabc577940
11 changed files with 784 additions and 94 deletions

View File

@ -416,7 +416,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
#endif
#endif
BACKTRACE();
exit(0);
if ( si->si_signo != SIGTRAP ) exit(0);
return;
};

View File

@ -113,6 +113,10 @@ typedef SymanzikGaugeAction<ConjugateGimplD> ConjugateSymanzikGaugeAction
template class A<StaggeredImplF>; \
template class A<StaggeredImplD>;
#define FermOpStaggeredVec5dTemplateInstantiate(A) \
template class A<StaggeredVec5dImplF>; \
template class A<StaggeredVec5dImplD>;
#define FermOp4dVecTemplateInstantiate(A) \
template class A<WilsonImplF>; \
template class A<WilsonImplD>; \
@ -284,6 +288,10 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
}}
///////////////////////////////////////////////////////////////////////////////

View File

@ -224,12 +224,13 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef Lattice<SiteSpinor> FermionField;
/////////////////////////////////////////////////
// Make the doubled gauge field a *scalar*
/////////////////////////////////////////////////
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor;
@ -261,11 +262,11 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)
{
SiteScalarGaugeField ScalarUmu;
SiteScalarGaugeField ScalarUmu;
SiteDoubledGaugeField ScalarUds;
GaugeLinkField U(Umu._grid);
GaugeField Uadj(Umu._grid);
GaugeField Uadj(Umu._grid);
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
U = adj(Cshift(U, mu, -1));
@ -631,6 +632,184 @@ PARALLEL_FOR_LOOP
/////////////////////////////////////////////////////////////////////////////
// Single flavour one component spinors with colour index. 5d vec
/////////////////////////////////////////////////////////////////////////////
template <class S, class Representation = FundamentalRepresentation >
class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
typedef RealD _Coeff_t ;
static const int Dimension = Representation::Dimension;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
const bool LsVectorised=true;
typedef _Coeff_t Coeff_t;
INHERIT_GIMPL_TYPES(Gimpl);
template <typename vtype> using iImplScalar = iScalar<iScalar<iScalar<vtype> > >;
template <typename vtype> using iImplSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
template <typename vtype> using iImplHalfSpinor = iScalar<iScalar<iVector<vtype, Dimension> > >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
// Make the doubled gauge field a *scalar*
typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
typedef iImplGaugeLink<typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef iImplScalar<Simd> SiteComplex;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef Lattice<SiteComplex> ComplexField;
typedef Lattice<SiteSpinor> FermionField;
typedef SimpleCompressor<SiteSpinor> Compressor;
typedef StaggeredImplParams ImplParams;
typedef CartesianStencil<SiteSpinor, SiteSpinor> StencilImpl;
ImplParams Params;
StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
vsplat(reg, memory);
}
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu) {
SiteGaugeLink UU;
for (int i = 0; i < Dimension; i++) {
for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
mult(&phi(), &UU(), &chi());
}
inline void multLinkAdd(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu) {
SiteGaugeLink UU;
for (int i = 0; i < Dimension; i++) {
for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
mac(&phi(), &UU(), &chi());
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term
DoubledGaugeField &Uds,
const GaugeField &Uthin,
const GaugeField &Ufat)
{
GridBase * InputGrid = Uthin._grid;
conformable(InputGrid,Ufat._grid);
GaugeLinkField U(InputGrid);
GaugeLinkField UU(InputGrid);
GaugeLinkField UUU(InputGrid);
GaugeLinkField Udag(InputGrid);
GaugeLinkField UUUdag(InputGrid);
for (int mu = 0; mu < Nd; mu++) {
// Staggered Phase.
Lattice<iScalar<vInteger> > coor(InputGrid);
Lattice<iScalar<vInteger> > x(InputGrid); LatticeCoordinate(x,0);
Lattice<iScalar<vInteger> > y(InputGrid); LatticeCoordinate(y,1);
Lattice<iScalar<vInteger> > z(InputGrid); LatticeCoordinate(z,2);
Lattice<iScalar<vInteger> > t(InputGrid); LatticeCoordinate(t,3);
Lattice<iScalar<vInteger> > lin_z(InputGrid); lin_z=x+y;
Lattice<iScalar<vInteger> > lin_t(InputGrid); lin_t=x+y+z;
ComplexField phases(InputGrid); phases=1.0;
if ( mu == 1 ) phases = where( mod(x ,2)==(Integer)0, phases,-phases);
if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
// 1 hop based on fat links
U = PeekIndex<LorentzIndex>(Ufat, mu);
Udag = adj( Cshift(U, mu, -1));
U = U *phases;
Udag = Udag *phases;
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, Uds, lcoor);
peekLocalSite(ScalarU, U, lcoor);
ScalarUds(mu) = ScalarU();
peekLocalSite(ScalarU, Udag, lcoor);
ScalarUds(mu + 4) = ScalarU();
pokeLocalSite(ScalarUds, Uds, lcoor);
}
// 3 hop based on thin links. Crazy huh ?
U = PeekIndex<LorentzIndex>(Uthin, mu);
UU = Gimpl::CovShiftForward(U,mu,U);
UUU= Gimpl::CovShiftForward(U,mu,UU);
UUUdag = adj( Cshift(UUU, mu, -3));
UUU = UUU *phases;
UUUdag = UUUdag *phases;
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, UUUds, lcoor);
peekLocalSite(ScalarU, UUU, lcoor);
ScalarUds(mu) = ScalarU();
peekLocalSite(ScalarU, UUUdag, lcoor);
ScalarUds(mu + 4) = ScalarU();
pokeLocalSite(ScalarUds, UUUds, lcoor);
}
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
assert (0);
}
};
typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
@ -663,6 +842,10 @@ PARALLEL_FOR_LOOP
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float
typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD; // Double
typedef StaggeredVec5dImpl<vComplex, FundamentalRepresentation > StaggeredVec5dImplR; // Real.. whichever prec
typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF; // Float
typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD; // Double
}}
#endif

View File

@ -69,37 +69,57 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._checker_dim==1);
// Dimension zero of the five-d is the Ls direction
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
// extent of fifth dim and not spread out
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==1);
assert(FiveDimRedBlackGrid._processors[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
}
if (Impl::LsVectorised) {
int nsimd = Simd::Nsimd();
// Dimension zero of the five-d is the Ls direction
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
for(int d=0;d<4;d++){
assert(FourDimGrid._simd_layout[d]=1);
assert(FourDimRedBlackGrid._simd_layout[d]=1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
}
} else {
// Dimension zero of the five-d is the Ls direction
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
assert(FiveDimGrid._simd_layout[0] ==1);
}
// Allocate the required comms buffer
ImportGauge(_Uthin,_Ufat);
}
@ -112,8 +132,6 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
{
GaugeLinkField U(GaugeGrid());
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
////////////////////////////////////////////////////////
@ -126,7 +144,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
auto U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
@ -221,7 +239,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
for (int ss = 0; ss < U._grid->oSites(); ss++) {
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
int sF=s+LLs*sU;
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), sF, sU, in, out);
}}
} else {
@ -229,7 +247,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
for (int ss = 0; ss < U._grid->oSites(); ss++) {
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
int sF=s+LLs*sU;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),sF,sU,in,out);
}}
}
@ -335,8 +353,8 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
}
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
}}

View File

@ -192,38 +192,51 @@ void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, Dou
int oneLink =0;
int threeLink=1;
switch(Opt) {
case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]);
break;
case OptHandUnroll:
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =-naive-naik;
break;
case OptGeneric:
default:
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =-naive-naik;
break;
default:
assert(0);
break;
}
out._odata[sF] =-naive-naik;
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out) {
SiteSpinor naik;
SiteSpinor naive;
int oneLink =0;
int threeLink=1;
SiteSpinor naik;
SiteSpinor naive;
static int once;
switch(Opt) {
case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]);
break;
case OptHandUnroll:
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =naive+naik;
break;
case OptGeneric:
default:
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =naive+naik;
break;
default:
assert(0);
break;
}
out._odata[sF] =naive+naik;
};
template <class Impl>
@ -238,6 +251,7 @@ void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U, Do
}
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
}}

View File

@ -58,6 +58,9 @@ public:
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, SiteSpinor &out);
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);

View File

@ -0,0 +1,327 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>
#include <simd/Intel512single.h>
// Interleave operations from two directions
// This looks just like a 2 spin multiply and reuse same sequence from the Wilson
// Kernel. But the spin index becomes a mu index instead.
#define Chi_00 %zmm0
#define Chi_01 %zmm1
#define Chi_02 %zmm2
#define Chi_10 %zmm3
#define Chi_11 %zmm4
#define Chi_12 %zmm5
#define Chi_20 %zmm6
#define Chi_21 %zmm7
#define Chi_22 %zmm8
#define Chi_30 %zmm9
#define Chi_31 %zmm10
#define Chi_32 %zmm11
#define UChi_00 %zmm12
#define UChi_01 %zmm13
#define UChi_02 %zmm14
#define UChi_10 %zmm15
#define UChi_11 %zmm16
#define UChi_12 %zmm17
#define UChi_20 %zmm18
#define UChi_21 %zmm19
#define UChi_22 %zmm20
#define UChi_30 %zmm21
#define UChi_31 %zmm22
#define UChi_32 %zmm23
#define pChi_00 %%zmm0
#define pChi_01 %%zmm1
#define pChi_02 %%zmm2
#define pChi_10 %%zmm3
#define pChi_11 %%zmm4
#define pChi_12 %%zmm5
#define pChi_20 %%zmm6
#define pChi_21 %%zmm7
#define pChi_22 %%zmm8
#define pChi_30 %%zmm9
#define pChi_31 %%zmm10
#define pChi_32 %%zmm11
#define pUChi_00 %%zmm12
#define pUChi_01 %%zmm13
#define pUChi_02 %%zmm14
#define pUChi_10 %%zmm15
#define pUChi_11 %%zmm16
#define pUChi_12 %%zmm17
#define pUChi_20 %%zmm18
#define pUChi_21 %%zmm19
#define pUChi_22 %%zmm20
#define pUChi_30 %%zmm21
#define pUChi_31 %%zmm22
#define pUChi_32 %%zmm23
#define T0 %zmm24
#define T1 %zmm25
#define T2 %zmm26
#define T3 %zmm27
#define MULT_ADD_LS(g0,g1,g2,g3) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" \
"movq %2, %%r10 \n\t" \
"movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
asm ( \
VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10) \
VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11) \
VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12) \
VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30) \
VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31) \
VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
#define MULT_LS(g0,g1,g2,g3) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" \
"movq %2, %%r10 \n\t" \
"movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
asm ( \
VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10) \
VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11) \
VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12) \
VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30) \
VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31) \
VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
#define LOAD_CHI(a0,a1,a2,a3) \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_00) \
VLOAD(1,%%r8,pChi_01) \
VLOAD(2,%%r8,pChi_02) \
: : "r" (a0) : "%r8" ); \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_10) \
VLOAD(1,%%r8,pChi_11) \
VLOAD(2,%%r8,pChi_12) \
: : "r" (a1) : "%r8" ); \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_20) \
VLOAD(1,%%r8,pChi_21) \
VLOAD(2,%%r8,pChi_22) \
: : "r" (a2) : "%r8" ); \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_30) \
VLOAD(1,%%r8,pChi_31) \
VLOAD(2,%%r8,pChi_32) \
: : "r" (a3) : "%r8" );
#define PF_CHI(a0) \
asm ( \
"movq %0, %%r8 \n\t" \
VPREFETCH1(0,%%r8) \
VPREFETCH1(1,%%r8) \
VPREFETCH1(2,%%r8) \
: : "r" (a0) : "%r8" ); \
#define REDUCE(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) \
VADD(UChi_30,UChi_20,UChi_30) \
VADD(UChi_31,UChi_21,UChi_31) \
VADD(UChi_32,UChi_22,UChi_32) \
VADD(UChi_00,UChi_30,UChi_00) \
VADD(UChi_01,UChi_31,UChi_01) \
VADD(UChi_02,UChi_32,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\
permute##dir(Chi_1,Chi_1);\
permute##dir(Chi_2,Chi_2);
namespace Grid {
namespace QCD {
// This is the single precision 5th direction vectorised kernel
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int sF,
int sU, const FermionField &in, SiteSpinor &out)
{
uint64_t gauge0,gauge1,gauge2,gauge3;
uint64_t addr0,addr1,addr2,addr3;
int o0,o1,o2,o3; // offsets
int l0,l1,l2,l3; // local
int p0,p1,p2,p3; // perm
int ptype;
StencilEntry *SE0;
StencilEntry *SE1;
StencilEntry *SE2;
StencilEntry *SE3;
// Xp, Yp, Zp, Tp
#define PREPARE(X,Y,Z,T,skew,UU) \
SE0=st.GetEntry(ptype,X+skew,sF); \
o0 = SE0->_offset; \
l0 = SE0->_is_local; \
addr0 = l0 ? (uint64_t) &in._odata[o0] : (uint64_t) &buf[o0]; \
PF_CHI(addr0); \
\
SE1=st.GetEntry(ptype,Y+skew,sF); \
o1 = SE1->_offset; \
l1 = SE1->_is_local; \
addr1 = l1 ? (uint64_t) &in._odata[o1] : (uint64_t) &buf[o1]; \
PF_CHI(addr1); \
\
SE2=st.GetEntry(ptype,Z+skew,sF); \
o2 = SE2->_offset; \
l2 = SE2->_is_local; \
addr2 = l2 ? (uint64_t) &in._odata[o2] : (uint64_t) &buf[o2]; \
PF_CHI(addr2); \
\
SE3=st.GetEntry(ptype,T+skew,sF); \
o3 = SE3->_offset; \
l3 = SE3->_is_local; \
addr3 = l3 ? (uint64_t) &in._odata[o3] : (uint64_t) &buf[o3]; \
PF_CHI(addr3); \
\
gauge0 =(uint64_t)&UU._odata[sU]( X ); \
gauge1 =(uint64_t)&UU._odata[sU]( Y ); \
gauge2 =(uint64_t)&UU._odata[sU]( Z ); \
gauge3 =(uint64_t)&UU._odata[sU]( T );
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out;
REDUCE(addr0);
}
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
}}

View File

@ -279,5 +279,6 @@ void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &l
}
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
}}

View File

@ -62,71 +62,55 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
// extent of fifth dim and not spread out
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._processors[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
}
if (Impl::LsVectorised) {
int nsimd = Simd::Nsimd();
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
assert(FourDimGrid._ndimension==4);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimGrid._simd_layout[d]=1);
assert(FourDimRedBlackGrid._simd_layout[d]=1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
} else {
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._checker_dim==1);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
}
// Allocate the required comms buffer

View File

@ -51,7 +51,6 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
@ -66,7 +65,10 @@ int main (int argc, char ** argv)
typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;
typename ImprovedStaggeredFermion5DR::ImplParams params;
FermionField src (FGrid); random(pRNG5,src);
FermionField src (FGrid);
random(pRNG5,src);
FermionField result(FGrid); result=zero;
FermionField ref(FGrid); ref=zero;
FermionField tmp(FGrid); tmp=zero;
@ -75,6 +77,7 @@ int main (int argc, char ** argv)
FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu);
LatticeGaugeField Umua(UGrid); Umua=Umu;
double volume=Ls;
for(int mu=0;mu<Nd;mu++){
@ -93,19 +96,9 @@ int main (int argc, char ** argv)
}
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
if ( mu!=0 ) U[mu]=zero;
PokeIndex<LorentzIndex>(Umu5d,U[mu],mu);
}
std::vector<LatticeColourMatrix> Ua(4,UGrid);
for(int mu=0;mu<Nd;mu++){
Ua[mu] = PeekIndex<LorentzIndex>(Umu,mu);
if ( mu!=0 ) {
Ua[mu]=zero;
}
PokeIndex<LorentzIndex>(Umu,Ua[mu],mu);
}
RealD mass=0.1;
@ -171,6 +164,8 @@ int main (int argc, char ** argv)
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
// std::cout<<GridLogMessage << "result"<< result <<std::endl;
// std::cout<<GridLogMessage << "ref "<< ref <<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;

View File

@ -0,0 +1,157 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_wilson.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG4(UGrid);
GridParallelRNG pRNG5(FGrid);
pRNG4.SeedFixedIntegers(seeds);
pRNG5.SeedFixedIntegers(seeds);
typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField;
typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField;
typename ImprovedStaggeredFermion5DR::ImplParams params;
FermionField src (FGrid);
//random(pRNG5,src);
std::vector<int> site({0,0,0,0,0});
ColourVector cv = zero;
cv()()(0)=1.0;
src = zero;
pokeSite(cv,src,site);
FermionField result(FGrid); result=zero;
FermionField tmp(FGrid); tmp=zero;
FermionField err(FGrid); tmp=zero;
FermionField phi (FGrid); random(pRNG5,phi);
FermionField chi (FGrid); random(pRNG5,chi);
LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu);
double volume=Ls;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params);
ImprovedStaggeredFermionVec5dR sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params);
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
int ncall=1000;
double t0=usecond();
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t1=usecond();
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
FermionField ssrc (sFGrid); localConvert(src,ssrc);
FermionField sresult(sFGrid); sresult=zero;
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
t0=usecond();
for(int i=0;i<ncall;i++){
sDs.Dhop(ssrc,sresult,0);
}
t1=usecond();
localConvert(sresult,tmp);
std::cout<<GridLogMessage << "Called sDs"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
err = tmp-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
t0=usecond();
for(int i=0;i<ncall;i++){
sDs.Dhop(ssrc,sresult,0);
}
t1=usecond();
localConvert(sresult,tmp);
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = tmp-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Grid_finalize();
}