diff --git a/lib/Init.cc b/lib/Init.cc index c1bc4b4a..fffc6c31 100644 --- a/lib/Init.cc +++ b/lib/Init.cc @@ -416,7 +416,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) #endif #endif BACKTRACE(); - exit(0); + if ( si->si_signo != SIGTRAP ) exit(0); return; }; diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h index f7471cf0..34656602 100644 --- a/lib/qcd/action/Actions.h +++ b/lib/qcd/action/Actions.h @@ -113,6 +113,10 @@ typedef SymanzikGaugeAction ConjugateSymanzikGaugeAction template class A; \ template class A; +#define FermOpStaggeredVec5dTemplateInstantiate(A) \ + template class A; \ + template class A; + #define FermOp4dVecTemplateInstantiate(A) \ template class A; \ template class A; \ @@ -284,6 +288,10 @@ typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; +typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dR; +typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dF; +typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dD; + }} /////////////////////////////////////////////////////////////////////////////// diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 36ab35ca..dc4d0879 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -224,12 +224,13 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres typedef iImplSpinor SiteSpinor; typedef iImplHalfSpinor SiteHalfSpinor; typedef Lattice FermionField; - + + ///////////////////////////////////////////////// // Make the doubled gauge field a *scalar* + ///////////////////////////////////////////////// typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar typedef iImplGaugeField SiteScalarGaugeField; // scalar typedef iImplGaugeLink SiteScalarGaugeLink; // scalar - typedef Lattice DoubledGaugeField; typedef WilsonCompressor Compressor; @@ -261,11 +262,11 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) { - SiteScalarGaugeField ScalarUmu; + SiteScalarGaugeField ScalarUmu; SiteDoubledGaugeField ScalarUds; GaugeLinkField U(Umu._grid); - GaugeField Uadj(Umu._grid); + GaugeField Uadj(Umu._grid); for (int mu = 0; mu < Nd; mu++) { U = PeekIndex(Umu, mu); U = adj(Cshift(U, mu, -1)); @@ -631,6 +632,184 @@ PARALLEL_FOR_LOOP + ///////////////////////////////////////////////////////////////////////////// + // Single flavour one component spinors with colour index. 5d vec + ///////////////////////////////////////////////////////////////////////////// + template + class StaggeredVec5dImpl : public PeriodicGaugeImpl > { + + public: + + typedef RealD _Coeff_t ; + static const int Dimension = Representation::Dimension; + typedef PeriodicGaugeImpl > Gimpl; + + //Necessary? + constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} + + const bool LsVectorised=true; + + typedef _Coeff_t Coeff_t; + + INHERIT_GIMPL_TYPES(Gimpl); + + template using iImplScalar = iScalar > >; + template using iImplSpinor = iScalar > >; + template using iImplHalfSpinor = iScalar > >; + template using iImplDoubledGaugeField = iVector >, Nds>; + template using iImplGaugeField = iVector >, Nd>; + template using iImplGaugeLink = iScalar > >; + + // Make the doubled gauge field a *scalar* + typedef iImplDoubledGaugeField SiteDoubledGaugeField; // This is a scalar + typedef iImplGaugeField SiteScalarGaugeField; // scalar + typedef iImplGaugeLink SiteScalarGaugeLink; // scalar + typedef Lattice DoubledGaugeField; + + typedef iImplScalar SiteComplex; + typedef iImplSpinor SiteSpinor; + typedef iImplHalfSpinor SiteHalfSpinor; + + + typedef Lattice ComplexField; + typedef Lattice FermionField; + + typedef SimpleCompressor Compressor; + typedef StaggeredImplParams ImplParams; + typedef CartesianStencil StencilImpl; + + ImplParams Params; + + StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; + + template + inline void loadLinkElement(Simd ®, ref &memory) { + vsplat(reg, memory); + } + + inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu) { + SiteGaugeLink UU; + for (int i = 0; i < Dimension; i++) { + for (int j = 0; j < Dimension; j++) { + vsplat(UU()()(i, j), U(mu)()(i, j)); + } + } + mult(&phi(), &UU(), &chi()); + } + inline void multLinkAdd(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, + const SiteHalfSpinor &chi, int mu) { + SiteGaugeLink UU; + for (int i = 0; i < Dimension; i++) { + for (int j = 0; j < Dimension; j++) { + vsplat(UU()()(i, j), U(mu)()(i, j)); + } + } + mac(&phi(), &UU(), &chi()); + } + + inline void DoubleStore(GridBase *GaugeGrid, + DoubledGaugeField &UUUds, // for Naik term + DoubledGaugeField &Uds, + const GaugeField &Uthin, + const GaugeField &Ufat) + { + + GridBase * InputGrid = Uthin._grid; + conformable(InputGrid,Ufat._grid); + + GaugeLinkField U(InputGrid); + GaugeLinkField UU(InputGrid); + GaugeLinkField UUU(InputGrid); + GaugeLinkField Udag(InputGrid); + GaugeLinkField UUUdag(InputGrid); + + for (int mu = 0; mu < Nd; mu++) { + + // Staggered Phase. + Lattice > coor(InputGrid); + Lattice > x(InputGrid); LatticeCoordinate(x,0); + Lattice > y(InputGrid); LatticeCoordinate(y,1); + Lattice > z(InputGrid); LatticeCoordinate(z,2); + Lattice > t(InputGrid); LatticeCoordinate(t,3); + + Lattice > lin_z(InputGrid); lin_z=x+y; + Lattice > lin_t(InputGrid); lin_t=x+y+z; + + ComplexField phases(InputGrid); phases=1.0; + + if ( mu == 1 ) phases = where( mod(x ,2)==(Integer)0, phases,-phases); + if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases); + if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases); + + // 1 hop based on fat links + U = PeekIndex(Ufat, mu); + Udag = adj( Cshift(U, mu, -1)); + + U = U *phases; + Udag = Udag *phases; + + + for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { + SiteScalarGaugeLink ScalarU; + SiteDoubledGaugeField ScalarUds; + + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); + peekLocalSite(ScalarUds, Uds, lcoor); + + peekLocalSite(ScalarU, U, lcoor); + ScalarUds(mu) = ScalarU(); + + peekLocalSite(ScalarU, Udag, lcoor); + ScalarUds(mu + 4) = ScalarU(); + + pokeLocalSite(ScalarUds, Uds, lcoor); + } + + // 3 hop based on thin links. Crazy huh ? + U = PeekIndex(Uthin, mu); + UU = Gimpl::CovShiftForward(U,mu,U); + UUU= Gimpl::CovShiftForward(U,mu,UU); + + UUUdag = adj( Cshift(UUU, mu, -3)); + + UUU = UUU *phases; + UUUdag = UUUdag *phases; + + for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { + + SiteScalarGaugeLink ScalarU; + SiteDoubledGaugeField ScalarUds; + + std::vector lcoor; + GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); + + peekLocalSite(ScalarUds, UUUds, lcoor); + + peekLocalSite(ScalarU, UUU, lcoor); + ScalarUds(mu) = ScalarU(); + + peekLocalSite(ScalarU, UUUdag, lcoor); + ScalarUds(mu + 4) = ScalarU(); + + pokeLocalSite(ScalarUds, UUUds, lcoor); + } + + } + } + + inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ + assert(0); + } + + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ + assert (0); + } + }; + + + typedef WilsonImpl WilsonImplR; // Real.. whichever prec typedef WilsonImpl WilsonImplF; // Float typedef WilsonImpl WilsonImplD; // Double @@ -663,6 +842,10 @@ PARALLEL_FOR_LOOP typedef StaggeredImpl StaggeredImplF; // Float typedef StaggeredImpl StaggeredImplD; // Double + typedef StaggeredVec5dImpl StaggeredVec5dImplR; // Real.. whichever prec + typedef StaggeredVec5dImpl StaggeredVec5dImplF; // Float + typedef StaggeredVec5dImpl StaggeredVec5dImplD; // Double + }} #endif diff --git a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc index 71a6bf06..0455df0d 100644 --- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc +++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc @@ -69,37 +69,57 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GaugeField &_Uthin, Lebesgue(_FourDimGrid), LebesgueEvenOdd(_FourDimRedBlackGrid) { + // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); - assert(FiveDimRedBlackGrid._ndimension==5); assert(FourDimRedBlackGrid._ndimension==4); - assert(FiveDimRedBlackGrid._checker_dim==1); - - // Dimension zero of the five-d is the Ls direction + assert(FiveDimRedBlackGrid._ndimension==5); + assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction + + // extent of fifth dim and not spread out Ls=FiveDimGrid._fdimensions[0]; assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); - assert(FiveDimRedBlackGrid._processors[0] ==1); - assert(FiveDimRedBlackGrid._simd_layout[0]==1); assert(FiveDimGrid._processors[0] ==1); - assert(FiveDimGrid._simd_layout[0] ==1); - + assert(FiveDimRedBlackGrid._processors[0] ==1); + // Other dimensions must match the decomposition of the four-D fields for(int d=0;d<4;d++){ - assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]); - assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); - - assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]); - assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); - - assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]); - assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]); - - assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); + assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); + assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]); + + assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); + assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); + assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]); + assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); + assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]); + assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]); } + + if (Impl::LsVectorised) { + + int nsimd = Simd::Nsimd(); + // Dimension zero of the five-d is the Ls direction + assert(FiveDimGrid._simd_layout[0] ==nsimd); + assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); + + for(int d=0;d<4;d++){ + assert(FourDimGrid._simd_layout[d]=1); + assert(FourDimRedBlackGrid._simd_layout[d]=1); + assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); + } + + } else { + + // Dimension zero of the five-d is the Ls direction + assert(FiveDimRedBlackGrid._simd_layout[0]==1); + assert(FiveDimGrid._simd_layout[0] ==1); + + } + // Allocate the required comms buffer ImportGauge(_Uthin,_Ufat); } @@ -112,8 +132,6 @@ void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin) template void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) { - GaugeLinkField U(GaugeGrid()); - //////////////////////////////////////////////////////// // Double Store should take two fields for Naik and one hop separately. //////////////////////////////////////////////////////// @@ -126,7 +144,7 @@ void ImprovedStaggeredFermion5D::ImportGauge(const GaugeField &_Uthin,cons //////////////////////////////////////////////////////// for (int mu = 0; mu < Nd; mu++) { - U = PeekIndex(Umu, mu); + auto U = PeekIndex(Umu, mu); PokeIndex(Umu, U*( 0.5*c1/u0), mu ); U = PeekIndex(Umu, mu+4); @@ -221,7 +239,7 @@ void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOr for (int ss = 0; ss < U._grid->oSites(); ss++) { for(int s=0;s::DhopInternal(StencilImpl & st, LebesgueOr for (int ss = 0; ss < U._grid->oSites(); ss++) { for(int s=0;s::MooeeInvDag(const FermionField &in, } - FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D); +FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D); }} diff --git a/lib/qcd/action/fermion/StaggeredKernels.cc b/lib/qcd/action/fermion/StaggeredKernels.cc index 6608f8de..bb8dee8c 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.cc +++ b/lib/qcd/action/fermion/StaggeredKernels.cc @@ -192,38 +192,51 @@ void StaggeredKernels::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, Dou int oneLink =0; int threeLink=1; switch(Opt) { + case OptInlineAsm: + DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]); + break; case OptHandUnroll: DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink); + out._odata[sF] =-naive-naik; break; case OptGeneric: - default: DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); + out._odata[sF] =-naive-naik; + break; + default: + assert(0); break; } - out._odata[sF] =-naive-naik; }; template void StaggeredKernels::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf, int sF, int sU, const FermionField &in, FermionField &out) { - SiteSpinor naik; - SiteSpinor naive; int oneLink =0; int threeLink=1; + SiteSpinor naik; + SiteSpinor naive; + static int once; switch(Opt) { + case OptInlineAsm: + DhopSiteAsm(st,lo,U,UUU,buf,sF,sU,in,out._odata[sF]); + break; case OptHandUnroll: DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink); + out._odata[sF] =naive+naik; break; case OptGeneric: - default: DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); + out._odata[sF] =naive+naik; + break; + default: + assert(0); break; } - out._odata[sF] =naive+naik; }; template @@ -238,6 +251,7 @@ void StaggeredKernels::DhopDir( StencilImpl &st, DoubledGaugeField &U, Do } FermOpStaggeredTemplateInstantiate(StaggeredKernels); +FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels); }} diff --git a/lib/qcd/action/fermion/StaggeredKernels.h b/lib/qcd/action/fermion/StaggeredKernels.h index 5a6cb45c..e4cc8cdd 100644 --- a/lib/qcd/action/fermion/StaggeredKernels.h +++ b/lib/qcd/action/fermion/StaggeredKernels.h @@ -58,6 +58,9 @@ public: void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); + + void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, + int sF, int sU, const FermionField &in, SiteSpinor &out); void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, int sF, int sU, const FermionField &in, FermionField &out); diff --git a/lib/qcd/action/fermion/StaggeredKernelsAsm.cc b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc new file mode 100644 index 00000000..bb38a6c9 --- /dev/null +++ b/lib/qcd/action/fermion/StaggeredKernelsAsm.cc @@ -0,0 +1,327 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +// Interleave operations from two directions +// This looks just like a 2 spin multiply and reuse same sequence from the Wilson +// Kernel. But the spin index becomes a mu index instead. +#define Chi_00 %zmm0 +#define Chi_01 %zmm1 +#define Chi_02 %zmm2 +#define Chi_10 %zmm3 +#define Chi_11 %zmm4 +#define Chi_12 %zmm5 +#define Chi_20 %zmm6 +#define Chi_21 %zmm7 +#define Chi_22 %zmm8 +#define Chi_30 %zmm9 +#define Chi_31 %zmm10 +#define Chi_32 %zmm11 + +#define UChi_00 %zmm12 +#define UChi_01 %zmm13 +#define UChi_02 %zmm14 +#define UChi_10 %zmm15 +#define UChi_11 %zmm16 +#define UChi_12 %zmm17 +#define UChi_20 %zmm18 +#define UChi_21 %zmm19 +#define UChi_22 %zmm20 +#define UChi_30 %zmm21 +#define UChi_31 %zmm22 +#define UChi_32 %zmm23 + +#define pChi_00 %%zmm0 +#define pChi_01 %%zmm1 +#define pChi_02 %%zmm2 +#define pChi_10 %%zmm3 +#define pChi_11 %%zmm4 +#define pChi_12 %%zmm5 +#define pChi_20 %%zmm6 +#define pChi_21 %%zmm7 +#define pChi_22 %%zmm8 +#define pChi_30 %%zmm9 +#define pChi_31 %%zmm10 +#define pChi_32 %%zmm11 + +#define pUChi_00 %%zmm12 +#define pUChi_01 %%zmm13 +#define pUChi_02 %%zmm14 +#define pUChi_10 %%zmm15 +#define pUChi_11 %%zmm16 +#define pUChi_12 %%zmm17 +#define pUChi_20 %%zmm18 +#define pUChi_21 %%zmm19 +#define pUChi_22 %%zmm20 +#define pUChi_30 %%zmm21 +#define pUChi_31 %%zmm22 +#define pUChi_32 %%zmm23 + +#define T0 %zmm24 +#define T1 %zmm25 +#define T2 %zmm26 +#define T3 %zmm27 + +#define MULT_ADD_LS(g0,g1,g2,g3) \ + asm ( "movq %0, %%r8 \n\t" \ + "movq %1, %%r9 \n\t" \ + "movq %2, %%r10 \n\t" \ + "movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\ + asm ( \ + VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \ + VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \ + VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10) \ + VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11) \ + VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12) \ + VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30) \ + VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31) \ + VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32) \ + VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \ + VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \ + VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \ + VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \ + VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \ + VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \ + VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \ + VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \ + VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \ + VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \ + VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \ + VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \ + VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \ + VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \ + VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \ + VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \ + VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \ + VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \ + VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \ + VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \ + VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \ + VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \ + VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \ + VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \ + VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \ + VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \ + VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \ + VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \ + VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \ + VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \ + VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \ + VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \ + VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \ + VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) ); + +#define MULT_LS(g0,g1,g2,g3) \ + asm ( "movq %0, %%r8 \n\t" \ + "movq %1, %%r9 \n\t" \ + "movq %2, %%r10 \n\t" \ + "movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\ + asm ( \ + VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \ + VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \ + VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10) \ + VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11) \ + VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12) \ + VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30) \ + VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31) \ + VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32) \ + VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \ + VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \ + VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \ + VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \ + VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \ + VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \ + VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \ + VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \ + VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \ + VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \ + VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \ + VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \ + VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \ + VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \ + VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \ + VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \ + VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \ + VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \ + VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \ + VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \ + VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \ + VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \ + VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \ + VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \ + VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \ + VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \ + VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \ + VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \ + VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \ + VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \ + VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \ + VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \ + VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \ + VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) ); + +#define LOAD_CHI(a0,a1,a2,a3) \ + asm ( \ + "movq %0, %%r8 \n\t" \ + VLOAD(0,%%r8,pChi_00) \ + VLOAD(1,%%r8,pChi_01) \ + VLOAD(2,%%r8,pChi_02) \ + : : "r" (a0) : "%r8" ); \ + asm ( \ + "movq %0, %%r8 \n\t" \ + VLOAD(0,%%r8,pChi_10) \ + VLOAD(1,%%r8,pChi_11) \ + VLOAD(2,%%r8,pChi_12) \ + : : "r" (a1) : "%r8" ); \ + asm ( \ + "movq %0, %%r8 \n\t" \ + VLOAD(0,%%r8,pChi_20) \ + VLOAD(1,%%r8,pChi_21) \ + VLOAD(2,%%r8,pChi_22) \ + : : "r" (a2) : "%r8" ); \ + asm ( \ + "movq %0, %%r8 \n\t" \ + VLOAD(0,%%r8,pChi_30) \ + VLOAD(1,%%r8,pChi_31) \ + VLOAD(2,%%r8,pChi_32) \ + : : "r" (a3) : "%r8" ); + +#define PF_CHI(a0) \ + asm ( \ + "movq %0, %%r8 \n\t" \ + VPREFETCH1(0,%%r8) \ + VPREFETCH1(1,%%r8) \ + VPREFETCH1(2,%%r8) \ + : : "r" (a0) : "%r8" ); \ + + +#define REDUCE(out) \ + asm ( \ + VADD(UChi_00,UChi_10,UChi_00) \ + VADD(UChi_01,UChi_11,UChi_01) \ + VADD(UChi_02,UChi_12,UChi_02) \ + VADD(UChi_30,UChi_20,UChi_30) \ + VADD(UChi_31,UChi_21,UChi_31) \ + VADD(UChi_32,UChi_22,UChi_32) \ + VADD(UChi_00,UChi_30,UChi_00) \ + VADD(UChi_01,UChi_31,UChi_01) \ + VADD(UChi_02,UChi_32,UChi_02) ); \ + asm ( \ + VSTORE(0,%0,pUChi_00) \ + VSTORE(1,%0,pUChi_01) \ + VSTORE(2,%0,pUChi_02) \ + : : "r" (out) : "memory" ); + +#define PERMUTE_DIR(dir) \ + permute##dir(Chi_0,Chi_0);\ + permute##dir(Chi_1,Chi_1);\ + permute##dir(Chi_2,Chi_2); + +namespace Grid { +namespace QCD { + + // This is the single precision 5th direction vectorised kernel +template +void StaggeredKernels::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, + DoubledGaugeField &U, + DoubledGaugeField &UUU, + SiteSpinor *buf, int sF, + int sU, const FermionField &in, SiteSpinor &out) +{ + uint64_t gauge0,gauge1,gauge2,gauge3; + uint64_t addr0,addr1,addr2,addr3; + + int o0,o1,o2,o3; // offsets + int l0,l1,l2,l3; // local + int p0,p1,p2,p3; // perm + int ptype; + StencilEntry *SE0; + StencilEntry *SE1; + StencilEntry *SE2; + StencilEntry *SE3; + + // Xp, Yp, Zp, Tp +#define PREPARE(X,Y,Z,T,skew,UU) \ + SE0=st.GetEntry(ptype,X+skew,sF); \ + o0 = SE0->_offset; \ + l0 = SE0->_is_local; \ + addr0 = l0 ? (uint64_t) &in._odata[o0] : (uint64_t) &buf[o0]; \ + PF_CHI(addr0); \ + \ + SE1=st.GetEntry(ptype,Y+skew,sF); \ + o1 = SE1->_offset; \ + l1 = SE1->_is_local; \ + addr1 = l1 ? (uint64_t) &in._odata[o1] : (uint64_t) &buf[o1]; \ + PF_CHI(addr1); \ + \ + SE2=st.GetEntry(ptype,Z+skew,sF); \ + o2 = SE2->_offset; \ + l2 = SE2->_is_local; \ + addr2 = l2 ? (uint64_t) &in._odata[o2] : (uint64_t) &buf[o2]; \ + PF_CHI(addr2); \ + \ + SE3=st.GetEntry(ptype,T+skew,sF); \ + o3 = SE3->_offset; \ + l3 = SE3->_is_local; \ + addr3 = l3 ? (uint64_t) &in._odata[o3] : (uint64_t) &buf[o3]; \ + PF_CHI(addr3); \ + \ + gauge0 =(uint64_t)&UU._odata[sU]( X ); \ + gauge1 =(uint64_t)&UU._odata[sU]( Y ); \ + gauge2 =(uint64_t)&UU._odata[sU]( Z ); \ + gauge3 =(uint64_t)&UU._odata[sU]( T ); + + PREPARE(Xp,Yp,Zp,Tp,0,U); + LOAD_CHI(addr0,addr1,addr2,addr3); + MULT_LS(gauge0,gauge1,gauge2,gauge3); + + PREPARE(Xm,Ym,Zm,Tm,0,U); + LOAD_CHI(addr0,addr1,addr2,addr3); + MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); + + PREPARE(Xp,Yp,Zp,Tp,8,UUU); + LOAD_CHI(addr0,addr1,addr2,addr3); + MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); + + PREPARE(Xm,Ym,Zm,Tm,8,UUU); + LOAD_CHI(addr0,addr1,addr2,addr3); + MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); + + addr0 = (uint64_t) &out; + REDUCE(addr0); +} + +FermOpStaggeredTemplateInstantiate(StaggeredKernels); +FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels); + +}} + diff --git a/lib/qcd/action/fermion/StaggeredKernelsHand.cc b/lib/qcd/action/fermion/StaggeredKernelsHand.cc index 5f9e11e5..06e9d219 100644 --- a/lib/qcd/action/fermion/StaggeredKernelsHand.cc +++ b/lib/qcd/action/fermion/StaggeredKernelsHand.cc @@ -279,5 +279,6 @@ void StaggeredKernels::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &l } FermOpStaggeredTemplateInstantiate(StaggeredKernels); +FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels); }} diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d2ac96e3..d3a7f941 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -62,71 +62,55 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, Lebesgue(_FourDimGrid), LebesgueEvenOdd(_FourDimRedBlackGrid) { + // some assertions + assert(FiveDimGrid._ndimension==5); + assert(FourDimGrid._ndimension==4); + assert(FourDimRedBlackGrid._ndimension==4); + assert(FiveDimRedBlackGrid._ndimension==5); + assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction + + // extent of fifth dim and not spread out + Ls=FiveDimGrid._fdimensions[0]; + assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); + assert(FiveDimGrid._processors[0] ==1); + assert(FiveDimRedBlackGrid._processors[0] ==1); + + // Other dimensions must match the decomposition of the four-D fields + for(int d=0;d<4;d++){ + + assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); + assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); + assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]); + + assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); + assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); + assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]); + + assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); + assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]); + assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]); + } + if (Impl::LsVectorised) { int nsimd = Simd::Nsimd(); - // some assertions - assert(FiveDimGrid._ndimension==5); - assert(FiveDimRedBlackGrid._ndimension==5); - assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction - assert(FourDimGrid._ndimension==4); - // Dimension zero of the five-d is the Ls direction - Ls=FiveDimGrid._fdimensions[0]; - assert(FiveDimGrid._processors[0] ==1); assert(FiveDimGrid._simd_layout[0] ==nsimd); - - assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); - assert(FiveDimRedBlackGrid._processors[0] ==1); assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); - // Other dimensions must match the decomposition of the four-D fields for(int d=0;d<4;d++){ - assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); - assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); - assert(FourDimGrid._simd_layout[d]=1); assert(FourDimRedBlackGrid._simd_layout[d]=1); assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); - - assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); - assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); - assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); } } else { - - // some assertions - assert(FiveDimGrid._ndimension==5); - assert(FourDimGrid._ndimension==4); - assert(FiveDimRedBlackGrid._ndimension==5); - assert(FourDimRedBlackGrid._ndimension==4); - assert(FiveDimRedBlackGrid._checker_dim==1); // Dimension zero of the five-d is the Ls direction - Ls=FiveDimGrid._fdimensions[0]; - assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); - assert(FiveDimRedBlackGrid._processors[0] ==1); assert(FiveDimRedBlackGrid._simd_layout[0]==1); - assert(FiveDimGrid._processors[0] ==1); assert(FiveDimGrid._simd_layout[0] ==1); - - // Other dimensions must match the decomposition of the four-D fields - for(int d=0;d<4;d++){ - assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]); - assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); - - assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]); - assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); - - assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]); - assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]); - - assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); - assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); - assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); - } + } // Allocate the required comms buffer diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc index b467e9b0..be31c438 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -51,7 +51,6 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); std::cout< U(4,FGrid); + for(int mu=0;mu(Umu5d,mu); - if ( mu!=0 ) U[mu]=zero; - PokeIndex(Umu5d,U[mu],mu); - } - - std::vector Ua(4,UGrid); - for(int mu=0;mu(Umu,mu); - if ( mu!=0 ) { - Ua[mu]=zero; - } - PokeIndex(Umu,Ua[mu],mu); } RealD mass=0.1; @@ -171,6 +164,8 @@ int main (int argc, char ** argv) double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146 std::cout< + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + + const int Ls=16; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds({1,2,3,4}); + GridParallelRNG pRNG4(UGrid); + GridParallelRNG pRNG5(FGrid); + pRNG4.SeedFixedIntegers(seeds); + pRNG5.SeedFixedIntegers(seeds); + + typedef typename ImprovedStaggeredFermion5DR::FermionField FermionField; + typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; + typename ImprovedStaggeredFermion5DR::ImplParams params; + + FermionField src (FGrid); + + //random(pRNG5,src); + std::vector site({0,0,0,0,0}); + ColourVector cv = zero; + cv()()(0)=1.0; + src = zero; + pokeSite(cv,src,site); + + FermionField result(FGrid); result=zero; + FermionField tmp(FGrid); tmp=zero; + FermionField err(FGrid); tmp=zero; + FermionField phi (FGrid); random(pRNG5,phi); + FermionField chi (FGrid); random(pRNG5,chi); + + LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu); + + double volume=Ls; + for(int mu=0;mu