#ifndef Hadrons_MContraction_A2APionField_hpp_
#define Hadrons_MContraction_A2APionField_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/AllToAllVectors.hpp>
#include <Grid/Eigen/unsupported/CXX11/Tensor>

#ifndef MIN
#define MIN(A,B) ((A)<(B)?(A):(B))
#endif

BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                              A2APionField                                  *
 ******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)

typedef std::pair<Gamma::Algebra, Gamma::Algebra> GammaPair;

class A2APionFieldPar : Serializable
{
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(A2APionFieldPar,
                                    int, cacheBlock,
                                    int, schurBlock,
                                    int, Nmom,
                                    std::string, A2A_i,
                                    std::string, A2A_j,
                                    std::string, output);
};

template <typename FImpl>
class TA2APionField : public Module<A2APionFieldPar>
{
  public:
    FERM_TYPE_ALIASES(FImpl, );
    SOLVER_TYPE_ALIASES(FImpl, );

    typedef A2AModesSchurDiagTwo<typename FImpl::FermionField, FMat, Solver> A2ABase;

    int d_unroll;

  public:
    // constructor
    TA2APionField(const std::string name);
    // destructor
    virtual ~TA2APionField(void){};
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);

    virtual void DeltaFeq2(int dt_min, int dt_max,
                           Eigen::Tensor<ComplexD,2> &dF2_fig8,
                           Eigen::Tensor<ComplexD,2> &dF2_trtr,
                           Eigen::Tensor<ComplexD,3> &WW_sd,
                           const LatticeFermion *vs,
                           const LatticeFermion *vd,
                           int orthogdim);

    virtual void DeltaFeq2_alt(int dt_min, int dt_max,
                               Eigen::Tensor<ComplexD,2> &dF2_fig8,
                               Eigen::Tensor<ComplexD,2> &dF2_trtr,
                               Eigen::Tensor<ComplexD,1> &den0,
                               Eigen::Tensor<ComplexD,1> &den1,
                               Eigen::Tensor<ComplexD,3> &WW_sd,
                               const LatticeFermion *vs,
                               const LatticeFermion *vd,
                               int orthogdim);

    ///////////////////////////////////////
    // Arithmetic help. Move to Grid??
    ///////////////////////////////////////
    virtual void PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
                             const LatticeFermion *wi,
                             const LatticeFermion *vj,
                             int orthogdim,
                             int g5);

    ///////////////////////////
    // Simple wrappers
    ///////////////////////////
    virtual void PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
                                const LatticeFermion *wi,
                                const LatticeFermion *vj,
                                const std::vector<LatticeComplex> &mom,
                                int orthogdim);

    virtual void PionFieldWV(Eigen::Tensor<ComplexD,3> &mat,
                             const LatticeFermion *wi,
                             const LatticeFermion *vj,
                             int orthogdim);

    virtual void PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
                             const LatticeFermion *vi,
                             const LatticeFermion *vj,
                             int orthogdim);

    virtual void PionFieldWW(Eigen::Tensor<ComplexD,3> &mat,
                             const LatticeFermion *wi,
                             const LatticeFermion *wj,
                             int orthogdim);
};

MODULE_REGISTER(A2APionField,  ARG(TA2APionField<FIMPL>),  MContraction);
MODULE_REGISTER(ZA2APionField, ARG(TA2APionField<ZFIMPL>), MContraction);

/******************************************************************************
 *                      TA2APionField implementation                          *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TA2APionField<FImpl>::TA2APionField(const std::string name) : Module<A2APionFieldPar>(name)
{
}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TA2APionField<FImpl>::getInput(void)
{
    std::vector<std::string> in;

    in.push_back(par().A2A_i + "_class");
    in.push_back(par().A2A_i + "_w_high_4d");
    in.push_back(par().A2A_i + "_v_high_4d");
    in.push_back(par().A2A_j + "_class");
    in.push_back(par().A2A_j + "_w_high_4d");
    in.push_back(par().A2A_j + "_v_high_4d");

    return in;
}

template <typename FImpl>
std::vector<std::string> TA2APionField<FImpl>::getOutput(void)
{
    std::vector<std::string> out = {};

    return out;
}

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2APionField<FImpl>::setup(void)
{
    d_unroll = 32; // empirical default; can be overridden if desired

    // Four D fields
    envTmp(std::vector<FermionField>, "wi", 1, par().schurBlock, FermionField(env().getGrid(1)));
    envTmp(std::vector<FermionField>, "vi", 1, par().schurBlock, FermionField(env().getGrid(1)));
    envTmp(std::vector<FermionField>, "wj", 1, par().schurBlock, FermionField(env().getGrid(1)));
    envTmp(std::vector<FermionField>, "vj", 1, par().schurBlock, FermionField(env().getGrid(1)));

    // 5D tmp
    int Ls_i = env().getObjectLs(par().A2A_i + "_class");
    envTmpLat(FermionField, "tmp_5d", Ls_i);

    int Ls_j = env().getObjectLs(par().A2A_j + "_class");
    assert(Ls_i == Ls_j);
}
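//////////////////////////////////////////////////////////////////////////////
// Example parameter stanza (sketch). The tag names follow the serialisable
// members of A2APionFieldPar above; the surrounding application XML and the
// A2A module names "a2a_l"/"a2a_s" are illustrative assumptions only.
//
//   <module>
//     <id>
//       <name>a2aPion</name>
//       <type>MContraction::A2APionField</type>
//     </id>
//     <options>
//       <cacheBlock>16</cacheBlock>
//       <schurBlock>64</schurBlock>
//       <Nmom>1</Nmom>
//       <A2A_i>a2a_l</A2A_i>
//       <A2A_j>a2a_s</A2A_j>
//       <output>pionfield</output>
//     </options>
//   </module>
//////////////////////////////////////////////////////////////////////////////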
//////////////////////////////////////////////////////////////////////////////////
// Cache blocked arithmetic routine
// Could move to Grid ???
//////////////////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2APionField<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
                                          const LatticeFermion *wi,
                                          const LatticeFermion *vj,
                                          const std::vector<LatticeComplex> &mom,
                                          int orthogdim)
{
    double t0,t1,t2,t3; t0=t1=t2=t3=0.0;

    typedef typename FImpl::SiteSpinor   vobj;
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type   scalar_type;
    typedef typename vobj::vector_type   vector_type;

    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;

    int Lblock = mat.dimension(2);
    int Rblock = mat.dimension(3);

    GridBase *grid = wi[0]._grid;

    const int    nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();

    int Nt   = grid->GlobalDimensions()[orthogdim];
    int Nmom = mom.size();

    int fd = grid->_fdimensions[orthogdim];
    int ld = grid->_ldimensions[orthogdim];
    int rd = grid->_rdimensions[orthogdim];

    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    int MFrvol = rd*Lblock*Rblock*Nmom;
    int MFlvol = ld*Lblock*Rblock*Nmom;

    Vector<iScalar<vector_type> > lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++){
        lvSum[r] = zero;
    }

    Vector<scalar_type> lsSum(MFlvol);
    parallel_for (int r = 0; r < MFlvol; r++){
        lsSum[r] = scalar_type(0.0);
    }

    int e1     = grid->_slice_nblock[orthogdim];
    int e2     = grid->_slice_block [orthogdim];
    int stride = grid->_slice_stride[orthogdim];

    t0-=usecond();
    parallel_for(int r=0;r<rd;r++){

        int so = r*grid->_ostride[orthogdim]; // base offset for start of plane

        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++){

            int ss = so + n*stride + b;

            for(int i=0;i<Lblock;i++){

                auto left = conjugate(wi[i]._odata[ss]);

                for(int j=0;j<Rblock;j++){

                    auto right = vj[j]._odata[ss];

                    // sitewise spin-colour inner product w_i^dag v_j
                    iScalar<vector_type> vv;
                    vv() = left()(0)(0)*right()(0)(0) + left()(0)(1)*right()(0)(1) + left()(0)(2)*right()(0)(2)
                         + left()(1)(0)*right()(1)(0) + left()(1)(1)*right()(1)(1) + left()(1)(2)*right()(1)(2)
                         + left()(2)(0)*right()(2)(0) + left()(2)(1)*right()(2)(1) + left()(2)(2)*right()(2)(2)
                         + left()(3)(0)*right()(3)(0) + left()(3)(1)*right()(3)(1) + left()(3)(2)*right()(3)(2);

                    // after the sitewise product, do the momentum phase loop
                    int base = Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*r;
                    for(int m=0;m<Nmom;m++){
                        int idx    = m + base;
                        auto phase = TensorRemove(mom[m]._odata[ss]);
                        lvSum[idx]() = lvSum[idx]() + vv()*phase;
                    }
                }
            }
        }
    }
    t0+=usecond();

    // Sum across SIMD lanes in the plane, breaking out the orthogonal dir
    t1-=usecond();
    parallel_for(int rt=0;rt<rd;rt++){

        std::vector<int> icoor(nd);
        iScalar<scalar_type> temp;
        std::vector<iScalar<scalar_type> > extracted(Nsimd);
        // std::vector<sobj> extracted(Nsimd);

        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
        for(int m=0;m<Nmom;m++){

            int ij_rdx = m + Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*rt;

            extract(lvSum[ij_rdx],extracted);

            for(int idx=0;idx<Nsimd;idx++){

                grid->iCoorFromIindex(icoor,idx);

                int ldx    = rt + icoor[orthogdim]*rd;
                int ij_ldx = m + Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*ldx;

                lsSum[ij_ldx] = lsSum[ij_ldx] + extracted[idx]._internal;
            }
        }
    }
    t1+=usecond();

    assert(mat.dimension(0) == Nmom);
    assert(mat.dimension(1) == Nt);

    t2-=usecond();
    // ld loop and local only??
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    parallel_for_nest2(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++){
            int t = lt + pt*ld;
            if (pt == pc){
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nmom;m++){
                    int ij_dx = m + Nmom*i + Nmom*Lblock*j + Nmom*Lblock*Rblock*lt;
                    mat(m,t,i,j) = lsSum[ij_dx];
                }
            } else {
                const scalar_type zz(0.0);
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nmom;m++){
                    mat(m,t,i,j) = zz;
                }
            }
        }
    }
    t2+=usecond();

    t3-=usecond();
    grid->GlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock);
    t3+=usecond();
}
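//////////////////////////////////////////////////////////////////////////////
// Usage sketch (not part of the module flow): a zero-momentum meson field
// for one block of modes. "w", "v", "Nw", "Nv" are hypothetical names; the
// index convention mat(m,t,i,j) matches the assembly loop above.
//
//   std::vector<LatticeComplex> phases(1, grid);
//   phases[0] = Complex(1.0);                     // p = 0 phase
//   Eigen::Tensor<ComplexD,4> mf(1, nt, Nw, Nv);
//   PionFieldWVmom(mf, &w[0], &v[0], phases, Tp);
//   // mf(0,t,i,j) = sum_{x on timeslice t} w_i^dag(x) v_j(x)
//////////////////////////////////////////////////////////////////////////////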
///////////////////////////////////////////////////////////////////
// Meson
// Interested in
//
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
//
// Conventional meson field:
//
// = sum_x,y Trace[ sum_j G |v_j(y,ty)> <w_j(x,tx)| G sum_i |v_i(x,tx)> <w_i(y,ty)| ]
// = sum_ij  sum_x <w_j(x,tx)| G |v_i(x,tx)>  sum_y <w_i(y,ty)| G |v_j(y,ty)>
// = sum_ij  PI_ji(tx) PI_ij(ty)
//
// G5-Hermiticity
//
// sum_x,y Trace[ G S(x,tx,y,ty) G S(y,ty,x,tx) ]
// = sum_x,y Trace[ G S(x,tx,y,ty) G g5 S^dag(x,tx,y,ty) g5 ]
// = sum_x,y Trace[ g5 G sum_j |v_j(y,ty)> <w_j(x,tx)| g5 G sum_i |v_i(x,tx)> <w_i(y,ty)| ]   (*)
//
// = sum_ij  sum_x <w_i(x,tx)| g5 G |v_j(x,tx)>  sum_y <w_j(y,ty)| g5 G |v_i(y,ty)>
// = sum_ij  PionVV(ty) PionWW(tx)
//
// (*) is only correct estimator if w_i and w_j come from distinct noise sets to preserve the kronecker
//     expectation value. Otherwise biased.
////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2APionField<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
                                       const LatticeFermion *wi,
                                       const LatticeFermion *vj,
                                       int orthogdim,
                                       int g5)
{
    typedef typename FImpl::SiteSpinor   vobj;
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type   scalar_type;
    typedef typename vobj::vector_type   vector_type;

    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;

    int Lblock = mat.dimension(1);
    int Rblock = mat.dimension(2);

    GridBase *grid = wi[0]._grid;

    const int    nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();

    int Nt = grid->GlobalDimensions()[orthogdim];

    int fd = grid->_fdimensions[orthogdim];
    int ld = grid->_ldimensions[orthogdim];
    int rd = grid->_rdimensions[orthogdim];

    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    int MFrvol = rd*Lblock*Rblock;
    int MFlvol = ld*Lblock*Rblock;

    Vector<iScalar<vector_type> > lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++){
        lvSum[r] = zero;
    }

    Vector<scalar_type> lsSum(MFlvol);
    parallel_for (int r = 0; r < MFlvol; r++){
        lsSum[r] = scalar_type(0.0);
    }

    int e1     = grid->_slice_nblock[orthogdim];
    int e2     = grid->_slice_block [orthogdim];
    int stride = grid->_slice_stride[orthogdim];

    parallel_for(int r=0;r<rd;r++){

        int so = r*grid->_ostride[orthogdim]; // base offset for start of plane

        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++){

            int ss = so + n*stride + b;

            for(int i=0;i<Lblock;i++){

                auto left = conjugate(wi[i]._odata[ss]);

                for(int j=0;j<Rblock;j++){

                    auto right = vj[j]._odata[ss];
                    int  idx   = i + Lblock*j + Lblock*Rblock*r;

                    // upper spin components
                    vector_type vv_hi = left()(0)(0)*right()(0)(0) + left()(0)(1)*right()(0)(1) + left()(0)(2)*right()(0)(2)
                                      + left()(1)(0)*right()(1)(0) + left()(1)(1)*right()(1)(1) + left()(1)(2)*right()(1)(2);
                    // lower spin components; sign flips when a g5 is inserted (chiral basis)
                    vector_type vv_lo = left()(2)(0)*right()(2)(0) + left()(2)(1)*right()(2)(1) + left()(2)(2)*right()(2)(2)
                                      + left()(3)(0)*right()(3)(0) + left()(3)(1)*right()(3)(1) + left()(3)(2)*right()(3)(2);

                    if (g5) lvSum[idx]() = lvSum[idx]() + vv_hi - vv_lo;
                    else    lvSum[idx]() = lvSum[idx]() + vv_hi + vv_lo;
                }
            }
        }
    }

    // Sum across SIMD lanes in the plane, breaking out the orthogonal dir
    parallel_for(int rt=0;rt<rd;rt++){

        std::vector<int> icoor(nd);
        iScalar<scalar_type> temp;
        std::vector<iScalar<scalar_type> > extracted(Nsimd);

        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++){

            int ij_rdx = i + Lblock*j + Lblock*Rblock*rt;

            extract(lvSum[ij_rdx],extracted);

            for(int idx=0;idx<Nsimd;idx++){

                grid->iCoorFromIindex(icoor,idx);

                int ldx    = rt + icoor[orthogdim]*rd;
                int ij_ldx = i + Lblock*j + Lblock*Rblock*ldx;

                lsSum[ij_ldx] = lsSum[ij_ldx] + extracted[idx]._internal;
            }
        }
    }

    assert(mat.dimension(0) == Nt);

    // ld loop and local only??
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    parallel_for_nest2(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++){
            int t = lt + pt*ld;
            if (pt == pc){
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++){
                    int ij_dx = i + Lblock*j + Lblock*Rblock*lt;
                    mat(t,i,j) = lsSum[ij_dx];
                }
            } else {
                const scalar_type zz(0.0);
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++){
                    mat(t,i,j) = zz;
                }
            }
        }
    }

    grid->GlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock);
}

template <typename FImpl>
void TA2APionField<FImpl>::PionFieldWV(Eigen::Tensor<ComplexD,3> &mat,
                                       const LatticeFermion *wi,
                                       const LatticeFermion *vj,
                                       int orthogdim)
{
    const int g5 = 1;
    PionFieldXX(mat,wi,vj,orthogdim,g5);
}

template <typename FImpl>
void TA2APionField<FImpl>::PionFieldWW(Eigen::Tensor<ComplexD,3> &mat,
                                       const LatticeFermion *wi,
                                       const LatticeFermion *wj,
                                       int orthogdim)
{
    const int nog5 = 0;
    PionFieldXX(mat,wi,wj,orthogdim,nog5);
}

template <typename FImpl>
void TA2APionField<FImpl>::PionFieldVV(Eigen::Tensor<ComplexD,3> &mat,
                                       const LatticeFermion *vi,
                                       const LatticeFermion *vj,
                                       int orthogdim)
{
    const int nog5 = 0;
    PionFieldXX(mat,vi,vj,orthogdim,nog5);
}
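//////////////////////////////////////////////////////////////////////////////
// Sketch of the two-end pion correlator assembled from the wrappers above,
// following the meson-field identity in the comment block. "mfWV_ij" and
// "mfWV_ji" are hypothetical rank-3 tensors filled by PionFieldWV.
//
//   std::vector<ComplexD> C(nt, ComplexD(0.0));
//   for(int i=0;i<N_i;i++)
//   for(int j=0;j<N_j;j++)
//   for(int t0=0;t0<nt;t0++)
//   for(int t=0;t<nt;t++)
//       C[t] += mfWV_ij((t0+t)%nt,i,j) * mfWV_ji(t0,j,i); // PI_ij(t0+t) PI_ji(t0)
//   // divide by nt afterwards to average over source times
//////////////////////////////////////////////////////////////////////////////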
/////////////////////////////////////////////////////////////////////////
// New Dirac trace code needed for efficiency (I think).
// TODO: Ask Antonin to auto gen from Mathematica
/////////////////////////////////////////////////////////////////////////
template<class vtype>
inline iScalar<vtype> traceGammaZ(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = timesI(rhs(2,0)) + timesMinusI(rhs(3,1)) + timesMinusI(rhs(0,2)) + timesI(rhs(1,3));
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGammaZGamma5(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = timesMinusI(rhs(2,0)) + timesI(rhs(3,1)) + timesMinusI(rhs(0,2)) + timesI(rhs(1,3));
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGammaY(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = -rhs(3,0) + rhs(2,1) + rhs(1,2) - rhs(0,3);
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGammaYGamma5(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = rhs(3,0) - rhs(2,1) + rhs(1,2) - rhs(0,3);
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGamma5(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = rhs(0,0) + rhs(1,1) - rhs(2,2) - rhs(3,3);
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGammaT(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = rhs(2,0) + rhs(3,1) + rhs(0,2) + rhs(1,3);
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGammaTGamma5(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = -rhs(2,0) - rhs(3,1) + rhs(0,2) + rhs(1,3);
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGammaX(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = timesMinusI(rhs(0,3)) + timesMinusI(rhs(1,2)) + timesI(rhs(2,1)) + timesI(rhs(3,0));
    return ret;
};

template<class vtype>
inline iScalar<vtype> traceGammaXGamma5(const iMatrix<vtype, Ns> &rhs)
{
    iScalar<vtype> ret;
    ret() = timesMinusI(rhs(0,3)) + timesMinusI(rhs(1,2)) + timesMinusI(rhs(2,1)) + timesMinusI(rhs(3,0));
    return ret;
};
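//////////////////////////////////////////////////////////////////////////////
// Sanity-check sketch for the hand-unrolled traces above: each helper should
// agree with generic multiply-then-trace. The test harness ("sRNG") is an
// illustrative assumption, not part of this module.
//
//   SpinMatrix M;
//   random(sRNG, M);                                              // any filled matrix will do
//   ComplexD t1 = TensorRemove(traceGamma5(M()));                 // unrolled
//   ComplexD t2 = TensorRemove(trace(Gamma(Gamma::Algebra::Gamma5)*M)); // generic
//   assert(abs(t1-t2) < 1.0e-12);
//////////////////////////////////////////////////////////////////////////////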
////////////////////////////////////////////////////////////////////////
// DeltaF=2 contraction ; use external WW field for Kaon, anti Kaon sink
////////////////////////////////////////////////////////////////////////
//
// WW -- i vectors have adjoint, and j vectors not.
//    -- Think of "i" as the strange quark, forward prop from 0
//    -- Think of "j" as the anti-down quark.
//
// WW_sd are w^dag_s w_d
//
// Hence VV vectors correspondingly are v^dag_d, v_s from t=0
// and                                  v^dag_d, v_s from t=dT
//
// There is an implicit g5 associated with each v^dag_d from use of g5 Hermiticity.
// The other gamma_5 lies in the WW external meson operator.
//
// From UKhadron wallbag.cc:
//
//   LatticePropagator anti_d0 = adj( Gamma(G5) * Qd0 * Gamma(G5));
//   LatticePropagator anti_d1 = adj( Gamma(G5) * Qd1 * Gamma(G5));
//
//   PR1 = Qs0 * Gamma(G5) * anti_d0;
//   PR2 = Qs1 * Gamma(G5) * anti_d1;
//
//   TR1 = trace( PR1 * G1 );
//   TR2 = trace( PR2 * G2 );
//   Wick1 = TR1 * TR2;
//
//   Wick2 = trace( PR1 * G1 * PR2 * G2 );
//
//   was Wick2 = trace( Qs0 * Gamma(G5) * anti_d0 * G1 * Qs1 * Gamma(G5) * anti_d1 * G2 );
//
// TR TR(tx) = Wick1 = sum_x WW[t0]_sd  <v_d|g5 G|v_s>  WW[t1]_s'd'  <v_d'|g5 G|v_s'>  |_{x,tx}
//
//           = sum_x [ Trace(WW[t0] VgV(t,x)) x Trace(WW[t1] VgV(t,x)) ]
//
// Calc all Nt Trace(WW VV) products at once, take Nt^2 products of these.
//
// Fig8(tx)  = Wick2 = sum_x WW[t0]_sd WW[t1]_s'd'  <v_d|g5 G|v_s'>  <v_d'|g5 G|v_s>  |_{x,tx}
//
//           = sum_x Trace( WW[t0] VV[t,x] WW[t1] VV[t,x] )
//
// Might as well form Ns x Nj x Ngamma matrix
///////////////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TA2APionField<FImpl>::DeltaFeq2(int dt_min, int dt_max,
                                     Eigen::Tensor<ComplexD,2> &dF2_fig8,
                                     Eigen::Tensor<ComplexD,2> &dF2_trtr,
                                     Eigen::Tensor<ComplexD,3> &WW_sd,
                                     const LatticeFermion *vs,
                                     const LatticeFermion *vd,
                                     int orthogdim)
{
    LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;

    int dt = dt_min; // HACK ; should loop over dt

    typedef typename FImpl::SiteSpinor   vobj;
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type   scalar_type;
    typedef typename vobj::vector_type   vector_type;

    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
    typedef iSinglet<vector_type>    Scalar_v;
    typedef iSinglet<scalar_type>    Scalar_s;

    int N_s = WW_sd.dimension(1);
    int N_d = WW_sd.dimension(2);

    GridBase *grid = vs[0]._grid;

    const int    nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
    int Nt = grid->GlobalDimensions()[orthogdim];

    dF2_trtr.resize(Nt,16);
    dF2_fig8.resize(Nt,16);
    for(int t=0;t<Nt;t++)
    for(int g=0;g<16;g++){
        dF2_trtr(t,g) = ComplexD(0.0);
        dF2_fig8(t,g) = ComplexD(0.0);
    }

    // std::vector<LatticePropagator> WWVV (Nt,grid);

    int fd = grid->_fdimensions[orthogdim];
    int ld = grid->_ldimensions[orthogdim];
    int rd = grid->_rdimensions[orthogdim];

    //////////////////////////////////////
    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    //////////////////////////////////////
    int MFrvol = rd*N_s*N_d;
    int MFlvol = ld*N_s*N_d;

    Vector<SpinMatrix_v> lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++){
        lvSum[r] = zero;
    }

    Vector<SpinMatrix_s> lsSum(MFlvol);
    parallel_for (int r = 0; r < MFlvol; r++){
        lsSum[r] = zero;
    }

    int e1     = grid->_slice_nblock[orthogdim];
    int e2     = grid->_slice_block [orthogdim];
    int stride = grid->_slice_stride[orthogdim];

    Eigen::Tensor<ComplexD,3> VgV_sd(N_s,N_d,16);          // trace with dirac structure
    Eigen::Tensor<ComplexD,4> VgV_sd_l(N_s,N_d,16,Nsimd);  // per SIMD lane, pre-extraction

    Gamma::Algebra gmu[] = {
        Gamma::Algebra::GammaX,       Gamma::Algebra::GammaY,       Gamma::Algebra::GammaZ,       Gamma::Algebra::GammaT,
        Gamma::Algebra::GammaXGamma5, Gamma::Algebra::GammaYGamma5, Gamma::Algebra::GammaZGamma5, Gamma::Algebra::GammaTGamma5,
        Gamma::Algebra::Identity,     Gamma::Algebra::Gamma5,
        Gamma::Algebra::SigmaXY,      Gamma::Algebra::SigmaXZ,      Gamma::Algebra::SigmaXT,
        Gamma::Algebra::SigmaYZ,      Gamma::Algebra::SigmaYT,      Gamma::Algebra::SigmaZT };
    Gamma G5g(Gamma::Algebra::Gamma5);
    int Ng = 16;

    LOG(Message) << "Computing A2A DeltaF=2 graph entering site loop" << std::endl;

    double t_tot    = 0;
    double t_vv     = 0;
    double t_extr   = 0;
    double t_transp = 0;
    double t_WW     = 0;
    double t_trtr   = 0;
    double t_fig8   = 0;

    t_tot -= usecond();
    for(int r=0;r<rd;r++){

        int so = r*grid->_ostride[orthogdim]; // base offset for start of plane

        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++){

            int ss = so + n*stride + b;

            //////////////////////////////////////////////////////////////
            // Colour-traced outer product v_s (x) v^dag_d, traced against
            // all Ng gamma structures, still SIMD vectorised
            //////////////////////////////////////////////////////////////
            Vector<Scalar_v> VgV_v(N_s*N_d*Ng);

            t_vv -= usecond();
            parallel_for(int s=0;s<N_s;s++){
                auto vvs = vs[s]._odata[ss];
                for(int d=0;d<N_d;d++){
                    auto vvd = conjugate(vd[d]._odata[ss]);

                    SpinMatrix_v vv;
                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++){
                        vv()(s1,s2)() = vvs()(s1)(0)*vvd()(s2)(0)
                                      + vvs()(s1)(1)*vvd()(s2)(1)
                                      + vvs()(s1)(2)*vvd()(s2)(2);
                    }

                    // trace against the 16 structures; the hand-unrolled
                    // traceGamma helpers above can be substituted here for speed
                    for(int g=0;g<Ng;g++){
                        Gamma Gg(gmu[g]);
                        VgV_v[g+Ng*(d+N_d*s)] = trace(G5g*(Gg*vv));
                    }
                }
            }
            t_vv += usecond();

            //////////////////////////////////////////////////////////////
            // Split the SIMD lanes out; each lane is a distinct 4d site
            //////////////////////////////////////////////////////////////
            t_extr -= usecond();
            parallel_for(int s=0;s<N_s;s++){
                std::vector<Scalar_s> extracted(Nsimd);
                Scalar_v temp;
                for(int d=0;d<N_d;d++)
                for(int g=0;g<Ng;g++){
                    temp = VgV_v[g+Ng*(d+N_d*s)];
                    extract(temp,extracted);
                    for(int l=0;l<Nsimd;l++){
                        VgV_sd_l(s,d,g,l) = TensorRemove(extracted[l]);
                    }
                }
            }
            t_extr += usecond();

            for(int l=0;l<Nsimd;l++){

                std::vector<int> icoor(nd);
                grid->iCoorFromIindex(icoor,l);
                int ttt = r + icoor[orthogdim]*rd;

                Eigen::MatrixXcd VgV(N_d,N_s);
                Eigen::MatrixXcd VgV_T(N_s,N_d);
                Eigen::MatrixXcd WW0(N_s,N_d);
                Eigen::MatrixXcd WW1(N_s,N_d);

                for(int g=0;g<Ng;g++){

                    /////////////////////////////////////////
                    // Single site VgV , scalar order
                    /////////////////////////////////////////
                    t_transp -= usecond();
                    parallel_for(int d=0;d<N_d;d++){
                        for(int s=0;s<N_s;s++){
                            VgV  (d,s) = VgV_sd_l(s,d,g,l);
                            VgV_T(s,d) = VgV(d,s);
                        }
                    }
                    t_transp += usecond();

                    for(int t0=0;t0<Nt;t0++){

                        int t1 = (t0+dt)%Nt;
                        int tt = (ttt-t0+Nt)%Nt; // operator time relative to first wall

                        t_WW -= usecond();
                        parallel_for(int s=0;s<N_s;s++){
                            for(int d=0;d<N_d;d++){
                                WW0(s,d) = WW_sd(t0,s,d);
                                WW1(s,d) = WW_sd(t1,s,d);
                            }
                        }
                        t_WW += usecond();

                        t_trtr -= usecond();
                        dF2_trtr(tt,g) += (WW0*VgV).trace() * (WW1*VgV).trace();
                        t_trtr += usecond();

                        t_fig8 -= usecond();
                        dF2_fig8(tt,g) += (WW0*VgV*WW1*VgV).trace();
                        t_fig8 += usecond();
                    }
                }
            }
        }
    }

    grid->GlobalSumVector(&dF2_fig8(0,0),Nt*Ng);
    grid->GlobalSumVector(&dF2_trtr(0,0),Nt*Ng);
    t_tot += usecond();

    LOG(Message) << "Computing A2A DeltaF=2 graph t_tot    " << t_tot    << " us " << std::endl;
    LOG(Message) << "Computing A2A DeltaF=2 graph t_vv     " << t_vv     << " us " << std::endl;
    LOG(Message) << "Computing A2A DeltaF=2 graph t_extr   " << t_extr   << " us " << std::endl;
    LOG(Message) << "Computing A2A DeltaF=2 graph t_transp " << t_transp << " us " << std::endl;
    LOG(Message) << "Computing A2A DeltaF=2 graph t_WW     " << t_WW     << " us " << std::endl;
    LOG(Message) << "Computing A2A DeltaF=2 graph t_trtr   " << t_trtr   << " us " << std::endl;
    LOG(Message) << "Computing A2A DeltaF=2 graph t_fig8   " << t_fig8   << " us " << std::endl;
}
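//////////////////////////////////////////////////////////////////////////////
// Usage sketch: given a WW meson field from PionFieldWW and the v vectors of
// the two flavours, both Wick topologies for a wall separation dT come back
// as Nt x 16 tensors (this mirrors the self-test in execute() below):
//
//   Eigen::Tensor<ComplexD,2> fig8(nt,16), trtr(nt,16);
//   DeltaFeq2(dT, dT, fig8, trtr, pionFieldWW_ij, &vi[0], &vj[0], Tp);
//////////////////////////////////////////////////////////////////////////////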
// WW is w_s^dag (x) w_d (G5 implicitly absorbed)
//
// WWVV will have spin-col (x) spin-col tensor.
//
// Want this to look like a strange fwd prop, anti-d prop.
//
// Take WW_sd v^dag_d (x) v_s
//
template <typename FImpl>
void TA2APionField<FImpl>::DeltaFeq2_alt(int dt_min, int dt_max,
                                         Eigen::Tensor<ComplexD,2> &dF2_fig8,
                                         Eigen::Tensor<ComplexD,2> &dF2_trtr,
                                         Eigen::Tensor<ComplexD,1> &den0,
                                         Eigen::Tensor<ComplexD,1> &den1,
                                         Eigen::Tensor<ComplexD,3> &WW_sd,
                                         const LatticeFermion *vs,
                                         const LatticeFermion *vd,
                                         int orthogdim)
{
    LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;

    int dt = dt_min; // HACK ; should loop over dt

    auto G5 = Gamma(Gamma::Algebra::Gamma5);

    typedef typename FImpl::SiteSpinor   vobj;
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_type   scalar_type;
    typedef typename vobj::vector_type   vector_type;

    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
    typedef iSinglet<vector_type>    Scalar_v;
    typedef iSinglet<scalar_type>    Scalar_s;

    int N_s = WW_sd.dimension(1);
    int N_d = WW_sd.dimension(2);

    GridBase *grid = vs[0]._grid;

    const int    nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
    int N_t = grid->GlobalDimensions()[orthogdim];

    double vol = 1.0;
    for(int dim=0;dim<nd;dim++){
        vol = vol * grid->GlobalDimensions()[dim];
    }
    double nodes = grid->NodeCount();

    dF2_trtr.resize(N_t,16);
    dF2_fig8.resize(N_t,16);
    den0.resize(N_t);
    den1.resize(N_t);
    for(int t=0;t<N_t;t++){
        for(int g=0;g<16;g++){
            dF2_trtr(t,g) = ComplexD(0.0);
            dF2_fig8(t,g) = ComplexD(0.0);
        }
        den0(t) = ComplexD(0.0);
        den1(t) = ComplexD(0.0);
    }

    // <PP> correlator from each wall
    LatticeComplex D0(grid);
    LatticeComplex D1(grid);

    LatticeComplex O1_trtr(grid);
    LatticeComplex O2_trtr(grid);
    LatticeComplex O3_trtr(grid);
    LatticeComplex O4_trtr(grid);
    LatticeComplex O5_trtr(grid);

    LatticeComplex O1_fig8(grid);
    LatticeComplex O2_fig8(grid);
    LatticeComplex O3_fig8(grid);
    LatticeComplex O4_fig8(grid);
    LatticeComplex O5_fig8(grid);

    O1_trtr = zero; O2_trtr = zero; O3_trtr = zero; O4_trtr = zero; O5_trtr = zero;
    O1_fig8 = zero; O2_fig8 = zero; O3_fig8 = zero; O4_fig8 = zero; O5_fig8 = zero;
    D0 = zero;
    D1 = zero;

    double t_tot = -usecond();

    std::vector<LatticePropagator> WWVV (N_t,grid);
    for(int t=0;t<N_t;t++){
        WWVV[t] = zero;
    }

    ///////////////////////////////////////////////////////////////
    // Absorb the external G5 into the strange-quark v's up front
    ///////////////////////////////////////////////////////////////
    std::vector<LatticeFermion> vs_g5(N_s,grid);
    parallel_for(int ss=0;ss<grid->oSites();ss++){
        for(int s=0;s<N_s;s++){
            vs_g5[s]._odata[ss] = G5*vs[s]._odata[ss];
        }
    }

    ///////////////////////////////////////////////////////////////
    // Build WWVV[t] = sum_sd WW_sd(t) v_s (x) v^dag_d, site by site
    ///////////////////////////////////////////////////////////////
    parallel_for(int ss=0;ss<grid->oSites();ss++){
        for(int d_o=0;d_o<N_d;d_o+=d_unroll){
            for(int t=0;t<N_t;t++)
            for(int s=0;s<N_s;s++){

                auto tmp1 = vs_g5[s]._odata[ss];
                vobj tmp2 = zero;
                vobj tmp3;

                // sum the d modes in blocks of d_unroll
                for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
                    Scalar_v coeff;
                    coeff()()() = WW_sd(t,s,d);
                    tmp3 = conjugate(vd[d]._odata[ss]);
                    mac(&tmp2, &coeff, &tmp3);
                }

                //////////////////////////
                // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
                //////////////////////////
                for(int s1=0;s1<Ns;s1++)
                for(int s2=0;s2<Ns;s2++){
                    WWVV[t]._odata[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
                    WWVV[t]._odata[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
                    WWVV[t]._odata[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
                    WWVV[t]._odata[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
                    WWVV[t]._odata[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
                    WWVV[t]._odata[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
                    WWVV[t]._odata[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
                    WWVV[t]._odata[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
                    WWVV[t]._odata[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
                }
            }
        }
    }

    std::vector<TComplex> C1;
    std::vector<TComplex> C2;
    std::vector<TComplex> C3;
    std::vector<TComplex> C4;
    std::vector<TComplex> C5;

    //////////////////////////////////////////////////////////
    // Could do AA, VV, SS, PP and form linear combinations later.
    // Almost 2x. but for many modes, the above loop dominates.
    //////////////////////////////////////////////////////////
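    //////////////////////////////////////////////////////////
    // Restating the construction above in one line (sketch): per site x,
    //   WWVV[t](x)_{s1 c1, s2 c2} = sum_{s,d} [g5 v_s(x)]_{s1 c1} WW(t)_{sd} [v_d(x)]^*_{s2 c2}
    // so trace(G * WWVV[t]) below yields the "trtr" pieces and
    // trace(WWVV[t0] * G * WWVV[t1] * G) the "fig8" pieces.
    //////////////////////////////////////////////////////////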
    double t_contr = -usecond();

    for(int t0=0;t0<N_t;t0++){

        int t1 = (t0+dt)%N_t;

        parallel_for(int ss=0;ss<grid->oSites();ss++){

            auto VV0 = WWVV[t0]._odata[ss];
            auto VV1 = WWVV[t1]._odata[ss];

            // Tr Tr Wick contraction
            auto VX = Gamma(Gamma::Algebra::GammaX);
            auto VY = Gamma(Gamma::Algebra::GammaY);
            auto VZ = Gamma(Gamma::Algebra::GammaZ);
            auto VT = Gamma(Gamma::Algebra::GammaT);

            auto AX = Gamma(Gamma::Algebra::GammaXGamma5);
            auto AY = Gamma(Gamma::Algebra::GammaYGamma5);
            auto AZ = Gamma(Gamma::Algebra::GammaZGamma5);
            auto AT = Gamma(Gamma::Algebra::GammaTGamma5);

            auto S  = Gamma(Gamma::Algebra::Identity);
            auto P  = Gamma(Gamma::Algebra::Gamma5);

            auto T0 = Gamma(Gamma::Algebra::SigmaXY);
            auto T1 = Gamma(Gamma::Algebra::SigmaXZ);
            auto T2 = Gamma(Gamma::Algebra::SigmaXT);
            auto T3 = Gamma(Gamma::Algebra::SigmaYZ);
            auto T4 = Gamma(Gamma::Algebra::SigmaYT);
            auto T5 = Gamma(Gamma::Algebra::SigmaZT);

            O1_trtr._odata[ss] = trace(VX*VV0) * trace(VX*VV1)
                               + trace(VY*VV0) * trace(VY*VV1)
                               + trace(VZ*VV0) * trace(VZ*VV1)
                               + trace(VT*VV0) * trace(VT*VV1)
                               + trace(AX*VV0) * trace(AX*VV1)
                               + trace(AY*VV0) * trace(AY*VV1)
                               + trace(AZ*VV0) * trace(AZ*VV1)
                               + trace(AT*VV0) * trace(AT*VV1);

            O2_trtr._odata[ss] = trace(VX*VV0) * trace(VX*VV1)
                               + trace(VY*VV0) * trace(VY*VV1)
                               + trace(VZ*VV0) * trace(VZ*VV1)
                               + trace(VT*VV0) * trace(VT*VV1)
                               - trace(AX*VV0) * trace(AX*VV1)
                               - trace(AY*VV0) * trace(AY*VV1)
                               - trace(AZ*VV0) * trace(AZ*VV1)
                               - trace(AT*VV0) * trace(AT*VV1);

            O3_trtr._odata[ss] = trace(S*VV0) * trace(S*VV1)
                               + trace(P*VV0) * trace(P*VV1);

            O4_trtr._odata[ss] = trace(S*VV0) * trace(S*VV1)
                               - trace(P*VV0) * trace(P*VV1);

            O5_trtr._odata[ss] = trace(T0*VV0) * trace(T0*VV1)
                               + trace(T1*VV0) * trace(T1*VV1)
                               + trace(T2*VV0) * trace(T2*VV1)
                               + trace(T3*VV0) * trace(T3*VV1)
                               + trace(T4*VV0) * trace(T4*VV1)
                               + trace(T5*VV0) * trace(T5*VV1);

            ////////////////////////////////////
            // Fig8 Wick contraction
            ////////////////////////////////////
            // was (UKhadron) Wick2 = trace( Qs0 * Gamma(G5) * anti_d0 * G * Qs1 * Gamma(G5) * anti_d1 * G );
            //
            // = trace( [ Vs WW_sd'[t0] Vd'^dag ] G [ Vs' WW_s'd[t1] Vd^dag ] G )
            // = trace( WWVV * G * WWVV * G )
            //
            // Can do VV and AA separately and then add/sub later cheaper

            O1_fig8._odata[ss] = trace (VV0 * VX * VV1 * VX)
                               + trace (VV0 * VY * VV1 * VY)
                               + trace (VV0 * VZ * VV1 * VZ)
                               + trace (VV0 * VT * VV1 * VT)
                               + trace (VV0 * AX * VV1 * AX)
                               + trace (VV0 * AY * VV1 * AY)
                               + trace (VV0 * AZ * VV1 * AZ)
                               + trace (VV0 * AT * VV1 * AT);

            O2_fig8._odata[ss] = trace (VV0 * VX * VV1 * VX)
                               + trace (VV0 * VY * VV1 * VY)
                               + trace (VV0 * VZ * VV1 * VZ)
                               + trace (VV0 * VT * VV1 * VT)
                               - trace (VV0 * AX * VV1 * AX)
                               - trace (VV0 * AY * VV1 * AY)
                               - trace (VV0 * AZ * VV1 * AZ)
                               - trace (VV0 * AT * VV1 * AT);

            O3_fig8._odata[ss] = trace (VV0 * S * VV1 * S)
                               + trace (VV0 * P * VV1 * P);

            O4_fig8._odata[ss] = trace (VV0 * S * VV1 * S)
                               - trace (VV0 * P * VV1 * P);

            O5_fig8._odata[ss] = trace (VV0 * T0 * VV1 * T0)
                               + trace (VV0 * T1 * VV1 * T1)
                               + trace (VV0 * T2 * VV1 * T2)
                               + trace (VV0 * T3 * VV1 * T3)
                               + trace (VV0 * T4 * VV1 * T4)
                               + trace (VV0 * T5 * VV1 * T5);

            // Hack force PP correlator
            // D0._odata[ss] = trace(P*VV0); // These match signed off
            // D1._odata[ss] = trace(P*VV1);
            D0._odata[ss] = trace(AT*VV0);
            D1._odata[ss] = trace(AT*VV1);
        }

        sliceSum(O1_trtr,C1, orthogdim);
        sliceSum(O2_trtr,C2, orthogdim);
        sliceSum(O3_trtr,C3, orthogdim);
        sliceSum(O4_trtr,C4, orthogdim);
        sliceSum(O5_trtr,C5, orthogdim);

        for(int t=0;t<N_t;t++){
            int tt = (t-t0+N_t)%N_t; // operator time relative to the first wall
            dF2_trtr(tt,0) += TensorRemove(C1[t])/vol;
            dF2_trtr(tt,1) += TensorRemove(C2[t])/vol;
            dF2_trtr(tt,2) += TensorRemove(C3[t])/vol;
            dF2_trtr(tt,3) += TensorRemove(C4[t])/vol;
            dF2_trtr(tt,4) += TensorRemove(C5[t])/vol;
        }

        sliceSum(O1_fig8,C1, orthogdim);
        sliceSum(O2_fig8,C2, orthogdim);
        sliceSum(O3_fig8,C3, orthogdim);
        sliceSum(O4_fig8,C4, orthogdim);
        sliceSum(O5_fig8,C5, orthogdim);

        for(int t=0;t<N_t;t++){
            int tt = (t-t0+N_t)%N_t;
            dF2_fig8(tt,0) += TensorRemove(C1[t])/vol;
            dF2_fig8(tt,1) += TensorRemove(C2[t])/vol;
            dF2_fig8(tt,2) += TensorRemove(C3[t])/vol;
            dF2_fig8(tt,3) += TensorRemove(C4[t])/vol;
            dF2_fig8(tt,4) += TensorRemove(C5[t])/vol;
        }

        sliceSum(D0,C1, orthogdim);
        sliceSum(D1,C2, orthogdim);

        for(int t=0;t<N_t;t++){
            int tt = (t-t0+N_t)%N_t;
            den0(tt) += TensorRemove(C1[t])/vol;
            den1(tt) += TensorRemove(C2[t])/vol;
        }
    }
    t_contr += usecond();
    t_tot   += usecond();

    LOG(Message) << "Computing A2A DeltaF=2 graph t_tot   " << t_tot   << " us " << std::endl;
    LOG(Message) << "Computing A2A DeltaF=2 graph t_contr " << t_contr << " us " << std::endl;
}
par().A2A_i + "_class"); auto &a2a_j = envGet(A2ABase, par().A2A_j + "_class"); /////////////////////////////////////////////// // Square assumption for now Nl = Nr = N /////////////////////////////////////////////// int nt = env().getDim(Tp); int nx = env().getDim(Xp); int ny = env().getDim(Yp); int nz = env().getDim(Zp); // int N_i = a2a_i.par().N; // int N_j = a2a_j.par().N; int N_i = a2a_i.getN(); int N_j = a2a_j.getN(); int nmom=par().Nmom; int schurBlock = par().schurBlock; int cacheBlock = par().cacheBlock; /////////////////////////////////////////////// // Momentum setup /////////////////////////////////////////////// GridBase *grid = env().getGrid(1); std::vector phases(nmom,grid); for(int m=0;m pionFieldWVmom_ij (nmom,nt,N_i,N_j); Eigen::Tensor pionFieldWV_ij (nt,N_i,N_j); Eigen::Tensor pionFieldWVmom_ji (nmom,nt,N_j,N_i); Eigen::Tensor pionFieldWV_ji (nt,N_j,N_i); LOG(Message) << "Rank for A2A PionField is " << N_i << " x "<, wi); envGetTmp(std::vector, vi); envGetTmp(std::vector, wj); envGetTmp(std::vector, vj); envGetTmp(FermionField, tmp_5d); LOG(Message) << "Finding v and w vectors " << std::endl; ////////////////////////////////////////////////////////////////////////// // i,j is first loop over SchurBlock factors reusing 5D matrices // ii,jj is second loop over cacheBlock factors for high perf contractoin // iii,jjj are loops within cacheBlock // Total index is sum of these i+ii+iii etc... ////////////////////////////////////////////////////////////////////////// double flops = 0.0; double bytes = 0.0; double vol = nx*ny*nz*nt; double vol3 = nx*ny*nz; double t_schur=0; double t_contr_vwm=0; double t_contr_vw=0; double t_contr_ww=0; double t_contr_vv=0; double tt0 = usecond(); for(int i=0;i pionFieldWVmomB_ij(nmom,nt,N_iii,N_jjj); Eigen::Tensor pionFieldWVmomB_ji(nmom,nt,N_jjj,N_iii); Eigen::Tensor pionFieldWVB_ij(nt,N_iii,N_jjj); Eigen::Tensor pionFieldWVB_ji(nt,N_jjj,N_iii); t_contr_vwm-=usecond(); PionFieldWVmom(pionFieldWVmomB_ij, &wi[ii], &vj[jj], phases,Tp); PionFieldWVmom(pionFieldWVmomB_ji, &wj[jj], &vi[ii], phases,Tp); t_contr_vwm+=usecond(); t_contr_vw-=usecond(); PionFieldWV(pionFieldWVB_ij, &wi[ii], &vj[jj],Tp); PionFieldWV(pionFieldWVB_ji, &wj[jj], &vi[ii],Tp); t_contr_vw+=usecond(); flops += vol * ( 2 * 8.0 + 6.0 + 8.0*nmom) * N_iii*N_jjj; bytes += vol * (12.0 * sizeof(Complex) ) * N_iii*N_jjj + vol * ( 2.0 * sizeof(Complex) *nmom ) * N_iii*N_jjj; /////////////////////////////////////////////////////////////// // Copy back to full meson field tensor /////////////////////////////////////////////////////////////// parallel_for_nest2(int iii=0;iii< N_iii;iii++) { for(int jjj=0;jjj< N_jjj;jjj++) { for(int m =0;m< nmom;m++) { for(int t =0;t< nt;t++) { pionFieldWVmom_ij(m,t,i+ii+iii,j+jj+jjj) = pionFieldWVmomB_ij(m,t,iii,jjj); pionFieldWVmom_ji(m,t,j+jj+jjj,i+ii+iii) = pionFieldWVmomB_ji(m,t,jjj,iii); }} for(int t =0;t< nt;t++) { pionFieldWV_ij(t,i+ii+iii,j+jj+jjj) = pionFieldWVB_ij(t,iii,jjj); pionFieldWV_ji(t,j+jj+jjj,i+ii+iii) = pionFieldWVB_ji(t,jjj,iii); } }} }} }} double nodes=grid->NodeCount(); double tt1 = usecond(); LOG(Message) << " Contraction of PionFields took "<<(tt1-tt0)/1.0e6<< " seconds " << std::endl; LOG(Message) << " Schur "<<(t_schur)/1.0e6<< " seconds " << std::endl; LOG(Message) << " Contr WVmom "<<(t_contr_vwm)/1.0e6<< " seconds " << std::endl; LOG(Message) << " Contr WV "<<(t_contr_vw)/1.0e6<< " seconds " << std::endl; double t_kernel = t_contr_vwm; LOG(Message) << " Arith "< 
    /////////////////////////////////////////////////////////////////////////
    // Test: pion correlator from the momentum-projected meson field
    // <PI_ij(t0) PI_ji(t0+t)>
    /////////////////////////////////////////////////////////////////////////
    std::vector<ComplexD> corrMom(nt,ComplexD(0.0));

    for(int i=0;i<N_i;i++)
    for(int j=0;j<N_j;j++){
        int m = 0; // zero momentum
        for(int t0=0;t0<nt;t0++)
        for(int t=0;t<nt;t++){
            int tt = (t0+t)%nt;
            corrMom[t] += pionFieldWVmom_ij(m,tt,i,j)*pionFieldWVmom_ji(m,t0,j,i);
        }
    }
    for(int t=0;t<nt;t++) corrMom[t] = corrMom[t]/(double)nt;
    for(int t=0;t<nt;t++) LOG(Message) << " C_wvmom(t) " << t << " " << corrMom[t] << std::endl;

    /////////////////////////////////////////////////////////////////////////
    // Test: same correlator from the g5-inserted WV field
    /////////////////////////////////////////////////////////////////////////
    std::vector<ComplexD> corr(nt,ComplexD(0.0));

    for(int i=0;i<N_i;i++)
    for(int j=0;j<N_j;j++){
        for(int t0=0;t0<nt;t0++)
        for(int t=0;t<nt;t++){
            int tt = (t0+t)%nt;
            corr[t] += pionFieldWV_ij(tt,i,j)*pionFieldWV_ji(t0,j,i);
        }
    }
    for(int t=0;t<nt;t++) corr[t] = corr[t]/(double)nt;
    for(int t=0;t<nt;t++) LOG(Message) << " C_wv(t) " << t << " " << corr[t] << std::endl;

    /////////////////////////////////////////////////////////////////////////
    // Test: WW / VV form of the correlator via g5-hermiticity
    /////////////////////////////////////////////////////////////////////////
    std::vector<ComplexD> corr_wwvv(nt,ComplexD(0.0));

    wi.resize(N_i,grid);
    vi.resize(N_i,grid);
    wj.resize(N_j,grid);
    vj.resize(N_j,grid);

    for(int i =0;i < N_i;i++) a2a_i.return_v(i, tmp_5d, vi[i]);
    for(int i =0;i < N_i;i++) a2a_i.return_w(i, tmp_5d, wi[i]);
    for(int j =0;j < N_j;j++) a2a_j.return_v(j, tmp_5d, vj[j]);
    for(int j =0;j < N_j;j++) a2a_j.return_w(j, tmp_5d, wj[j]);

    Eigen::Tensor<ComplexD,3> pionFieldWW_ij(nt,N_i,N_j);
    Eigen::Tensor<ComplexD,3> pionFieldVV_ji(nt,N_j,N_i);
    Eigen::Tensor<ComplexD,3> pionFieldWW_ji(nt,N_j,N_i);
    Eigen::Tensor<ComplexD,3> pionFieldVV_ij(nt,N_i,N_j);

    PionFieldWW(pionFieldWW_ij, &wi[0], &wj[0], Tp);
    PionFieldVV(pionFieldVV_ji, &vj[0], &vi[0], Tp);
    PionFieldWW(pionFieldWW_ji, &wj[0], &wi[0], Tp);
    PionFieldVV(pionFieldVV_ij, &vi[0], &vj[0], Tp);

    for(int i=0;i<N_i;i++)
    for(int j=0;j<N_j;j++){
        for(int t0=0;t0<nt;t0++)
        for(int t=0;t<nt;t++){
            int tt = (t0+t)%nt;
            corr_wwvv[t] += pionFieldWW_ij(t0,i,j)*pionFieldVV_ji(tt,j,i);
        }
    }
    for(int t=0;t<nt;t++) corr_wwvv[t] = corr_wwvv[t]/(double)nt;
    for(int t=0;t<nt;t++) LOG(Message) << " C_wwvv(t) " << t << " " << corr_wwvv[t] << std::endl;

    /////////////////////////////////////////////////////////////////////////
    // Test: same-noise (diagonal) Z2 wall version; biased estimator
    /////////////////////////////////////////////////////////////////////////
    std::vector<ComplexD> corr_z2(nt,ComplexD(0.0));

    Eigen::Tensor<ComplexD,3> pionFieldWW(nt,N_i,N_i);
    Eigen::Tensor<ComplexD,3> pionFieldVV(nt,N_i,N_i);

    PionFieldWW(pionFieldWW, &wi[0], &wi[0], Tp);
    PionFieldVV(pionFieldVV, &vi[0], &vi[0], Tp);

    for(int i=0;i<N_i;i++)
    for(int j=0;j<N_i;j++){
        for(int t0=0;t0<nt;t0++)
        for(int t=0;t<nt;t++){
            int tt = (t0+t)%nt;
            corr_z2[t] += pionFieldWW(t0,i,j)*pionFieldVV(tt,j,i);
        }
    }
    for(int t=0;t<nt;t++) corr_z2[t] = corr_z2[t]/(double)nt;
    for(int t=0;t<nt;t++) LOG(Message) << " C_z2(t) " << t << " " << corr_z2[t] << std::endl;

    /////////////////////////////////////////////////////////////////////////
    // Test: DeltaF=2 contraction from the WW meson field
    /////////////////////////////////////////////////////////////////////////
    Eigen::Tensor<ComplexD,2> DeltaF2_fig8(nt,16);
    Eigen::Tensor<ComplexD,2> DeltaF2_trtr(nt,16);
    Eigen::Tensor<ComplexD,1> denom0(nt);
    Eigen::Tensor<ComplexD,1> denom1(nt);

    const int dT = 16;

    DeltaFeq2_alt(dT, dT, DeltaF2_fig8, DeltaF2_trtr,
                  denom0, denom1,
                  pionFieldWW_ij, &vi[0], &vj[0], Tp);

    for(int t=0;t<nt;t++){
        LOG(Message) << " DeltaF2 " << t
                     << " fig8 " << DeltaF2_fig8(t,0)
                     << " trtr " << DeltaF2_trtr(t,0)
                     << " den0 " << denom0(t)
                     << " den1 " << denom1(t) << std::endl;
    }

    /////////////////////////////////////////////////////////////////////////
    // Cross check against propagator-based contraction a la UKhadron wallbag
    /////////////////////////////////////////////////////////////////////////
    LatticePropagator Qd0(grid);
    LatticePropagator Qd1(grid);
    LatticePropagator Qs0(grid);
    LatticePropagator Qs1(grid);

    for(int s=0;s<Ns;s++){
        for(int c=0;c<Nc;c++){
            int idx0 = c+Nc*s;    // first 12 modes  : first wall source
            int idx1 = c+Nc*s+12; // second 12 modes : second wall source
            FermToProp<FImpl>(Qd0, vi[idx0], s, c);
            FermToProp<FImpl>(Qd1, vi[idx1], s, c);
            FermToProp<FImpl>(Qs0, vj[idx0], s, c);
            FermToProp<FImpl>(Qs1, vj[idx1], s, c);
        }
    }

    std::vector<Gamma::Algebra> gammas ( {
        Gamma::Algebra::GammaX,
        Gamma::Algebra::GammaY,
        Gamma::Algebra::GammaZ,
        Gamma::Algebra::GammaT,
        Gamma::Algebra::GammaXGamma5,
        Gamma::Algebra::GammaYGamma5,
        Gamma::Algebra::GammaZGamma5,
        Gamma::Algebra::GammaTGamma5,
        Gamma::Algebra::Identity,
        Gamma::Algebra::Gamma5,
        Gamma::Algebra::SigmaXY,
        Gamma::Algebra::SigmaXZ,
        Gamma::Algebra::SigmaXT,
        Gamma::Algebra::SigmaYZ,
        Gamma::Algebra::SigmaYT,
        Gamma::Algebra::SigmaZT
    });

    auto G5 = Gamma::Algebra::Gamma5;

    LatticePropagator anti_d0 = adj( Gamma(G5) * Qd0 * Gamma(G5));
    LatticePropagator anti_d1 = adj( Gamma(G5) * Qd1 * Gamma(G5));

    LatticeComplex TR1(grid);
    LatticeComplex TR2(grid);
    LatticeComplex Wick1(grid);
    LatticeComplex Wick2(grid);

    LatticePropagator PR1(grid);
    LatticePropagator PR2(grid);
    PR1 = Qs0 * Gamma(G5) * anti_d0;
    PR2 = Qs1 * Gamma(G5) * anti_d1;

    for(int g=0;g<(int)gammas.size();g++){

        Gamma G1 = Gamma(gammas[g]);
        Gamma G2 = Gamma(gammas[g]);

        TR1 = trace( PR1 * G1 );
        TR2 = trace( PR2 * G2 );
        Wick1 = TR1 * TR2;

        Wick2 = trace( PR1 * G1 * PR2 * G2 );

        std::vector<TComplex> C1;
        std::vector<TComplex> C2;
        std::vector<TComplex> C3;
        sliceSum(Wick1,C1, Tp);
        sliceSum(Wick2,C2, Tp);
        sliceSum(TR1  ,C3, Tp);

        if (g<5){
            for(int t=0;t<nt;t++){
                LOG(Message) << " Wick1["<<g<<"] " << t << " " << C1[t] << std::endl;
            }
            for(int t=0;t<nt;t++){
                LOG(Message) << " Wick2["<<g<<"] " << t << " " << C2[t] << std::endl;
            }
        }
    }
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_MContraction_A2APionField_hpp_