diff --git a/extras/Hadrons/Modules/MContraction/A2APionField.hpp b/extras/Hadrons/Modules/MContraction/A2APionField.hpp
index da3a2137..0a867909 100644
--- a/extras/Hadrons/Modules/MContraction/A2APionField.hpp
+++ b/extras/Hadrons/Modules/MContraction/A2APionField.hpp
@@ -57,21 +57,13 @@ class TA2APionField : public Module
     virtual void DeltaFeq2(int dt_min,int dt_max,
                            Eigen::Tensor<ComplexD,2> &dF2_fig8,
                            Eigen::Tensor<ComplexD,2> &dF2_trtr,
+                           Eigen::Tensor<ComplexD,1> &den0,
+                           Eigen::Tensor<ComplexD,1> &den1,
                            Eigen::Tensor<ComplexD,3> &WW_sd,
                            const LatticeFermion *vs,
                            const LatticeFermion *vd,
                            int orthogdim);
-    virtual void DeltaFeq2_alt(int dt_min,int dt_max,
-                               Eigen::Tensor<ComplexD,2> &dF2_fig8,
-                               Eigen::Tensor<ComplexD,2> &dF2_trtr,
-                               Eigen::Tensor<ComplexD,1> &den0,
-                               Eigen::Tensor<ComplexD,1> &den1,
-                               Eigen::Tensor<ComplexD,3> &WW_sd,
-                               const LatticeFermion *vs,
-                               const LatticeFermion *vd,
-                               int orthogdim);
-
     ///////////////////////////////////////
     // Arithmetic help. Move to Grid??
     ///////////////////////////////////////
@@ -670,300 +662,9 @@ inline iScalar traceGammaXGamma5(const iMatrix &rhs)
 // Fig8(tx) = Wick2 = sum_x WW[t0]_sd WW[t1]_s'd' < v^_d |g5 G| v_s'> < v^_d' |g5 G| v_s> |_{x,tx}
 //
 //          = sum_x Trace( WW[t0] VV[t,x] WW[t1] VV[t,x] )
-//
-// Might as well form Ns x Nj x Ngamma matrix
 //
 ///////////////////////////////////////////////////////////////////////////////
-
-template <typename FImpl>
-void TA2APionField<FImpl>::DeltaFeq2(int dt_min,int dt_max,
-                                     Eigen::Tensor<ComplexD,2> &dF2_fig8,
-                                     Eigen::Tensor<ComplexD,2> &dF2_trtr,
-                                     Eigen::Tensor<ComplexD,3> &WW_sd,
-                                     const LatticeFermion *vs,
-                                     const LatticeFermion *vd,
-                                     int orthogdim)
-{
-  LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;
-
-  int dt = dt_min; // HACK ; should loop over dt
-
-  typedef typename FImpl::SiteSpinor vobj;
-
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  typedef iSpinMatrix<vector_type> SpinMatrix_v;
-  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-  typedef iSinglet<vector_type>    Scalar_v;
-  typedef iSinglet<scalar_type>    Scalar_s;
-
-  int N_s = WW_sd.dimension(1);
-  int N_d = WW_sd.dimension(2);
-
-  GridBase *grid = vs[0]._grid;
-
-  const int nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-  int Nt = grid->GlobalDimensions()[orthogdim];
-
-  dF2_trtr.resize(Nt,16);
-  dF2_fig8.resize(Nt,16);
-  for(int t=0;t WWVV (Nt,grid)
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  //////////////////////////////////////
-  // will locally sum vectors first
-  // sum across these down to scalars
-  // splitting the SIMD
-  //////////////////////////////////////
-  int MFrvol = rd*N_s*N_d;
-  int MFlvol = ld*N_s*N_d;
-
-  Vector lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
-
-  Vector lsSum(MFlvol);
-  parallel_for (int r = 0; r < MFlvol; r++){
-    lsSum[r]=scalar_type(0.0);
-  }
-
-  int e1= grid->_slice_nblock[orthogdim];
-  int e2= grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  Eigen::Tensor VgV_sd(N_s,N_d,16); // trace with dirac structure
-  Eigen::Tensor VgV_sd_l(N_s,N_d,16,Nsimd);
-  int Ng;
-
-  LOG(Message) << "Computing A2A DeltaF=2 graph entering site loop" << std::endl;
-
-  double t_tot   =0;
-  double t_vv    =0;
-  double t_extr  =0;
-  double t_transp=0;
-  double t_WW    =0;
-  double t_trtr  =0;
-  double t_fig8  =0;
-
-  t_tot -=usecond();
-
-  for(int r=0;r_ostride[orthogdim]; // base offset for start of plane
-
-  for(int n=0;n extracted(Nsimd);
-  Scalar_v temp;
-  for(int s=0;s icoor(nd);
-  grid->iCoorFromIindex(icoor,l);
-  int ttt = r+icoor[orthogdim]*rd;
-
-  Eigen::MatrixXcd VgV(N_d,N_s);
-  Eigen::MatrixXcd VgV_T(N_s,N_d);
-  Eigen::MatrixXcd WW0(N_s,N_d);
-  Eigen::MatrixXcd WW1(N_s,N_d);
-
-  /////////////////////////////////////////
-  // Single site VgV , scalar order
-  /////////////////////////////////////////
-  t_transp -=usecond();
-  parallel_for(int d=0;dGlobalSumVector(&dF2_fig8(0),Nt*Ng);
-  grid->GlobalSumVector(&dF2_trtr(0),Nt*Ng);
-  t_tot +=usecond();
-
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_tot " << t_tot << " us "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_vv " << t_vv << " us "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_extr " << t_extr << " us "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_transp " << t_transp << " us "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_WW " << t_WW << " us "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_trtr " << t_trtr << " us "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_fig8 " << t_fig8 << " us "<< std::endl;
-
-}
-
+//
 // WW is w_s^dag (x) w_d (G5 implicitly absorbed)
 //
 // WWVV will have spin-col (x) spin-col tensor.
@@ -975,19 +676,19 @@ void TA2APionField<FImpl>::DeltaFeq2(int dt_min,int dt_max,
 //
 template <typename FImpl>
-void TA2APionField<FImpl>::DeltaFeq2_alt(int dt_min,int dt_max,
-                                         Eigen::Tensor<ComplexD,2> &dF2_fig8,
-                                         Eigen::Tensor<ComplexD,2> &dF2_trtr,
-                                         Eigen::Tensor<ComplexD,1> &den0,
-                                         Eigen::Tensor<ComplexD,1> &den1,
-                                         Eigen::Tensor<ComplexD,3> &WW_sd,
-                                         const LatticeFermion *vs,
-                                         const LatticeFermion *vd,
-                                         int orthogdim)
+void TA2APionField<FImpl>::DeltaFeq2(int dt_min,int dt_max,
+                                     Eigen::Tensor<ComplexD,2> &dF2_fig8,
+                                     Eigen::Tensor<ComplexD,2> &dF2_trtr,
+                                     Eigen::Tensor<ComplexD,1> &den0,
+                                     Eigen::Tensor<ComplexD,1> &den1,
+                                     Eigen::Tensor<ComplexD,3> &WW_sd,
+                                     const LatticeFermion *vs,
+                                     const LatticeFermion *vd,
+                                     int orthogdim)
 {
   LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;
 
-  int dt = dt_min; // HACK ; should loop over dt
+  int dt = dt_min;
 
   auto G5 = Gamma(Gamma::Algebra::Gamma5);
 
@@ -1027,7 +728,6 @@ void TA2APionField<FImpl>::DeltaFeq2_alt(int dt_min,int dt_max,
     den1(t) =ComplexD(0.0);
   }
 
-
   LatticeComplex D0(grid); // correlator from each wall
   LatticeComplex D1(grid);
 
@@ -1064,70 +764,28 @@ void TA2APionField<FImpl>::DeltaFeq2_alt(int dt_min,int dt_max,
     WWVV[t] = zero;
   }
 
-  //////////////////////////////////////////////////////////
-  // Easily cache blocked and unrolled.
-  //////////////////////////////////////////////////////////
-  //
-  // Ways to speed up?? 20 min in 8^3 x 16 local volume with 400 modes.
-  //
-  // Flops are (cmul=6 + 1 (add))* 12*12 * N_s * N_d
-  // Bytes are read Nc x Ns x 2 + read/write Nc^2 x Ns^2 complex
-  //
+  //////////////////////////////////////////////////////////////////
+  // Method-5 - wrap this assembly in a distinct routine for reuse
+  //////////////////////////////////////////////////////////////////
   double t_outer= -usecond();
-  double t_outer_0 =0.0;
-  double t_outer_1 =0.0;
-
-
-#undef  METHOD_1
-#define METHOD_5
-
-#ifdef METHOD_1
-  LOG(Message) << "METHOD_1" << std::endl;
-  // Method-1
-  // i) calculate flop rate and data rate.
-  //    17 GF/s and 80 GB/s
-  // Dominated by accumulating the N_t summands
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    for(int s=0;s
 oSites();ss++){
     for(int d_o=0;d_o
 ::DeltaFeq2_alt(int dt_min,int dt_max,
 }}
 }
 }
-#endif
-
   t_outer+=usecond();

   //////////////////////////////
@@ -1328,15 +984,8 @@ void TA2APionField<FImpl>::DeltaFeq2_alt(int dt_min,int dt_max,
   double million=1.0e6;
   LOG(Message) << "Computing A2A DeltaF=2 graph t_tot " << t_tot /million << " s "<< std::endl;
   LOG(Message) << "Computing A2A DeltaF=2 graph t_outer " << t_outer /million << " s "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_outer_0 " << t_outer_0 /million << " s "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph t_outer_1 " << t_outer_1 /million << " s "<< std::endl;
   LOG(Message) << "Computing A2A DeltaF=2 graph t_contr " << t_contr /million << " s "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph mflops/s outer " << f_outer/t_outer << " MF/s "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph MB/s outer " << b_outer/t_outer << " MB/s "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph mflops/s outer/node " << f_outer/t_outer/nodes << " MF/s "<< std::endl;
-  LOG(Message) << "Computing A2A DeltaF=2 graph MB/s outer/node " << b_outer/t_outer/nodes << " MB/s "<< std::endl;
-
 }
@@ -1614,16 +1263,12 @@ void TA2APionField<FImpl>::execute(void)
     const int dT=16;
-    DeltaFeq2_alt (dT,dT,DeltaF2_fig8,DeltaF2_trtr,
-                   denom0,denom1,
-                   pionFieldWW_ij,&vi[0],&vj[0],Tp);
-
     for(int t=0;t
 ::execute(void)
       LOG(Message) << " Wick2["<["<
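Note, as an illustration outside the patch itself: the figure-8 (Wick2) contraction quoted in the comment block above, sum_x Trace( WW[t0] VV[t,x] WW[t1] VV[t,x] ), and the companion trace-trace term (named dF2_trtr in the code, presumably the product of the two single traces) can be modelled with plain dense matrices. The sketch below uses Eigen only; WW0, WW1 and VV are random stand-ins for WW[t0], WW[t1] and a single-site VV[t,x], the loop over the 16 gamma insertions is omitted, and the mode count N is an assumed illustrative value, not the module's actual layout.

    // Minimal single-site sketch (assumed square mode matrices):
    //   trtr = Tr( WW[t0] VV ) * Tr( WW[t1] VV )   (trace-trace summand, assumption)
    //   fig8 = Tr( WW[t0] VV WW[t1] VV )           (figure-8 summand, from the comment above)
    #include <Eigen/Dense>
    #include <complex>
    #include <iostream>

    int main(void)
    {
      const int N = 8; // illustrative number of A2A modes (assumption)
      Eigen::MatrixXcd WW0 = Eigen::MatrixXcd::Random(N,N); // stand-in for WW[t0]_sd
      Eigen::MatrixXcd WW1 = Eigen::MatrixXcd::Random(N,N); // stand-in for WW[t1]_s'd'
      Eigen::MatrixXcd VV  = Eigen::MatrixXcd::Random(N,N); // stand-in for < v^_d |g5 G| v_s > at one site

      std::complex<double> trtr = (WW0*VV).trace() * (WW1*VV).trace(); // trace-trace summand
      std::complex<double> fig8 = (WW0*VV*WW1*VV).trace();             // figure-8 summand

      std::cout << "trtr " << trtr << " fig8 " << fig8 << std::endl;
      return 0;
    }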