A2A Lepton-Meson Field contraction (commit to https://github.com/paboyle/Grid.git)
@@ -68,6 +68,16 @@ public:
        const std::vector<ComplexField> &emB1,
        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);

  template <typename TensorType>
  static void LeptonField(TensorType &mat,
        const FermionField *lhs_wi,
        const FermionField *rhs_vj,
        const std::vector<ComplexField> &lepton02,
        const std::vector<ComplexField> &lepton03,
        const std::vector<ComplexField> &lepton12,
        const std::vector<ComplexField> &lepton13,
        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);

  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
                           const Eigen::Tensor<ComplexD,3> &WW_sd,
                           const FermionField *vs,
@@ -809,6 +819,204 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
    if (t_gsum) *t_gsum += usecond();
}

// "Lepton" field w_i(x)^dag * L_mu * g^L_mu * v_j(x)
//
// With V-A current:
//
// g^L_mu = g_mu * (1 - g5)
//
// then in spin space
//
//                 ( 0   0   L_02   L_03 )
// L_mu * g^L_mu = ( 0   0   L_12   L_13 )
//                 ( 0   0   0      0    )
//                 ( 0   0   0      0    )
//
//
// With
//
// L_02 =  2*L3 + 2*i*L2
// L_03 = -2*L1 + 2*i*L0
// L_12 =  2*L1 + 2*i*L0
// L_13 =  2*L3 - 2*i*L2
//
// (where mu = (x,y,z,t) )
//
template <class FImpl>
template <typename TensorType>
void A2Autils<FImpl>::LeptonField(TensorType &mat,
          const FermionField *lhs_wi,
          const FermionField *rhs_vj,
          const std::vector<ComplexField> &lepton02,
          const std::vector<ComplexField> &lepton03,
          const std::vector<ComplexField> &lepton12,
          const std::vector<ComplexField> &lepton13,
          int orthogdim, double *t_kernel, double *t_gsum)
{
    typedef typename FermionField::vector_object vobj;
    typedef typename vobj::scalar_object         sobj;
    typedef typename vobj::scalar_type           scalar_type;
    typedef typename vobj::vector_type           vector_type;

    typedef iSpinMatrix<vector_type> SpinMatrix_v;
    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
    typedef iSinglet<vector_type>    Singlet_v;
    typedef iSinglet<scalar_type>    Singlet_s;

    int Lblock = mat.dimension(3);
    int Rblock = mat.dimension(4);

    GridBase *grid = lhs_wi[0]._grid;

    const int    Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();

    int Nt  = grid->GlobalDimensions()[orthogdim];

    int Nlep = lepton02.size();
    assert(lepton03.size() == Nlep);
    assert(lepton12.size() == Nlep);
    assert(lepton13.size() == Nlep);

    int fd=grid->_fdimensions[orthogdim];
    int ld=grid->_ldimensions[orthogdim];
    int rd=grid->_rdimensions[orthogdim];

    // will locally sum vectors first
    // sum across these down to scalars
    // splitting the SIMD
    int MFrvol = rd*Lblock*Rblock*Nlep;
    int MFlvol = ld*Lblock*Rblock*Nlep;

    Vector<vector_type> lvSum(MFrvol);
    parallel_for (int r = 0; r < MFrvol; r++)
    {
        lvSum[r] = zero;
    }

    Vector<scalar_type> lsSum(MFlvol);
    parallel_for (int r = 0; r < MFlvol; r++)
    {
        lsSum[r] = scalar_type(0.0);
    }
    int e1=    grid->_slice_nblock[orthogdim];
    int e2=    grid->_slice_block [orthogdim];
    int stride=grid->_slice_stride[orthogdim];

    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    if (t_kernel) *t_kernel = -usecond();
    parallel_for(int r=0;r<rd;r++)
    {
        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

        for(int n=0;n<e1;n++)
        for(int b=0;b<e2;b++)
        {
            int ss= so+n*stride+b;

            for(int i=0;i<Lblock;i++)
            {
                auto left = conjugate(lhs_wi[i]._odata[ss]);

                for(int j=0;j<Rblock;j++)
                {
                    SpinMatrix_v vv;
                    auto right = rhs_vj[j]._odata[ss];

                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++)
                    {
                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                        + left()(s2)(1) * right()(s1)(1)
                                        + left()(s2)(2) * right()(s1)(2);
                    }
                    // After getting the sitewise product do the lepton-field loop
                    int base = Nlep*i+Nlep*Lblock*j+Nlep*Lblock*Rblock*r;

                    for ( int m=0;m<Nlep;m++)
                    {
                        int idx  = m+base;

                        auto lep02  = lepton02[m]._odata[ss];
                        auto lep12  = lepton12[m]._odata[ss];
                        auto lep03  = lepton03[m]._odata[ss];
                        auto lep13  = lepton13[m]._odata[ss];

                        lvSum[idx] += vv()(2,0)()*lep02()()() + vv()(3,0)()*lep03()()()
                                      + vv()(2,1)()*lep12()()() + vv()(3,1)()*lep13()()();
                    }
                }
            }
        }
    }
    // Sum across simd lanes in the plane, breaking out orthog dir.
    parallel_for(int rt=0;rt<rd;rt++)
    {
        std::vector<int> icoor(Nd);
        std::vector<scalar_type> extracted(Nsimd);

        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
        for(int m=0;m<Nlep;m++)
        {

            int ij_rdx = m+Nlep*i+Nlep*Lblock*j+Nlep*Lblock*Rblock*rt;

            extract<vector_type,scalar_type>(lvSum[ij_rdx],extracted);
            for(int idx=0;idx<Nsimd;idx++)
            {
                grid->iCoorFromIindex(icoor,idx);

                int ldx    = rt+icoor[orthogdim]*rd;
                int ij_ldx = m+Nlep*i+Nlep*Lblock*j+Nlep*Lblock*Rblock*ldx;

                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
            }
        }
    }
    if (t_kernel) *t_kernel += usecond();

    // ld loop and local only??
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
    parallel_for_nest2(int lt=0;lt<ld;lt++)
    {
        for(int pt=0;pt<pd;pt++)
        {
            int t = lt + pt*ld;
            if (pt == pc)
            {
                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nlep;m++)
                {
                    int ij_dx = m+Nlep*i + Nlep*Lblock * j + Nlep*Lblock * Rblock * lt;

                    mat(m,0,t,i,j) = lsSum[ij_dx];
                }
            }
            else
            {
                const scalar_type zz(0.0);

                for(int i=0;i<Lblock;i++)
                for(int j=0;j<Rblock;j++)
                for(int m=0;m<Nlep;m++)
                {
                    mat(m,0,t,i,j) = zz;
                }
            }
        }
    }
    if (t_gsum) *t_gsum = -usecond();
    grid->GlobalSumVector(&mat(0,0,0,0,0),Nlep*Nt*Lblock*Rblock);
    if (t_gsum) *t_gsum += usecond();
}


////////////////////////////////////////////
// Schematic thoughts about more generalised four quark insertion
//
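
A minimal calling sketch for the new routine, not part of this commit: it assumes the old Grid API used in this file (Lattice::_grid and friends), WilsonImplR as the fermion implementation, caller-provided A2A vectors wi and vj, per-direction lepton fields L0..L3, and a time direction Tp; all of these names are illustrative. The four spin components are pre-combined exactly as in the header comment of LeptonField, and the result tensor is laid out as mat(m, 0, t, i, j), matching the indexing the kernel uses and the Lblock/Rblock sizes it reads from dimensions 3 and 4.

// Illustrative sketch only; identifiers wi, vj, L0..L3, Tp and the WilsonImplR choice are assumptions.
#include <Grid/Grid.h>
#include <Grid/qcd/utils/A2Autils.h>   // header added to in this commit; include path as in this repository

using namespace Grid;
using namespace Grid::QCD;

typedef A2Autils<WilsonImplR>  A2A;
typedef A2A::FermionField      FermionField;
typedef A2A::ComplexField      ComplexField;

Eigen::Tensor<ComplexD,5> leptonFieldExample(const std::vector<FermionField> &wi,
                                             const std::vector<FermionField> &vj,
                                             const std::vector<ComplexField> &L0,
                                             const std::vector<ComplexField> &L1,
                                             const std::vector<ComplexField> &L2,
                                             const std::vector<ComplexField> &L3,
                                             int Tp)
{
    GridBase *grid = L0[0]._grid;
    int       Nlep = L0.size();
    int       nt   = grid->GlobalDimensions()[Tp];
    ComplexD  two(2., 0.), twoi(0., 2.);

    // Pre-combine the four Lorentz components L0..L3 into the four non-zero spin
    // entries of L_mu * g^L_mu, following the combinations in the header comment.
    std::vector<ComplexField> lep02, lep03, lep12, lep13;
    ComplexField tmp(grid);
    for (int m = 0; m < Nlep; m++)
    {
        tmp = two*L3[m] + twoi*L2[m];  lep02.push_back(tmp);   // L_02 =  2*L3 + 2*i*L2
        tmp = twoi*L0[m] - two*L1[m];  lep03.push_back(tmp);   // L_03 = -2*L1 + 2*i*L0
        tmp = two*L1[m] + twoi*L0[m];  lep12.push_back(tmp);   // L_12 =  2*L1 + 2*i*L0
        tmp = two*L3[m] - twoi*L2[m];  lep13.push_back(tmp);   // L_13 =  2*L3 - 2*i*L2
    }

    // mat is indexed as mat(m, 0, t, i, j); LeptonField reads Lblock and Rblock
    // from dimensions 3 and 4 and global-sums the whole tensor at the end.
    Eigen::Tensor<ComplexD,5> mat(Nlep, 1, nt, (int)wi.size(), (int)vj.size());
    double t_kernel = 0., t_gsum = 0.;

    A2A::LeptonField(mat, &wi[0], &vj[0],
                     lep02, lep03, lep12, lep13,
                     Tp, &t_kernel, &t_gsum);
    return mat;
}

Since ranks that do not own a given timeslice write zeros before the final GlobalSumVector, every rank ends up with the complete Nlep x Nt x Lblock x Rblock result.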