Hadrons: integration of Peter's A2Autils

2025-11-04 05:54:32 +00:00 · 2018-10-05 16:42:44 +01:00
parent d69a52079f
commit 866449c804
5 changed files with 20 additions and 428 deletions
--- a/Hadrons/Modules/MContraction/A2AKernels.hpp
+++ b/Hadrons/Modules/MContraction/A2AKernels.hpp
@@ -1,411 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid 
-
-Source file: Hadrons/Modules/MContraction/A2AMesonFieldKernels.hpp
-
-Copyright (C) 2015-2018
-
-Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef Hadrons_MContraction_A2AMesonFieldKernels_hpp_
-#define Hadrons_MContraction_A2AMesonFieldKernels_hpp_
-
-#include <Hadrons/Global.hpp>
-#include <Hadrons/Module.hpp>
-
-BEGIN_HADRONS_NAMESPACE
-
-BEGIN_MODULE_NAMESPACE(MContraction)
-
-////////////////////////////////////////////////////////////////////////////////
-// Cache blocked arithmetic routine
-// Could move to Grid ???
-////////////////////////////////////////////////////////////////////////////////
-template <typename Field, typename MesonField>
-void makeMesonFieldBlock(MesonField &mat, 
-                         const Field *lhs_wi,
-                         const Field *rhs_vj,
-                         const std::vector<Gamma::Algebra> &gamma,
-                         const std::vector<LatticeComplex> &mom,
-                         int orthogdim,
-                         double &time) 
-{
-    typedef typename Field::vector_object vobj;
-    typedef typename vobj::scalar_object  sobj;
-    typedef typename vobj::scalar_type    scalar_type;
-    typedef typename vobj::vector_type    vector_type;
-
-    typedef iSpinMatrix<vector_type> SpinMatrix_v;
-    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-    
-    TimerArray tArray;
-
-    int Lblock = mat.dimension(3); 
-    int Rblock = mat.dimension(4);
-
-    GridBase *grid = lhs_wi[0]._grid;
-    
-    const int    Nd = grid->_ndimension;
-    const int Nsimd = grid->Nsimd();
-
-    int Nt     = grid->GlobalDimensions()[orthogdim];
-    int Ngamma = gamma.size();
-    int Nmom   = mom.size();
-
-    int fd=grid->_fdimensions[orthogdim];
-    int ld=grid->_ldimensions[orthogdim];
-    int rd=grid->_rdimensions[orthogdim];
-
-    // will locally sum vectors first
-    // sum across these down to scalars
-    // splitting the SIMD
-    int MFrvol = rd*Lblock*Rblock*Nmom;
-    int MFlvol = ld*Lblock*Rblock*Nmom;
-
-    Vector<SpinMatrix_v > lvSum(MFrvol);
-    parallel_for (int r = 0; r < MFrvol; r++)
-    {
-        lvSum[r] = zero;
-    }
-
-    Vector<SpinMatrix_s > lsSum(MFlvol);             
-    parallel_for (int r = 0; r < MFlvol; r++)
-    {
-        lsSum[r]=scalar_type(0.0);
-    }
-
-    int e1=    grid->_slice_nblock[orthogdim];
-    int e2=    grid->_slice_block [orthogdim];
-    int stride=grid->_slice_stride[orthogdim];
-
-    tArray.startTimer("contraction: colour trace & mom.");
-    // Nested parallelism would be ok
-    // Wasting cores here. Test case r
-    parallel_for(int r=0;r<rd;r++)
-    {
-        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-        for(int n=0;n<e1;n++)
-        for(int b=0;b<e2;b++)
-        {
-            int ss= so+n*stride+b;
-
-            for(int i=0;i<Lblock;i++)
-            {
-                auto left = conjugate(lhs_wi[i]._odata[ss]);
-
-                for(int j=0;j<Rblock;j++)
-                {
-                    SpinMatrix_v vv;
-                    auto right = rhs_vj[j]._odata[ss];
-
-                    for(int s1=0;s1<Ns;s1++)
-                    for(int s2=0;s2<Ns;s2++)
-                    {
-                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
-                                        + left()(s2)(1) * right()(s1)(1)
-                                        + left()(s2)(2) * right()(s1)(2);
-                    }
-                    
-                    // After getting the sitewise product do the mom phase loop
-                    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
-
-                    for ( int m=0;m<Nmom;m++)
-                    {
-                        int idx = m+base;
-                        auto phase = mom[m]._odata[ss];
-                        mac(&lvSum[idx],&vv,&phase);
-                    }
-                }
-            }
-        }
-    }
-    tArray.stopTimer("contraction: colour trace & mom.");
-
-    // Sum across simd lanes in the plane, breaking out orthog dir.
-    tArray.startTimer("contraction: local space sum");
-    parallel_for(int rt=0;rt<rd;rt++)
-    {
-        std::vector<int> icoor(Nd);
-        std::vector<SpinMatrix_s> extracted(Nsimd);               
-
-        for(int i=0;i<Lblock;i++)
-        for(int j=0;j<Rblock;j++)
-        for(int m=0;m<Nmom;m++)
-        {
-
-            int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
-
-            extract(lvSum[ij_rdx],extracted);
-            for(int idx=0;idx<Nsimd;idx++)
-            {
-                grid->iCoorFromIindex(icoor,idx);
-
-                int ldx    = rt+icoor[orthogdim]*rd;
-                int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
-
-                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-            }
-        }
-    }
-    tArray.stopTimer("contraction: local space sum");
-    time = tArray.getDTimer("contraction: colour trace & mom.")
-           + tArray.getDTimer("contraction: local space sum");
-
-    // ld loop and local only??
-    tArray.startTimer("contraction: spin trace");
-    int pd = grid->_processors[orthogdim];
-    int pc = grid->_processor_coor[orthogdim];
-    parallel_for_nest2(int lt=0;lt<ld;lt++)
-    {
-        for(int pt=0;pt<pd;pt++)
-        {
-            int t = lt + pt*ld;
-            if (pt == pc)
-            {
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int m=0;m<Nmom;m++)
-                {
-                    int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
-
-                    for(int mu=0;mu<Ngamma;mu++)
-                    {
-                        // this is a bit slow
-                        mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gamma[mu]));
-                    }
-                }
-            } 
-            else 
-            { 
-                const scalar_type zz(0.0);
-
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int mu=0;mu<Ngamma;mu++)
-                for(int m=0;m<Nmom;m++)
-                {
-                    mat(m,mu,t,i,j) =zz;
-                }
-            }
-        }
-    }
-    tArray.stopTimer("contraction: spin trace");
-    ////////////////////////////////////////////////////////////////////
-    // This global sum is taking as much as 50% of time on 16 nodes
-    // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
-    // Healthy size that should suffice
-    ////////////////////////////////////////////////////////////////////
-    tArray.startTimer("contraction: global sum");
-    grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-    tArray.stopTimer("contraction: global sum");
-}
-
-template <typename Field, typename AslashField>
-void makeAslashFieldBlock(AslashField &mat, 
-                          const Field *lhs_wi,
-                          const Field *rhs_vj,
-                          const std::vector<LatticeComplex> &emB0,
-                          const std::vector<LatticeComplex> &emB1,
-                          int orthogdim,
-                          ModuleBase *caller = nullptr) 
-{
-    typedef typename Field::vector_object vobj;
-    typedef typename vobj::scalar_object  sobj;
-    typedef typename vobj::scalar_type    scalar_type;
-    typedef typename vobj::vector_type    vector_type;
-
-    typedef iSpinMatrix<vector_type> SpinMatrix_v;
-    typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-    
-    int Lblock = mat.dimension(2); 
-    int Rblock = mat.dimension(3);
-
-    GridBase *grid = lhs_wi[0]._grid;
-    
-    const int    Nd = grid->_ndimension;
-    const int Nsimd = grid->Nsimd();
-
-    int Nt  = grid->GlobalDimensions()[orthogdim];
-    int Nem = emB0.size();
-    assert(emB1.size() == Nem);
-
-    int fd=grid->_fdimensions[orthogdim];
-    int ld=grid->_ldimensions[orthogdim];
-    int rd=grid->_rdimensions[orthogdim];
-
-    // will locally sum vectors first
-    // sum across these down to scalars
-    // splitting the SIMD
-    int MFrvol = rd*Lblock*Rblock*Nem;
-    int MFlvol = ld*Lblock*Rblock*Nem;
-
-    Vector<vector_type> lvSum(MFrvol);
-    parallel_for (int r = 0; r < MFrvol; r++)
-    {
-        lvSum[r] = zero;
-    }
-
-    Vector<scalar_type> lsSum(MFlvol);             
-    parallel_for (int r = 0; r < MFlvol; r++)
-    {
-        lsSum[r] = scalar_type(0.0);
-    }
-
-    int e1=    grid->_slice_nblock[orthogdim];
-    int e2=    grid->_slice_block [orthogdim];
-    int stride=grid->_slice_stride[orthogdim];
-
-    if (caller) caller->startTimer("contraction: colour trace & Aslash mul");
-    // Nested parallelism would be ok
-    // Wasting cores here. Test case r
-    parallel_for(int r=0;r<rd;r++)
-    {
-        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-        for(int n=0;n<e1;n++)
-        for(int b=0;b<e2;b++)
-        {
-            int ss= so+n*stride+b;
-
-            for(int i=0;i<Lblock;i++)
-            {
-                auto left = conjugate(lhs_wi[i]._odata[ss]);
-
-                for(int j=0;j<Rblock;j++)
-                {
-                    SpinMatrix_v vv;
-                    auto right = rhs_vj[j]._odata[ss];
-
-                    for(int s1=0;s1<Ns;s1++)
-                    for(int s2=0;s2<Ns;s2++)
-                    {
-                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
-                                        + left()(s2)(1) * right()(s1)(1)
-                                        + left()(s2)(2) * right()(s1)(2);
-                    }
-                    
-                    // After getting the sitewise product do the mom phase loop
-                    int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r;
-
-                    for ( int m=0;m<Nem;m++)
-                    {
-                        int idx  = m+base;
-                        auto b0  = emB0[m]._odata[ss];
-                        auto b1  = emB1[m]._odata[ss];
-                        auto cb0 = conjugate(b0);
-                        auto cb1 = conjugate(b1);
-
-                        // B_0 = A_1 + i A_0
-                        // B_1 = A_3 + i A_2
-                        // 
-                        // then in spin space
-                        // 
-                        //             ( 0          0         B_1 -conj(B_0) )
-                        // A_mu g_mu = ( 0          0         B_0  conj(B_1) )
-                        //             ( conj(B_1)  conj(B_0) 0    0         )
-                        //             ( -B_0       B_1       0    0         )
-
-                        lvSum[idx] +=   vv()(0,2)()*b1  - vv()(0,3)()*cb0
-                                        + vv()(1,2)()*b0  + vv()(1,3)()*cb1
-                                        + vv()(2,0)()*cb1 + vv()(2,1)()*cb0 
-                                        - vv()(3,0)()*b0  + vv()(3,1)()*b1;
-                    }
-                }
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: colour trace & Aslash mul");
-
-    // Sum across simd lanes in the plane, breaking out orthog dir.
-    if (caller) caller->startTimer("contraction: local space sum");
-    parallel_for(int rt=0;rt<rd;rt++)
-    {
-        std::vector<int> icoor(Nd);
-        std::vector<scalar_type> extracted(Nsimd);               
-
-        for(int i=0;i<Lblock;i++)
-        for(int j=0;j<Rblock;j++)
-        for(int m=0;m<Nem;m++)
-        {
-
-            int ij_rdx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*rt;
-
-            extract(lvSum[ij_rdx],extracted);
-            for(int idx=0;idx<Nsimd;idx++)
-            {
-                grid->iCoorFromIindex(icoor,idx);
-
-                int ldx    = rt+icoor[orthogdim]*rd;
-                int ij_ldx = m+Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*ldx;
-
-                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: local space sum");
-
-    // ld loop and local only??
-    if (caller) caller->startTimer("contraction: tensor store");
-    int pd = grid->_processors[orthogdim];
-    int pc = grid->_processor_coor[orthogdim];
-    parallel_for_nest2(int lt=0;lt<ld;lt++)
-    {
-        for(int pt=0;pt<pd;pt++)
-        {
-            int t = lt + pt*ld;
-            if (pt == pc)
-            {
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int m=0;m<Nem;m++)
-                {
-                    int ij_dx = m+Nem*i + Nem*Lblock * j + Nem*Lblock * Rblock * lt;
-
-                    mat(m,t,i,j) = lsSum[ij_dx];
-                }
-            } 
-            else 
-            { 
-                const scalar_type zz(0.0);
-
-                for(int i=0;i<Lblock;i++)
-                for(int j=0;j<Rblock;j++)
-                for(int m=0;m<Nem;m++)
-                {
-                    mat(m,t,i,j) = zz;
-                }
-            }
-        }
-    }
-    if (caller) caller->stopTimer("contraction: tensor store");
-
-    if (caller) caller->startTimer("contraction: global sum");
-    grid->GlobalSumVector(&mat(0,0,0,0),Nem*Nt*Lblock*Rblock);
-    if (caller) caller->stopTimer("contraction: global sum");
-}
-
-END_MODULE_NAMESPACE
-
-END_HADRONS_NAMESPACE
-
-#endif //Hadrons_MContraction_A2AMesonField_hpp_
--- a/Hadrons/Modules/MContraction/A2AMesonField.hpp
+++ b/Hadrons/Modules/MContraction/A2AMesonField.hpp
@@ -36,7 +36,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/A2AVectors.hpp>
 #include <Hadrons/A2AMatrix.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
-#include <Hadrons/Modules/MContraction/A2AKernels.hpp>

 #define MF_PARALLEL_IO
 #ifndef MF_IO_TYPE
@@ -71,9 +70,11 @@ public:
                                    Gamma::Algebra, gamma);
 };

-template <typename T, typename Field>
-class MesonFieldKernel: public A2AKernel<T, Field>
+template <typename T, typename FImpl>
+class MesonFieldKernel: public A2AKernel<T, typename FImpl::FermionField>
 {
+public:
+    typedef typename FImpl::FermionField FermionField;
 public:
    MesonFieldKernel(const std::vector<Gamma::Algebra> &gamma,
                     const std::vector<LatticeComplex> &mom,
@@ -88,10 +89,11 @@ public:
    }

    virtual ~MesonFieldKernel(void) = default;
-    virtual void operator()(A2AMatrixSet<T> &m, const Field *left, const Field *right,
-                          const unsigned int orthogDim, double &time)
+    virtual void operator()(A2AMatrixSet<T> &m, const FermionField *left, 
+                            const FermionField *right,
+                            const unsigned int orthogDim, double &t)
    {
-        makeMesonFieldBlock(m, left, right, gamma_, mom_, orthogDim, time);
+        A2Autils<FImpl>::MesonField(m, left, right, gamma_, mom_, orthogDim, &t);
    }

    virtual double flops(const unsigned int blockSizei, const unsigned int blockSizej)
@@ -121,7 +123,7 @@ public:
                                      FermionField, 
                                      A2AMesonFieldMetadata, 
                                      MF_IO_TYPE> Computation;
-    typedef MesonFieldKernel<Complex, FermionField> Kernel;
+    typedef MesonFieldKernel<Complex, FImpl> Kernel;
    struct IoHelper
    {
        A2AMatrixIo<MF_IO_TYPE> io;