diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index 89b4d4bd..84c5fe60 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -101,6 +101,186 @@ public: #endif }; +/* +template +template +void A2Autils::BaryonField(TensorType &mat, + const FermionField *one, + const FermionField *two, + const FermionField *three, + std::vector gammaA, + std::vector gammaB, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + + int oneBlock = mat.dimension(3); + int twoBlock = mat.dimension(4); + int threeBlock = mat.dimension(5); + + GridBase *grid = one[0]._grid; + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammaA.size(); + assert (Ngamma = gammaB.size()); // Only combinatin of two gammas gives correct operator! + int Nmom = mom.size(); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*oneBlock*twoBlock*threeBlock*Nmom; + int MFlvol = ld*oneBlock*twoBlock*threeBlock*Nmom; + + Vector lvSum(MFrvol); + parallel_for (int r = 0; r < MFrvol; r++){ + lvSum[r] = zero; + } + + Vector lsSum(MFlvol); + parallel_for (int r = 0; r < MFlvol; r++){ + lsSum[r]=scalar_type(0.0); + } + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + // potentially wasting cores here if local time extent too small + if (t_kernel) *t_kernel = -usecond(); + parallel_for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + // first, the diquark two*gammaB*three + for(int n=0;n icoor(Nd); + std::vector extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]; + + } + }}} + } + if (t_kernel) *t_kernel += usecond(); + assert(mat.dimension(0) == Nmom); + assert(mat.dimension(1) == Ngamma); + assert(mat.dimension(2) == Nt); + + // ld loop and local only?? + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + parallel_for_nest2(int lt=0;ltGlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock); + if (t_gsum) *t_gsum += usecond(); +} +*/ + template template void A2Autils::MesonField(TensorType &mat,