diff --git a/extras/Hadrons/AllToAllReduction.hpp b/extras/Hadrons/AllToAllReduction.hpp
new file mode 100644
index 00000000..7b2d2c1a
--- /dev/null
+++ b/extras/Hadrons/AllToAllReduction.hpp
@@ -0,0 +1,146 @@
+#ifndef A2A_Reduction_hpp_
+#define A2A_Reduction_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Environment.hpp>
+#include <Grid/Hadrons/Solver.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+////////////////////////////////////////////
+// A2A Meson Field Inner Product
+////////////////////////////////////////////
+
+template <class FermionField>
+void sliceInnerProductMesonField(std::vector<std::vector<ComplexD>> &mat,
+                                 const std::vector<Lattice<FermionField>> &lhs,
+                                 const std::vector<Lattice<FermionField>> &rhs,
+                                 int orthogdim)
+{
+    typedef typename FermionField::scalar_type scalar_type;
+    typedef typename FermionField::vector_type vector_type;
+
+    int Lblock = lhs.size();
+    int Rblock = rhs.size();
+
+    GridBase *grid = lhs[0]._grid;
+
+    const int Nd = grid->_ndimension;
+    const int Nsimd = grid->Nsimd();
+    int Nt = grid->GlobalDimensions()[orthogdim];
+
+    assert(mat.size() == Lblock * Rblock);
+    for (int t = 0; t < mat.size(); t++)
+    {
+        assert(mat[t].size() == Nt);
+    }
+
+    int fd = grid->_fdimensions[orthogdim];
+    int ld = grid->_ldimensions[orthogdim];
+    int rd = grid->_rdimensions[orthogdim];
+
+    // will locally sum vectors first
+    // sum across these down to scalars
+    // splitting the SIMD
+    std::vector<vector_type, alignedAllocator<vector_type>> lvSum(rd * Lblock * Rblock);
+    for (int r = 0; r < rd * Lblock * Rblock; r++)
+    {
+        lvSum[r] = zero;
+    }
+    std::vector<scalar_type> lsSum(ld * Lblock * Rblock, scalar_type(0.0));
+
+    int e1 = grid->_slice_nblock[orthogdim];
+    int e2 = grid->_slice_block[orthogdim];
+    int stride = grid->_slice_stride[orthogdim];
+
+    // std::cout << GridLogMessage << " Entering first parallel loop " << std::endl;
+    // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
+    parallel_for(int r = 0; r < rd; r++)
+    {
+        int so = r * grid->_ostride[orthogdim]; // base offset for start of plane
+        for (int n = 0; n < e1; n++)
+        {
+            for (int b = 0; b < e2; b++)
+            {
+                int ss = so + n * stride + b;
+                for (int i = 0; i < Lblock; i++)
+                {
+                    auto left = conjugate(lhs[i]._odata[ss]);
+                    for (int j = 0; j < Rblock; j++)
+                    {
+                        int idx = i + Lblock * j + Lblock * Rblock * r;
+                        auto right = rhs[j]._odata[ss];
+                        vector_type vv = left()(0)(0) * right()(0)(0)
+                                       + left()(0)(1) * right()(0)(1)
+                                       + left()(0)(2) * right()(0)(2)
+                                       + left()(1)(0) * right()(1)(0)
+                                       + left()(1)(1) * right()(1)(1)
+                                       + left()(1)(2) * right()(1)(2)
+                                       + left()(2)(0) * right()(2)(0)
+                                       + left()(2)(1) * right()(2)(1)
+                                       + left()(2)(2) * right()(2)(2)
+                                       + left()(3)(0) * right()(3)(0)
+                                       + left()(3)(1) * right()(3)(1)
+                                       + left()(3)(2) * right()(3)(2);
+
+                        lvSum[idx] = lvSum[idx] + vv;
+                    }
+                }
+            }
+        }
+    }
+
+    // std::cout << GridLogMessage << " Entering second parallel loop " << std::endl;
+    // Sum across simd lanes in the plane, breaking out orthog dir.
+    parallel_for(int rt = 0; rt < rd; rt++)
+    {
+        std::vector<int> icoor(Nd);
+        for (int i = 0; i < Lblock; i++)
+        {
+            for (int j = 0; j < Rblock; j++)
+            {
+                iScalar<vector_type> temp;
+                std::vector<iScalar<scalar_type>> extracted(Nsimd);
+                temp._internal = lvSum[i + Lblock * j + Lblock * Rblock * rt];
+                extract(temp, extracted);
+                for (int idx = 0; idx < Nsimd; idx++)
+                {
+                    grid->iCoorFromIindex(icoor, idx);
+                    int ldx = rt + icoor[orthogdim] * rd;
+                    int ij_dx = i + Lblock * j + Lblock * Rblock * ldx;
+                    lsSum[ij_dx] = lsSum[ij_dx] + extracted[idx]._internal;
+                }
+            }
+        }
+    }
+
+    // std::cout << GridLogMessage << " Entering non parallel loop " << std::endl;
+    for (int t = 0; t < fd; t++)
+    {
+        int pt = t / ld; // processor plane
+        int lt = t % ld;
+        for (int i = 0; i < Lblock; i++)
+        {
+            for (int j = 0; j < Rblock; j++)
+            {
+                if (pt == grid->_processor_coor[orthogdim])
+                {
+                    int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
+                    mat[i + j * Lblock][t] = lsSum[ij_dx];
+                }
+                else
+                {
+                    mat[i + j * Lblock][t] = scalar_type(0.0);
+                }
+            }
+        }
+    }
+    // std::cout << GridLogMessage << " Done " << std::endl;
+    // defer sum over nodes.
+    return;
+}
+
+END_HADRONS_NAMESPACE
+
+#endif // A2A_Reduction_hpp_
\ No newline at end of file
diff --git a/extras/Hadrons/AllToAllVectors.hpp b/extras/Hadrons/AllToAllVectors.hpp
index a514beac..24f5397e 100644
--- a/extras/Hadrons/AllToAllVectors.hpp
+++ b/extras/Hadrons/AllToAllVectors.hpp
@@ -182,6 +182,7 @@ class A2AModesSchurDiagTwo
         assert(sol_o.checkerboard == Odd);
 
         action.DminusDag(tmp_wout, wout_5d);
+        action.ExportPhysicalFermionSolution(wout_5d, wout_4d);
     }
 
diff --git a/extras/Hadrons/Makefile.am b/extras/Hadrons/Makefile.am
index 7240be82..abfcf9a8 100644
--- a/extras/Hadrons/Makefile.am
+++ b/extras/Hadrons/Makefile.am
@@ -15,6 +15,7 @@ libHadrons_adir = $(pkgincludedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
     $(modules_hpp) \
     AllToAllVectors.hpp \
+    AllToAllReduction.hpp \
     Application.hpp \
     EigenPack.hpp \
     Environment.hpp \
diff --git a/extras/Hadrons/Modules/MContraction/A2AMeson.hpp b/extras/Hadrons/Modules/MContraction/A2AMeson.hpp
index fa1cb82e..45fb4db9 100644
--- a/extras/Hadrons/Modules/MContraction/A2AMeson.hpp
+++ b/extras/Hadrons/Modules/MContraction/A2AMeson.hpp
@@ -174,8 +174,8 @@ void TA2AMeson<FImpl>::execute(void)
     {
         for (unsigned int j = 0; j < N; j++)
         {
-            sliceInnerProductVector(MF_x, w1[i], v1[j], Tp);
-            sliceInnerProductVector(MF_y, w1[j], v1[i], Tp);
+            mySliceInnerProductVector(MF_x, w1[i], v1[j], Tp);
+            mySliceInnerProductVector(MF_y, w1[j], v1[i], Tp);
             for (unsigned int t = 0; t < nt; ++t)
             {
                 for (unsigned int tx = 0; tx < nt; tx++)
diff --git a/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp b/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp
index 0b81f607..eefd5e64 100644
--- a/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp
+++ b/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp
@@ -5,6 +5,9 @@
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
 #include <Grid/Hadrons/AllToAllVectors.hpp>
+#include <Grid/Hadrons/AllToAllReduction.hpp>
+#include <Grid/Grid_Eigen_Dense.h>
+#include <fstream>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -19,6 +22,7 @@ class MesonFieldPar : Serializable
     GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFieldPar,
                                     int, Nl,
                                     int, N,
+                                    int, Nblock,
                                     std::string, A2A1,
                                     std::string, A2A2,
                                     std::string, gammas,
@@ -40,6 +44,7 @@ class TMesonFieldGamma : public Module<MesonFieldPar>
         GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                         Gamma::Algebra, gamma,
                                         std::vector<std::vector<std::vector<ComplexD>>>, MesonField,
+                                        std::vector<std::vector<ComplexD>>, MesonFiield,
                                         ComplexD, last);
     };
 
@@ -52,6 +57,9 @@ class TMesonFieldGamma : public Module<MesonFieldPar>
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    virtual void parseGammaString(std::vector<Gamma::Algebra> &gammaList);
+    virtual void vectorOfWs(std::vector<FermionField> &w, int i, int Nblock, FermionField &tmpw_5d, std::vector<FermionField> &vec_w);
+    virtual void vectorOfVs(std::vector<FermionField> &v, int j, int Nblock, FermionField &tmpv_5d, std::vector<FermionField> &vec_v);
+    virtual void gammaMult(std::vector<FermionField> &v, Gamma gamma);
    // setup
    virtual void setup(void);
    // execution
@@ -107,19 +115,54 @@ void TMesonFieldGamma<FImpl>::parseGammaString(std::vector<Gamma::Algebra> &gamm
     }
 }
 
+template <typename FImpl>
+void TMesonFieldGamma<FImpl>::vectorOfWs(std::vector<FermionField> &w, int i, int Nblock, FermionField &tmpw_5d, std::vector<FermionField> &vec_w)
+{
+    for (unsigned int ni = 0; ni < Nblock; ni++)
+    {
+        vec_w[ni] = w[i + ni];
+    }
+}
+
+template <typename FImpl>
+void TMesonFieldGamma<FImpl>::vectorOfVs(std::vector<FermionField> &v, int j, int Nblock, FermionField &tmpv_5d, std::vector<FermionField> &vec_v)
+{
+    for (unsigned int nj = 0; nj < Nblock; nj++)
+    {
+        vec_v[nj] = v[j + nj];
+    }
+}
+
+template <typename FImpl>
+void TMesonFieldGamma<FImpl>::gammaMult(std::vector<FermionField> &v, Gamma gamma)
+{
+    int Nblock = v.size();
+    for (unsigned int nj = 0; nj < Nblock; nj++)
+    {
+        v[nj] = gamma * v[nj];
+    }
+}
+
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TMesonFieldGamma<FImpl>::setup(void)
 {
     int nt = env().getDim(Tp);
     int N = par().N;
+    int Nblock = par().Nblock;
 
     int Ls_ = env().getObjectLs(par().A2A1 + "_class");
 
-    envTmpLat(FermionField, "w", Ls_);
-    envTmpLat(FermionField, "v", Ls_);
     envTmpLat(FermionField, "tmpv_5d", Ls_);
     envTmpLat(FermionField, "tmpw_5d", Ls_);
+
+    envTmp(std::vector<FermionField>, "w", 1, N, FermionField(env().getGrid(1)));
+    envTmp(std::vector<FermionField>, "v", 1, N, FermionField(env().getGrid(1)));
+
+    envTmp(Eigen::MatrixXcd, "MF", 1, Eigen::MatrixXcd::Zero(nt, N * N));
+
+    envTmp(std::vector<FermionField>, "w_block", 1, Nblock, FermionField(env().getGrid(1)));
+    envTmp(std::vector<FermionField>, "v_block", 1, Nblock, FermionField(env().getGrid(1)));
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -130,6 +173,7 @@ void TMesonFieldGamma<FImpl>::execute(void)
 
     int N = par().N;
     int nt = env().getDim(Tp);
+    int Nblock = par().Nblock;
 
     std::vector<Result> result;
     std::vector<Gamma::Algebra> gammaResultList;
@@ -145,33 +189,54 @@
     {
         result[i].gamma = gammaResultList[i];
         result[i].MesonField.resize(N, std::vector<std::vector<ComplexD>>(N, std::vector<ComplexD>(nt)));
+        result[i].MesonFiield.resize(N, std::vector<ComplexD>(nt));
 
         Gamma gamma(gammaResultList[i]);
         gammaList[i] = gamma;
     }
 
-    std::vector<ComplexD> MesonField_ij;
-    MesonField_ij.resize(nt);
-
     auto &a2a1 = envGet(A2ABase, par().A2A1 + "_class");
     auto &a2a2 = envGet(A2ABase, par().A2A2 + "_class");
 
-    envGetTmp(FermionField, w);
-    envGetTmp(FermionField, v);
     envGetTmp(FermionField, tmpv_5d);
     envGetTmp(FermionField, tmpw_5d);
 
-    for (unsigned int i = 0; i < N; i++)
+    envGetTmp(std::vector<FermionField>, v);
+    envGetTmp(std::vector<FermionField>, w);
+    LOG(Message) << "Finding v and w vectors for N = " << N << std::endl;
+    for (int i = 0; i < N; i++)
     {
-        a2a1.return_w(i, tmpw_5d, w);
-        for (unsigned int j = 0; j < N; j++)
+        a2a2.return_v(i, tmpv_5d, v[i]);
+        a2a1.return_w(i, tmpw_5d, w[i]);
+    }
+    LOG(Message) << "Found v and w vectors for N = " << N << std::endl;
+
+    std::vector<std::vector<ComplexD>> MesonField_ij;
+    LOG(Message) << "Before blocked MFs, Nblock = " << Nblock << std::endl;
+    envGetTmp(std::vector<FermionField>, v_block);
+    envGetTmp(std::vector<FermionField>, w_block);
+    MesonField_ij.resize(Nblock * Nblock, std::vector<ComplexD>(nt));
+
+    envGetTmp(Eigen::MatrixXcd, MF);
+
+    LOG(Message) << "Before blocked MFs, Nblock = " << Nblock << std::endl;
+    for (unsigned int i = 0; i < N; i += Nblock)
+    {
+        vectorOfWs(w, i, Nblock, tmpw_5d, w_block);
+        for (unsigned int j = 0; j < N; j += Nblock)
         {
-            a2a2.return_v(j, tmpv_5d, v);
+            vectorOfVs(v, j, Nblock, tmpv_5d, v_block);
             for (unsigned int k = 0; k < result.size(); k++)
             {
-                v = gammaList[k]*v;
-                sliceInnerProductVector(MesonField_ij, w, v, Tp);
-                result[k].MesonField[i][j] = MesonField_ij;
+                gammaMult(v_block, gammaList[k]);
+                sliceInnerProductMesonField(MesonField_ij, w_block, v_block, Tp);
+                for (unsigned int nj = 0; nj < Nblock; nj++)
+                {
+                    for (unsigned int ni = 0; ni < Nblock; ni++)
+                    {
+                        MF.col((i + ni) + (j + nj) * N) = Eigen::VectorXcd::Map(&MesonField_ij[nj * Nblock + ni][0], MesonField_ij[nj * Nblock + ni].size());
+                    }
+                }
             }
         }
         if (i % 10 == 0)
@@ -179,7 +244,22 @@
         {
             LOG(Message) << "MF for i = " << i << " of " << N << std::endl;
         }
     }
-    result[0].last = MesonField_ij[7];
+    LOG(Message) << "Before Global sum, Nblock = " << Nblock << std::endl;
+    v_block[0]._grid->GlobalSumVector(MF.data(), MF.size());
+    LOG(Message) << "After Global sum, Nblock = " << Nblock << std::endl;
+    for (unsigned int i = 0; i < N; i++)
+    {
+        for (unsigned int j = 0; j < N; j++)
+        {
+            for (unsigned int k = 0; k < result.size(); k++)
+            {
+                for (unsigned int t = 0; t < nt; t++)
+                {
+                    result[k].MesonField[i][j][t] = MF.col(i + N * j)[t];
+                }
+            }
+        }
+    }
 
     saveResult(par().output, "meson", result);
 }
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 12bfebed..b8d13b47 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -41,7 +41,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
-  std::cout << GridLogMessage << "Start alloc innerProduct" << std::endl;
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
   GridBase *grid = left._grid;
@@ -50,8 +49,6 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   ComplexD inner;
   Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
 
-  std::cout << GridLogMessage << "End alloc innerProduct" << std::endl;
-  std::cout << GridLogMessage << "Start parallel for innerProduct" << std::endl;
   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
     int nwork, mywork, myoff;
     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
@@ -65,18 +62,12 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
     ComplexD tmp = Reduce(TensorRemove(vinner)) ;
     vstream(sumarray[thr*pad],tmp);
   }
-  std::cout << GridLogMessage << "End parallel for innerProduct" << std::endl;
-
-  std::cout << GridLogMessage << "Start inner sum innerProduct" << std::endl;
   inner=0.0;
   for(int i=0;i<grid->SumArraySize();i++){
     inner = inner+sumarray[i*pad];
   }
   right._grid->GlobalSum(inner);
   return inner;
-  std::cout << GridLogMessage << "End inner sum innerProduct" << std::endl;
-
-  std::cout << GridLogMessage << "End innerProduct" << std::endl;
 }
 
 /////////////////////////
@@ -285,7 +276,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 template <class vobj>
 static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
 {
-  std::cout << GridLogMessage << "Start mySsliceInnerProductVector" << std::endl;
+  std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
   typedef typename vobj::scalar_type scalar_type;
   std::vector<scalar_type> lsSum;
@@ -313,12 +304,12 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
   int rd=grid->_rdimensions[orthogdim];
-  std::cout << GridLogMessage << "Start alloc" << std::endl;
+  // std::cout << GridLogMessage << "Start alloc" << std::endl;
GridLogMessage << "Start alloc" << std::endl; std::vector > lvSum(rd); // will locally sum vectors first lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars std::vector> extracted(Nsimd); // splitting the SIMD - std::cout << GridLogMessage << "End alloc" << std::endl; + // std::cout << GridLogMessage << "End alloc" << std::endl; result.resize(fd); // And then global sum to return the same vector to every node for IO to file for(int r=0;r &result, const La int e1= grid->_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - std::cout << GridLogMessage << "End prep" << std::endl; - std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl; + // std::cout << GridLogMessage << "End prep" << std::endl; + // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl; vector_type vv; parallel_for(int r=0;r &result, const La for(int n=0;n icoor(Nd); @@ -364,7 +355,7 @@ static void localSliceInnerProductVector(std::vector &result, const La } } - std::cout << GridLogMessage << "End sum over simd lanes" << std::endl; + // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl; } template static void globalSliceInnerProductVector(std::vector &result, const Lattice &lhs, std::vector &lsSum, int orthogdim) @@ -376,7 +367,7 @@ static void globalSliceInnerProductVector(std::vector &result, const L // sum over nodes. std::vector gsum; gsum.resize(fd, scalar_type(0.0)); - std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl; + // std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl; for(int t=0;t &result, const L gsum[t]=lsSum[lt]; } } - std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl; - std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl; + // std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl; + // std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl; grid->GlobalSumVector(&gsum[0], fd); - std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl; + // std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl; result = gsum; } diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index b7a8417d..46185652 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -106,7 +106,6 @@ inline vRealD innerProductD(const vRealF &l,const vRealF &r){ typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; iScalar ret; ret=zero; - // std::cout << GridLogMessage << "innerProduct iVector" << std::endl; for(int c1=0;c1 ret; - // std::cout << GridLogMessage << "innerProduct iScalar" << std::endl; - ret._internal = innerProduct(lhs._internal,rhs._internal); return ret; } - template inline - auto myInnerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - std::cout << GridLogMessage << "myInnerProduct iVector, N = " << N << std::endl; - for(int c1=0;c1 inline - auto myInnerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - std::cout << GridLogMessage << "myInnerProduct iScalar" << std::endl; - - ret._internal = myInnerProduct(lhs._internal,rhs._internal); - return ret; - } } #endif