
Moved the meson field inner product to its own header file

fionnoh 2018-07-19 15:56:52 +01:00
parent 9deae8c962
commit 2c54a536f3
7 changed files with 257 additions and 64 deletions

View File

@@ -0,0 +1,146 @@
#ifndef A2A_Reduction_hpp_
#define A2A_Reduction_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Environment.hpp>
#include <Grid/Hadrons/Solver.hpp>
BEGIN_HADRONS_NAMESPACE
////////////////////////////////////////////
// A2A Meson Field Inner Product
////////////////////////////////////////////
template <class FermionField>
void sliceInnerProductMesonField(std::vector<std::vector<ComplexD>> &mat,
const std::vector<Lattice<FermionField>> &lhs,
const std::vector<Lattice<FermionField>> &rhs,
int orthogdim)
{
typedef typename FermionField::scalar_type scalar_type;
typedef typename FermionField::vector_type vector_type;
int Lblock = lhs.size();
int Rblock = rhs.size();
GridBase *grid = lhs[0]._grid;
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
int Nt = grid->GlobalDimensions()[orthogdim];
assert(mat.size() == Lblock * Rblock);
for (int t = 0; t < mat.size(); t++)
{
assert(mat[t].size() == Nt);
}
int fd = grid->_fdimensions[orthogdim];
int ld = grid->_ldimensions[orthogdim];
int rd = grid->_rdimensions[orthogdim];
// will locally sum vectors first
// sum across these down to scalars
// splitting the SIMD
std::vector<vector_type, alignedAllocator<vector_type>> lvSum(rd * Lblock * Rblock);
for(int r=0;r<rd * Lblock * Rblock;r++)
{
lvSum[r]=zero;
}
std::vector<scalar_type> lsSum(ld * Lblock * Rblock, scalar_type(0.0));
int e1 = grid->_slice_nblock[orthogdim];
int e2 = grid->_slice_block[orthogdim];
int stride = grid->_slice_stride[orthogdim];
// std::cout << GridLogMessage << " Entering first parallel loop " << std::endl;
// Parallelising over the t-direction alone doesn't expose as much parallelism as needed for KNL
parallel_for(int r = 0; r < rd; r++)
{
int so = r * grid->_ostride[orthogdim]; // base offset for start of plane
for (int n = 0; n < e1; n++)
{
for (int b = 0; b < e2; b++)
{
int ss = so + n * stride + b;
for (int i = 0; i < Lblock; i++)
{
auto left = conjugate(lhs[i]._odata[ss]);
for (int j = 0; j < Rblock; j++)
{
int idx = i + Lblock * j + Lblock * Rblock * r;
auto right = rhs[j]._odata[ss];
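// spin-colour contraction per SIMD lane: sum over spin s=0..3 and colour c=0..2 of conj(lhs)^s_c * rhs^s_c (left already holds the conjugated lhs)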
vector_type vv = left()(0)(0) * right()(0)(0)
+ left()(0)(1) * right()(0)(1)
+ left()(0)(2) * right()(0)(2)
+ left()(1)(0) * right()(1)(0)
+ left()(1)(1) * right()(1)(1)
+ left()(1)(2) * right()(1)(2)
+ left()(2)(0) * right()(2)(0)
+ left()(2)(1) * right()(2)(1)
+ left()(2)(2) * right()(2)(2)
+ left()(3)(0) * right()(3)(0)
+ left()(3)(1) * right()(3)(1)
+ left()(3)(2) * right()(3)(2);
lvSum[idx] = lvSum[idx] + vv;
}
}
}
}
}
// std::cout << GridLogMessage << " Entering second parallel loop " << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
parallel_for(int rt = 0; rt < rd; rt++)
{
std::vector<int> icoor(Nd);
for (int i = 0; i < Lblock; i++)
{
for (int j = 0; j < Rblock; j++)
{
iScalar<vector_type> temp;
std::vector<iScalar<scalar_type>> extracted(Nsimd);
temp._internal = lvSum[i + Lblock * j + Lblock * Rblock * rt];
extract(temp, extracted);
for (int idx = 0; idx < Nsimd; idx++)
{
grid->iCoorFromIindex(icoor, idx);
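// recover this lane's local time slice: local coord = reduced coord rt + rd * lane coord along orthogdim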
int ldx = rt + icoor[orthogdim] * rd;
int ij_dx = i + Lblock * j + Lblock * Rblock * ldx;
lsSum[ij_dx] = lsSum[ij_dx] + extracted[idx]._internal;
}
}
}
}
// std::cout << GridLogMessage << " Entering non parallel loop " << std::endl;
for (int t = 0; t < fd; t++)
{
int pt = t/ld; // processor plane
int lt = t%ld;
for (int i = 0; i < Lblock; i++)
{
for (int j = 0; j < Rblock; j++)
{
if (pt == grid->_processor_coor[orthogdim])
{
int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
mat[i + j * Lblock][t] = lsSum[ij_dx];
}
else
{
mat[i + j * Lblock][t] = scalar_type(0.0);
}
}
}
}
// std::cout << GridLogMessage << " Done " << std::endl;
// defer sum over nodes.
return;
}
END_HADRONS_NAMESPACE
#endif // A2A_Reduction_hpp_
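For reference (not part of this commit): a minimal standalone C++ sketch of the result layout that sliceInnerProductMesonField assumes. The caller must size mat as Lblock * Rblock time-sliced vectors of length Nt, with the (i, j) pair flattened as i + j * Lblock. All sizes and values below are illustrative only.

#include <complex>
#include <iostream>
#include <vector>

using ComplexD = std::complex<double>;

int main(void)
{
    // Illustrative sizes: Lblock/Rblock stand in for the w/v block sizes,
    // Nt for the global extent of the orthogonal (time) direction.
    const int Lblock = 2, Rblock = 3, Nt = 4;

    // Layout expected by sliceInnerProductMesonField: one vector of length Nt
    // per (i, j) pair, flattened as i + j * Lblock.
    std::vector<std::vector<ComplexD>> mat(Lblock * Rblock,
                                           std::vector<ComplexD>(Nt));

    for (int j = 0; j < Rblock; j++)
        for (int i = 0; i < Lblock; i++)
            for (int t = 0; t < Nt; t++)
                mat[i + j * Lblock][t] = ComplexD(i, j); // placeholder values

    // Reading back the (i, j) correlator at time t.
    std::cout << mat[1 + 2 * Lblock][0] << std::endl; // prints (1,2)
    return 0;
}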

View File

@@ -182,6 +182,7 @@ class A2AModesSchurDiagTwo
assert(sol_o.checkerboard == Odd);
action.DminusDag(tmp_wout, wout_5d);
action.ExportPhysicalFermionSolution(wout_5d, wout_4d);
}

View File

@@ -15,6 +15,7 @@ libHadrons_adir = $(pkgincludedir)/Hadrons
nobase_libHadrons_a_HEADERS = \
$(modules_hpp) \
AllToAllVectors.hpp \
AllToAllReduction.hpp \
Application.hpp \
EigenPack.hpp \
Environment.hpp \

View File

@@ -174,8 +174,8 @@ void TA2AMeson<FImpl>::execute(void)
{
for (unsigned int j = 0; j < N; j++)
{
sliceInnerProductVector(MF_x, w1[i], v1[j], Tp);
sliceInnerProductVector(MF_y, w1[j], v1[i], Tp);
mySliceInnerProductVector(MF_x, w1[i], v1[j], Tp);
mySliceInnerProductVector(MF_y, w1[j], v1[i], Tp);
for (unsigned int t = 0; t < nt; ++t)
{
for (unsigned int tx = 0; tx < nt; tx++)

View File

@@ -5,6 +5,9 @@
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
#include <Grid/Hadrons/AllToAllVectors.hpp>
#include <Grid/Hadrons/AllToAllReduction.hpp>
#include <Grid/Grid_Eigen_Dense.h>
#include <fstream>
BEGIN_HADRONS_NAMESPACE
@@ -19,6 +22,7 @@ class MesonFieldPar : Serializable
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFieldPar,
int, Nl,
int, N,
int, Nblock,
std::string, A2A1,
std::string, A2A2,
std::string, gammas,
@@ -40,6 +44,7 @@ class TMesonFieldGamma : public Module<MesonFieldPar>
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
Gamma::Algebra, gamma,
std::vector<std::vector<std::vector<ComplexD>>>, MesonField,
std::vector<std::vector<ComplexD>>, MesonFiield,
ComplexD, last);
};
@@ -52,6 +57,9 @@ class TMesonFieldGamma : public Module<MesonFieldPar>
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
virtual void parseGammaString(std::vector<Gamma::Algebra> &gammaList);
virtual void vectorOfWs(std::vector<FermionField> &w, int i, int Nblock, FermionField &tmpw_5d, std::vector<FermionField> &vec_w);
virtual void vectorOfVs(std::vector<FermionField> &v, int j, int Nblock, FermionField &tmpv_5d, std::vector<FermionField> &vec_v);
virtual void gammaMult(std::vector<FermionField> &v, Gamma gamma);
// setup
virtual void setup(void);
// execution
@@ -107,19 +115,54 @@ void TMesonFieldGamma<FImpl>::parseGammaString(std::vector<Gamma::Algebra> &gamm
}
}
template <typename FImpl>
void TMesonFieldGamma<FImpl>::vectorOfWs(std::vector<FermionField> &w, int i, int Nblock, FermionField &tmpw_5d, std::vector<FermionField> &vec_w)
{
for (unsigned int ni = 0; ni < Nblock; ni++)
{
vec_w[ni] = w[i + ni];
}
}
template <typename FImpl>
void TMesonFieldGamma<FImpl>::vectorOfVs(std::vector<FermionField> &v, int j, int Nblock, FermionField &tmpv_5d, std::vector<FermionField> &vec_v)
{
for (unsigned int nj = 0; nj < Nblock; nj++)
{
vec_v[nj] = v[j+nj];
}
}
template <typename FImpl>
void TMesonFieldGamma<FImpl>::gammaMult(std::vector<FermionField> &v, Gamma gamma)
{
int Nblock = v.size();
for (unsigned int nj = 0; nj < Nblock; nj++)
{
v[nj] = gamma * v[nj];
}
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TMesonFieldGamma<FImpl>::setup(void)
{
int nt = env().getDim(Tp);
int N = par().N;
int Nblock = par().Nblock;
int Ls_ = env().getObjectLs(par().A2A1 + "_class");
envTmpLat(FermionField, "w", Ls_);
envTmpLat(FermionField, "v", Ls_);
envTmpLat(FermionField, "tmpv_5d", Ls_);
envTmpLat(FermionField, "tmpw_5d", Ls_);
envTmp(std::vector<FermionField>, "w", 1, N, FermionField(env().getGrid(1)));
envTmp(std::vector<FermionField>, "v", 1, N, FermionField(env().getGrid(1)));
envTmp(Eigen::MatrixXcd, "MF", 1, Eigen::MatrixXcd::Zero(nt, N * N));
envTmp(std::vector<FermionField>, "w_block", 1, Nblock, FermionField(env().getGrid(1)));
envTmp(std::vector<FermionField>, "v_block", 1, Nblock, FermionField(env().getGrid(1)));
}
// execution ///////////////////////////////////////////////////////////////////
@@ -130,6 +173,7 @@ void TMesonFieldGamma<FImpl>::execute(void)
int N = par().N;
int nt = env().getDim(Tp);
int Nblock = par().Nblock;
std::vector<Result> result;
std::vector<Gamma::Algebra> gammaResultList;
@@ -145,33 +189,54 @@ void TMesonFieldGamma<FImpl>::execute(void)
{
result[i].gamma = gammaResultList[i];
result[i].MesonField.resize(N, std::vector<std::vector<ComplexD>>(N, std::vector<ComplexD>(nt)));
result[i].MesonFiield.resize(N, std::vector<ComplexD>(nt));
Gamma gamma(gammaResultList[i]);
gammaList[i] = gamma;
}
std::vector<ComplexD> MesonField_ij;
MesonField_ij.resize(nt);
auto &a2a1 = envGet(A2ABase, par().A2A1 + "_class");
auto &a2a2 = envGet(A2ABase, par().A2A2 + "_class");
envGetTmp(FermionField, w);
envGetTmp(FermionField, v);
envGetTmp(FermionField, tmpv_5d);
envGetTmp(FermionField, tmpw_5d);
for (unsigned int i = 0; i < N; i++)
envGetTmp(std::vector<FermionField>, v);
envGetTmp(std::vector<FermionField>, w);
LOG(Message) << "Finding v and w vectors for N = " << N << std::endl;
for (int i = 0; i < N; i++)
{
a2a1.return_w(i, tmpw_5d, w);
for (unsigned int j = 0; j < N; j++)
a2a2.return_v(i, tmpv_5d, v[i]);
a2a1.return_w(i, tmpw_5d, w[i]);
}
LOG(Message) << "Found v and w vectors for N = " << N << std::endl;
std::vector<std::vector<ComplexD>> MesonField_ij;
LOG(Message) << "Before blocked MFs, Nblock = " << Nblock << std::endl;
envGetTmp(std::vector<FermionField>, v_block);
envGetTmp(std::vector<FermionField>, w_block);
MesonField_ij.resize(Nblock * Nblock, std::vector<ComplexD>(nt));
envGetTmp(Eigen::MatrixXcd, MF);
LOG(Message) << "Before blocked MFs, Nblock = " << Nblock << std::endl;
for (unsigned int i = 0; i < N; i += Nblock)
{
vectorOfWs(w, i, Nblock, tmpw_5d, w_block);
for (unsigned int j = 0; j < N; j += Nblock)
{
a2a2.return_v(j, tmpv_5d, v);
vectorOfVs(v, j, Nblock, tmpv_5d, v_block);
for (unsigned int k = 0; k < result.size(); k++)
{
v = gammaList[k]*v;
sliceInnerProductVector(MesonField_ij, w, v, Tp);
result[k].MesonField[i][j] = MesonField_ij;
gammaMult(v_block, gammaList[k]);
sliceInnerProductMesonField(MesonField_ij, w_block, v_block, Tp);
for (unsigned int nj = 0; nj < Nblock; nj++)
{
for (unsigned int ni = 0; ni < Nblock; ni++)
{
MF.col((i + ni) + (j + nj) * N) = Eigen::VectorXcd::Map(&MesonField_ij[nj * Nblock + ni][0], MesonField_ij[nj * Nblock + ni].size());
}
}
}
}
if (i % 10 == 0)
@@ -179,7 +244,22 @@ void TMesonFieldGamma<FImpl>::execute(void)
LOG(Message) << "MF for i = " << i << " of " << N << std::endl;
}
}
result[0].last = MesonField_ij[7];
LOG(Message) << "Before Global sum, Nblock = " << Nblock << std::endl;
v_block[0]._grid->GlobalSumVector(MF.data(), MF.size());
LOG(Message) << "After Global sum, Nblock = " << Nblock << std::endl;
for (unsigned int i = 0; i < N; i++)
{
for (unsigned int j = 0; j < N; j++)
{
for (unsigned int k = 0; k < result.size(); k++)
{
for (unsigned int t = 0; t < nt; t++)
{
result[k].MesonField[i][j][t] = MF.col(i + N * j)[t];
}
}
}
}
saveResult(par().output, "meson", result);
}
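For reference (not part of this commit): a standalone Eigen/STL sketch of the column packing used in execute() above. The Nblock x Nblock block of time-sliced results for outer indices (i, j) is mapped into column (i + ni) + (j + nj) * N of an nt x N*N matrix, which is the convention the later read-back loop relies on. Sizes and values are illustrative only.

#include <Eigen/Dense>
#include <complex>
#include <vector>

using ComplexD = std::complex<double>;

int main(void)
{
    // Illustrative sizes only.
    const int N = 4, Nblock = 2, nt = 8;

    Eigen::MatrixXcd MF = Eigen::MatrixXcd::Zero(nt, N * N);

    // One block of results, as produced by sliceInnerProductMesonField:
    // block entry (ni, nj) sits at flattened index ni + Nblock * nj.
    std::vector<std::vector<ComplexD>> MesonField_ij(
        Nblock * Nblock, std::vector<ComplexD>(nt, ComplexD(1.0, 0.0)));

    for (int i = 0; i < N; i += Nblock)
        for (int j = 0; j < N; j += Nblock)
            for (int nj = 0; nj < Nblock; nj++)
                for (int ni = 0; ni < Nblock; ni++)
                    MF.col((i + ni) + (j + nj) * N) = Eigen::VectorXcd::Map(
                        MesonField_ij[nj * Nblock + ni].data(), nt);

    // Read-back convention used when filling result[k].MesonField[i][j][t]:
    // column i + N * j holds the (i, j) correlator, row t the time slice.
    ComplexD val = MF(0, 0 + N * 1); // (i, j) = (0, 1), t = 0
    (void)val;
    return 0;
}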

View File

@@ -41,7 +41,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
std::cout << GridLogMessage << "Start alloc innerProduct" << std::endl;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
GridBase *grid = left._grid;
@@ -50,8 +49,6 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
std::cout << GridLogMessage << "End alloc innerProduct" << std::endl;
std::cout << GridLogMessage << "Start parallel for innerProduct" << std::endl;
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
@@ -65,18 +62,12 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
std::cout << GridLogMessage << "End parallel for innerProduct" << std::endl;
std::cout << GridLogMessage << "Start inner sum innerProduct" << std::endl;
inner=0.0;
for(int i=0;i<grid->SumArraySize();i++){
inner = inner+sumarray[i*pad];
}
right._grid->GlobalSum(inner);
return inner;
std::cout << GridLogMessage << "End inner sum innerProduct" << std::endl;
std::cout << GridLogMessage << "End innerProduct" << std::endl;
}
/////////////////////////
@@ -285,7 +276,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
template<class vobj>
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
std::cout << GridLogMessage << "Start mySsliceInnerProductVector" << std::endl;
std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
typedef typename vobj::scalar_type scalar_type;
std::vector<scalar_type> lsSum;
@@ -313,12 +304,12 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::cout << GridLogMessage << "Start alloc" << std::endl;
// std::cout << GridLogMessage << "Start alloc" << std::endl;
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type>> extracted(Nsimd); // splitting the SIMD
std::cout << GridLogMessage << "End alloc" << std::endl;
// std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
@@ -328,8 +319,8 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
std::cout << GridLogMessage << "End prep" << std::endl;
std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
// std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv;
parallel_for(int r=0;r<rd;r++)
{
@@ -339,12 +330,12 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(lhs._odata[ss]._internal, rhs._odata[ss]._internal));
vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
lvSum[r] = lvSum[r] + vv;
}
}
}
std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
@@ -364,7 +355,7 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
}
}
std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
}
template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
@@ -376,7 +367,7 @@ static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const L
// sum over nodes.
std::vector<scalar_type> gsum;
gsum.resize(fd, scalar_type(0.0));
std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
@@ -384,10 +375,10 @@ static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const L
gsum[t]=lsSum[lt];
}
}
std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
grid->GlobalSumVector(&gsum[0], fd);
std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
result = gsum;
}
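For reference (not part of this commit): a minimal standalone sketch of the local/global split used by localSliceInnerProductVector and globalSliceInnerProductVector. Each rank sums only its own ld time slices, scatters them into a full-length vector (zero where it does not own slice t, using pt = t/ld and lt = t%ld), and a single deferred global sum then gives every rank the complete result. Here fakeGlobalSumVector is a hypothetical stand-in for grid->GlobalSumVector, which performs the cross-node sum in Grid; sizes and values are illustrative only.

#include <cassert>
#include <complex>
#include <vector>

using ComplexD = std::complex<double>;

// Stand-in for grid->GlobalSumVector: element-wise sum of each rank's contribution.
static void fakeGlobalSumVector(std::vector<ComplexD> &acc,
                                const std::vector<ComplexD> &other)
{
    for (std::size_t t = 0; t < acc.size(); t++) acc[t] += other[t];
}

int main(void)
{
    // Illustrative sizes: fd global time slices split over two ranks of ld each.
    const int fd = 8, ld = 4;

    // Each rank holds only its local slice sums lsSum[0..ld).
    std::vector<std::vector<ComplexD>> lsSum = {
        std::vector<ComplexD>(ld, ComplexD(1.0, 0.0)),  // rank 0
        std::vector<ComplexD>(ld, ComplexD(2.0, 0.0))}; // rank 1

    // Each rank scatters into a full-length vector, zero where it does not own t.
    std::vector<std::vector<ComplexD>> gsum(2, std::vector<ComplexD>(fd, ComplexD(0.0)));
    for (int rank = 0; rank < 2; rank++)
        for (int t = 0; t < fd; t++) {
            int pt = t / ld; // processor plane owning slice t
            int lt = t % ld; // local slice index on that plane
            if (pt == rank) gsum[rank][t] = lsSum[rank][lt];
        }

    // One deferred global sum then gives every rank the full time-sliced result.
    std::vector<ComplexD> result = gsum[0];
    fakeGlobalSumVector(result, gsum[1]);
    assert(result[0] == ComplexD(1.0, 0.0) && result[ld] == ComplexD(2.0, 0.0));
    return 0;
}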

View File

@@ -106,7 +106,6 @@ inline vRealD innerProductD(const vRealF &l,const vRealF &r){
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret;
ret=zero;
// std::cout << GridLogMessage << "innerProduct iVector" << std::endl;
for(int c1=0;c1<N;c1++){
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
}
@@ -130,34 +129,9 @@ inline vRealD innerProductD(const vRealF &l,const vRealF &r){
{
typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
// std::cout << GridLogMessage << "innerProduct iScalar" << std::endl;
ret._internal = innerProduct(lhs._internal,rhs._internal);
return ret;
}
template<class l,class r,int N> inline
auto myInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret;
ret=zero;
std::cout << GridLogMessage << "myInnerProduct iVector, N = " << N << std::endl;
for(int c1=0;c1<N;c1++){
ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
}
return ret;
}
template<class l,class r> inline
auto myInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
{
typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
std::cout << GridLogMessage << "myInnerProduct iScalar" << std::endl;
ret._internal = myInnerProduct(lhs._internal,rhs._internal);
return ret;
}
}
#endif