mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-17 15:27:06 +01:00
Merge pull request #243 from fionnoh/feature/A2A_current_insertion
Feature/a2 a current insertion
This commit is contained in:
@ -317,116 +317,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
||||
{
|
||||
// std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
|
||||
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
std::vector<scalar_type> lsSum;
|
||||
localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
|
||||
globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
|
||||
// std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
|
||||
}
|
||||
|
||||
template <class vobj>
|
||||
static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
|
||||
{
|
||||
// std::cout << GridLogMessage << "Start prep" << std::endl;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
GridBase *grid = lhs.Grid();
|
||||
assert(grid!=NULL);
|
||||
conformable(grid,rhs.Grid());
|
||||
|
||||
const int Nd = grid->_ndimension;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
assert(orthogdim >= 0);
|
||||
assert(orthogdim < Nd);
|
||||
|
||||
int fd=grid->_fdimensions[orthogdim];
|
||||
int ld=grid->_ldimensions[orthogdim];
|
||||
int rd=grid->_rdimensions[orthogdim];
|
||||
// std::cout << GridLogMessage << "Start alloc" << std::endl;
|
||||
|
||||
Vector<vector_type> lvSum(rd); // will locally sum vectors first
|
||||
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
||||
// std::cout << GridLogMessage << "End alloc" << std::endl;
|
||||
|
||||
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
||||
for(int r=0;r<rd;r++){
|
||||
lvSum[r]=Zero();
|
||||
}
|
||||
|
||||
int e1= grid->_slice_nblock[orthogdim];
|
||||
int e2= grid->_slice_block [orthogdim];
|
||||
int stride=grid->_slice_stride[orthogdim];
|
||||
// std::cout << GridLogMessage << "End prep" << std::endl;
|
||||
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
|
||||
vector_type vv;
|
||||
auto l_v=lhs.View();
|
||||
auto r_v=rhs.View();
|
||||
thread_for( r,rd,{
|
||||
|
||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int ss = so + n * stride + b;
|
||||
vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
|
||||
lvSum[r] = lvSum[r] + vv;
|
||||
}
|
||||
}
|
||||
});
|
||||
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
|
||||
|
||||
// Sum across simd lanes in the plane, breaking out orthog dir.
|
||||
Coordinate icoor(Nd);
|
||||
for(int rt=0;rt<rd;rt++){
|
||||
|
||||
iScalar<vector_type> temp;
|
||||
temp._internal = lvSum[rt];
|
||||
extract(temp,extracted);
|
||||
|
||||
for(int idx=0;idx<Nsimd;idx++){
|
||||
|
||||
grid->iCoorFromIindex(icoor,idx);
|
||||
|
||||
int ldx =rt+icoor[orthogdim]*rd;
|
||||
|
||||
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
|
||||
|
||||
}
|
||||
}
|
||||
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
|
||||
}
|
||||
template <class vobj>
|
||||
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
|
||||
{
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
GridBase *grid = lhs.Grid();
|
||||
int fd = result.size();
|
||||
int ld = lsSum.size();
|
||||
// sum over nodes.
|
||||
std::vector<scalar_type> gsum;
|
||||
gsum.resize(fd, scalar_type(0.0));
|
||||
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
|
||||
for(int t=0;t<fd;t++){
|
||||
int pt = t/ld; // processor plane
|
||||
int lt = t%ld;
|
||||
if ( pt == grid->_processor_coor[orthogdim] ) {
|
||||
gsum[t]=lsSum[lt];
|
||||
}
|
||||
}
|
||||
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
|
||||
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
|
||||
grid->GlobalSumVector(&gsum[0], fd);
|
||||
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
|
||||
|
||||
result = gsum;
|
||||
}
|
||||
template<class vobj>
|
||||
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
||||
{
|
||||
|
@ -67,8 +67,21 @@ public:
|
||||
const std::vector<ComplexField> &emB1,
|
||||
int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);
|
||||
|
||||
static void ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
const Eigen::Tensor<ComplexD,3> &WW_sd,
|
||||
template <typename TensorType>
|
||||
typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
|
||||
std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
|
||||
void>::type
|
||||
static ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
const TensorType &WW_sd,
|
||||
const FermionField *vs,
|
||||
const FermionField *vd);
|
||||
|
||||
template <typename TensorType>
|
||||
typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
|
||||
std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
|
||||
void>::type
|
||||
static ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
const TensorType &WW_sd,
|
||||
const FermionField *vs,
|
||||
const FermionField *vd);
|
||||
|
||||
@ -98,6 +111,11 @@ public:
|
||||
const FermionField *vd,
|
||||
int orthogdim);
|
||||
#endif
|
||||
private:
|
||||
inline static void OuterProductWWVV(PropagatorField &WWVV,
|
||||
const vobj &lhs,
|
||||
const vobj &rhs,
|
||||
const int Ns, const int ss);
|
||||
};
|
||||
|
||||
template <class FImpl>
|
||||
@ -968,9 +986,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
|
||||
// Take WW_sd v^dag_d (x) v_s
|
||||
//
|
||||
|
||||
template<class FImpl>
|
||||
void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
const Eigen::Tensor<ComplexD,3> &WW_sd,
|
||||
template <class FImpl>
|
||||
template <typename TensorType>
|
||||
typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
|
||||
std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
|
||||
void>::type
|
||||
A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
const TensorType &WW_sd,
|
||||
const FermionField *vs,
|
||||
const FermionField *vd)
|
||||
{
|
||||
@ -992,39 +1014,100 @@ void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
|
||||
for(int t=0;t<N_t;t++){
|
||||
for(int s=0;s<N_s;s++){
|
||||
auto vs_v = vs[s].View();
|
||||
auto tmp1 = vs_v[ss];
|
||||
vobj tmp2 = Zero();
|
||||
vobj tmp3 = Zero();
|
||||
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
|
||||
auto vd_v = vd[d].View();
|
||||
Scalar_v coeff = WW_sd(t,s,d);
|
||||
tmp3 = conjugate(vd_v[ss]);
|
||||
mac(&tmp2, &coeff, &tmp3);
|
||||
}
|
||||
auto vs_v = vs[s].View();
|
||||
auto tmp1 = vs_v[ss];
|
||||
vobj tmp2 = Zero();
|
||||
vobj tmp3 = Zero();
|
||||
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
|
||||
auto vd_v = vd[d].View();
|
||||
Scalar_v coeff = WW_sd(t,s,d);
|
||||
tmp3 = conjugate(vd_v[ss]);
|
||||
mac(&tmp2, &coeff, &tmp3);
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
|
||||
//////////////////////////
|
||||
auto WWVV_v = WWVV[t].View();
|
||||
for(int s1=0;s1<Ns;s1++){
|
||||
for(int s2=0;s2<Ns;s2++){
|
||||
WWVV_v[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
|
||||
WWVV_v[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
|
||||
WWVV_v[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
|
||||
WWVV_v[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
|
||||
WWVV_v[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
|
||||
WWVV_v[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
|
||||
WWVV_v[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
|
||||
WWVV_v[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
|
||||
WWVV_v[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
|
||||
}}
|
||||
//////////////////////////
|
||||
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
|
||||
//////////////////////////
|
||||
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
|
||||
|
||||
}}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <class FImpl>
|
||||
template <typename TensorType>
|
||||
typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD, 3>, TensorType>::value ||
|
||||
std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
|
||||
void>::type
|
||||
A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
||||
const TensorType &WW_sd,
|
||||
const FermionField *vs,
|
||||
const FermionField *vd)
|
||||
{
|
||||
GridBase *grid = vs[0].Grid();
|
||||
|
||||
int nd = grid->_ndimension;
|
||||
int Nsimd = grid->Nsimd();
|
||||
int N_t = WW_sd.dimensions()[0];
|
||||
int N_s = WW_sd.dimensions()[1];
|
||||
int N_d = WW_sd.dimensions()[2];
|
||||
|
||||
int d_unroll = 32;// Empirical optimisation
|
||||
|
||||
Eigen::Matrix<Complex, -1, -1, Eigen::RowMajor> buf;
|
||||
|
||||
for(int t=0;t<N_t;t++){
|
||||
WWVV[t] = Zero();
|
||||
}
|
||||
|
||||
for (int t = 0; t < N_t; t++){
|
||||
std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
|
||||
buf = WW_sd[t];
|
||||
thread_for(ss,grid->oSites(),{
|
||||
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
|
||||
for(int s=0;s<N_s;s++){
|
||||
auto vs_v = vs[s].View();
|
||||
auto tmp1 = vs_v[ss];
|
||||
vobj tmp2 = Zero();
|
||||
vobj tmp3 = Zero();
|
||||
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
|
||||
auto vd_v = vd[d].View();
|
||||
Scalar_v coeff = buf(s,d);
|
||||
tmp3 = conjugate(vd_v[ss]);
|
||||
mac(&tmp2, &coeff, &tmp3);
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
|
||||
//////////////////////////
|
||||
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
|
||||
}}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <class FImpl>
|
||||
inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
|
||||
const vobj &lhs,
|
||||
const vobj &rhs,
|
||||
const int Ns, const int ss)
|
||||
{
|
||||
auto WWVV_v = WWVV.View();
|
||||
for (int s1 = 0; s1 < Ns; s1++){
|
||||
for (int s2 = 0; s2 < Ns; s2++){
|
||||
WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
|
||||
WWVV_v[ss]()(s1,s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
|
||||
WWVV_v[ss]()(s1,s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
|
||||
WWVV_v[ss]()(s1,s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
|
||||
WWVV_v[ss]()(s1,s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
|
||||
WWVV_v[ss]()(s1,s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
|
||||
WWVV_v[ss]()(s1,s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
|
||||
WWVV_v[ss]()(s1,s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
|
||||
WWVV_v[ss]()(s1,s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class FImpl>
|
||||
void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0,
|
||||
|
Reference in New Issue
Block a user