1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 11:15:55 +01:00

Reimplemented GparityWilsonImpl::InsertForce5D to run efficiently on GPUs

Swapped order of templated tensor code and c-number specializations in Tensor_outer.h to fix compile issue with type deduction on Summit
This commit is contained in:
Christopher Kelly 2020-12-22 10:10:07 -05:00
parent a0ca362690
commit ba5dc670a5
2 changed files with 42 additions and 29 deletions

View File

@ -30,7 +30,6 @@ directory
NAMESPACE_BEGIN(Grid);
/*
Policy implementation for G-parity boundary conditions
@ -358,28 +357,41 @@ public:
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
int Ls=Btilde.Grid()->_fdimensions[0];
{
autoView( tmp_v , tmp, CpuWrite);
autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
autoView( mat_v , mat, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
//Flavor 0
auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
sum = sum + outerProduct(bb,aa);
//Flavor 1
bb = coalescedRead(Btilde_v[sF](1)(spn) );
aa = coalescedRead(Atilde_v[sF](1)(spn) );
sum = sum + conjugate(outerProduct(bb,aa));
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}
};

View File

@ -35,6 +35,17 @@ NAMESPACE_BEGIN(Grid);
// Vector x Vector -> Matrix
///////////////////////////////////////////////////////////////////////////////////////
template<class CC,IfComplex<CC> = 0>
accelerator_inline CC outerProduct(const CC &l, const CC& r)
{
return l*conj(r);
}
template<class RR,IfReal<RR> = 0>
accelerator_inline RR outerProduct(const RR &l, const RR& r)
{
return l*r;
}
template<class l,class r,int N> accelerator_inline
auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
{
@ -57,16 +68,6 @@ auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<declt
return ret;
}
template<class CC,IfComplex<CC> = 0>
accelerator_inline CC outerProduct(const CC &l, const CC& r)
{
return l*conj(r);
}
template<class RR,IfReal<RR> = 0>
accelerator_inline RR outerProduct(const RR &l, const RR& r)
{
return l*r;
}
NAMESPACE_END(Grid);