mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 11:15:55 +01:00
Reimplemented GparityWilsonImpl::InsertForce5D to run efficiently on GPUs
Swapped order of templated tensor code and c-number specializations in Tensor_outer.h to fix compile issue with type deduction on Summit
This commit is contained in:
parent
a0ca362690
commit
ba5dc670a5
@ -30,7 +30,6 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
/*
|
||||
Policy implementation for G-parity boundary conditions
|
||||
|
||||
@ -358,28 +357,41 @@ public:
|
||||
// Placeholder: the body consists solely of assert(0), so any call trips the
// assertion when assertions are enabled. Neither argument is read or written.
inline void extractLinkField(std::vector<GaugeLinkField> &mat,
                             DoubledGaugeField &Uds)
{
  assert(0);
}
|
||||
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) {
|
||||
|
||||
int Ls = Btilde.Grid()->_fdimensions[0];
|
||||
|
||||
GaugeLinkField tmp(mat.Grid());
|
||||
tmp = Zero();
|
||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp, CpuWrite);
|
||||
autoView( Atilde_v , Atilde, CpuRead);
|
||||
autoView( Btilde_v , Btilde, CpuRead);
|
||||
thread_for(ss,tmp.Grid()->oSites(),{
|
||||
for (int s = 0; s < Ls; s++) {
|
||||
int sF = s + Ls * ss;
|
||||
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
|
||||
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
||||
}
|
||||
});
|
||||
autoView( mat_v , mat, AcceleratorWrite);
|
||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||
accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{
|
||||
int sU=sss;
|
||||
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
|
||||
ColorMatrixType sum;
|
||||
zeroit(sum);
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
for(int spn=0;spn<Ns;spn++){ //sum over spin
|
||||
//Flavor 0
|
||||
auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
|
||||
auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
|
||||
sum = sum + outerProduct(bb,aa);
|
||||
|
||||
//Flavor 1
|
||||
bb = coalescedRead(Btilde_v[sF](1)(spn) );
|
||||
aa = coalescedRead(Atilde_v[sF](1)(spn) );
|
||||
sum = sum + conjugate(outerProduct(bb,aa));
|
||||
}
|
||||
}
|
||||
coalescedWrite(mat_v[sU](mu)(), sum);
|
||||
});
|
||||
}
|
||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
@ -35,6 +35,17 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Vector x Vector -> Matrix
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class CC,IfComplex<CC> = 0>
|
||||
accelerator_inline CC outerProduct(const CC &l, const CC& r)
|
||||
{
|
||||
return l*conj(r);
|
||||
}
|
||||
template<class RR,IfReal<RR> = 0>
|
||||
accelerator_inline RR outerProduct(const RR &l, const RR& r)
|
||||
{
|
||||
return l*r;
|
||||
}
|
||||
|
||||
template<class l,class r,int N> accelerator_inline
|
||||
auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
|
||||
{
|
||||
@ -57,16 +68,6 @@ auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<declt
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Complex c-number overload: returns l multiplied by conj(r).
// NOTE(review): this text is identical to the complex overload that appears
// earlier, ahead of the iVector overload. The hunk header (-57,16 +68,6) and
// the commit message ("Swapped order of templated tensor code and c-number
// specializations") suggest this is the pre-move position of the function,
// i.e. the side removed by the commit - confirm before treating it as a
// second live definition.
template<class CC,IfComplex<CC> = 0>
|
||||
accelerator_inline CC outerProduct(const CC &l, const CC& r)
|
||||
{
|
||||
return l*conj(r);
|
||||
}
|
||||
// Real c-number overload: returns the plain product l*r (no conjugation).
// NOTE(review): this text is identical to the real overload that appears
// earlier, ahead of the iVector overload. The hunk header (-57,16 +68,6)
// suggests this is the pre-move position removed by the commit - confirm
// before treating it as a second live definition.
template<class RR,IfReal<RR> = 0>
|
||||
accelerator_inline RR outerProduct(const RR &l, const RR& r)
|
||||
{
|
||||
return l*r;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user