mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Reduce volume to interior for coarse stencil -- worth up to 4x gain
This commit is contained in:
parent
0a3682ad0b
commit
e859a199df
@ -85,7 +85,7 @@ public:
|
|||||||
}
|
}
|
||||||
std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <<std::endl;
|
std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <<std::endl;
|
||||||
|
|
||||||
StencilMasked.resize(_CoarseGridMulti->oSites());
|
StencilMasked.resize(_CoarseGridMulti->oSites()*geom.npoint);
|
||||||
std::vector<GeneralStencilEntryReordered> StencilTmp;
|
std::vector<GeneralStencilEntryReordered> StencilTmp;
|
||||||
|
|
||||||
int32_t j=0;
|
int32_t j=0;
|
||||||
@ -103,15 +103,18 @@ public:
|
|||||||
for(int32_t point = 0 ; point < geom.npoint; point++){
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
||||||
int i=s*geom.npoint+point;
|
int i=s*geom.npoint+point;
|
||||||
tmp._offset = Stencil._entries[i]._offset;
|
tmp._offset = Stencil._entries[i]._offset;
|
||||||
tmp._permute= Stencil._entries[i]._permute;
|
tmp._permute= Stencil._entries[i]._permute; // Should be no premute and j=site
|
||||||
tmp._output = j;
|
tmp._input = s;
|
||||||
StencilTmp.push_back(tmp);
|
StencilTmp.push_back(tmp);
|
||||||
}
|
}
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout << "coarse osites x npoint "<<_CoarseGridMulti->oSites()*geom.npoint<< " stencil interior size "<< StencilTmp.size()<<std::endl;
|
|
||||||
assert(_CoarseGridMulti->lSites()*geom.npoint==StencilTmp.size());
|
std::cout << " oSites " << _CoarseGridMulti->oSites()<<std::endl;
|
||||||
|
std::cout << " npoint " << geom.npoint<<std::endl;
|
||||||
|
std::cout << " StencilTmp "<<StencilTmp.size();
|
||||||
|
assert(_CoarseGridMulti->oSites()*geom.npoint==StencilTmp.size());
|
||||||
acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
|
acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
|
||||||
CopyMatrix();
|
CopyMatrix();
|
||||||
}
|
}
|
||||||
@ -152,24 +155,20 @@ public:
|
|||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
|
||||||
int64_t osites=pin.Grid()->oSites();
|
RealD flops,bytes;
|
||||||
int64_t nrhs =pin.Grid()->GlobalDimensions()[0]/Nsimd;
|
int64_t nrhs =pin.Grid()->GlobalDimensions()[0]/Nsimd;
|
||||||
assert(nrhs>=1);
|
assert(nrhs>=1);
|
||||||
|
|
||||||
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
#if 0
|
||||||
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
|
||||||
+ 2.0*osites*sizeof(siteVector)*npoint;
|
|
||||||
|
|
||||||
{
|
{
|
||||||
tviews-=usecond();
|
tviews-=usecond();
|
||||||
autoView( in_v , pin, AcceleratorRead);
|
autoView( in_v , pin, AcceleratorRead);
|
||||||
autoView( out_v , pout, AcceleratorWriteDiscard);
|
autoView( out_v , pout, AcceleratorWriteDiscard);
|
||||||
autoView( Stencil_v , Stencil, AcceleratorRead);
|
|
||||||
tviews+=usecond();
|
tviews+=usecond();
|
||||||
|
|
||||||
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
||||||
ttemps-=usecond();
|
ttemps-=usecond();
|
||||||
MultTemporaries.resize(npoint,pin.Grid());
|
MultTemporaries.resize(npoint,in.Grid());
|
||||||
ttemps+=usecond();
|
ttemps+=usecond();
|
||||||
|
|
||||||
std::vector<Aview> AcceleratorViewContainer_h;
|
std::vector<Aview> AcceleratorViewContainer_h;
|
||||||
@ -194,10 +193,16 @@ public:
|
|||||||
tcopy+=usecond();
|
tcopy+=usecond();
|
||||||
|
|
||||||
int32_t bound = _A[0].size();
|
int32_t bound = _A[0].size();
|
||||||
|
int64_t osites=pin.Grid()->oSites();
|
||||||
|
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
||||||
|
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
||||||
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
||||||
|
|
||||||
std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
|
std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
|
||||||
std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
||||||
std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
||||||
tmult-=usecond();
|
tmult-=usecond();
|
||||||
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
|
accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
|
||||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
int32_t ss = rspb/(nbasis*npoint);
|
int32_t ss = rspb/(nbasis*npoint);
|
||||||
@ -228,6 +233,7 @@ public:
|
|||||||
});
|
});
|
||||||
tmult2+=usecond();
|
tmult2+=usecond();
|
||||||
tmult+=usecond();
|
tmult+=usecond();
|
||||||
|
|
||||||
for(int p=0;p<npoint;p++) {
|
for(int p=0;p<npoint;p++) {
|
||||||
AcceleratorVecViewContainer_h[p].ViewClose();
|
AcceleratorVecViewContainer_h[p].ViewClose();
|
||||||
}
|
}
|
||||||
@ -237,6 +243,85 @@ public:
|
|||||||
out = Cell.Extract(pout);
|
out = Cell.Extract(pout);
|
||||||
text+=usecond();
|
text+=usecond();
|
||||||
ttot+=usecond();
|
ttot+=usecond();
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
tviews-=usecond();
|
||||||
|
autoView( in_v , pin, AcceleratorRead);
|
||||||
|
autoView( out_v , out, AcceleratorWriteDiscard);
|
||||||
|
tviews+=usecond();
|
||||||
|
|
||||||
|
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
||||||
|
ttemps-=usecond();
|
||||||
|
MultTemporaries.resize(npoint,in.Grid());
|
||||||
|
ttemps+=usecond();
|
||||||
|
|
||||||
|
std::vector<Aview> AcceleratorViewContainer_h;
|
||||||
|
std::vector<Vview> AcceleratorVecViewContainer_h;
|
||||||
|
|
||||||
|
tviews-=usecond();
|
||||||
|
for(int p=0;p<npoint;p++) {
|
||||||
|
AcceleratorViewContainer_h.push_back( &_A[p][0]);
|
||||||
|
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
|
||||||
|
}
|
||||||
|
tviews+=usecond();
|
||||||
|
|
||||||
|
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
|
||||||
|
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
|
||||||
|
|
||||||
|
auto Aview_p = &AcceleratorViewContainer[0];
|
||||||
|
auto Vview_p = &AcceleratorVecViewContainer[0];
|
||||||
|
|
||||||
|
tcopy-=usecond();
|
||||||
|
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
|
||||||
|
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
|
||||||
|
tcopy+=usecond();
|
||||||
|
|
||||||
|
int32_t bound = _A[0].size();
|
||||||
|
int64_t osites=in.Grid()->oSites();
|
||||||
|
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
||||||
|
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
||||||
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
||||||
|
|
||||||
|
std::cout << " osites "<<osites <<" bound "<<bound<< " stencilsize "<<StencilMasked.size()<<std::endl;
|
||||||
|
std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
||||||
|
std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
||||||
|
tmult-=usecond();
|
||||||
|
auto Stencil_v = &StencilMasked[0];
|
||||||
|
accelerator_for(rspb, StencilMasked.size()*nbasis, Nsimd, {
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
int32_t ss = rspb/(nbasis*npoint); // site of unpadded
|
||||||
|
int32_t bp = rspb%(nbasis*npoint);
|
||||||
|
int32_t point= bp/nbasis;
|
||||||
|
int32_t b = bp%nbasis;
|
||||||
|
auto SE = &Stencil_v[ss*npoint+point];
|
||||||
|
int32_t s = SE->_input;
|
||||||
|
int32_t snbr= SE->_offset;
|
||||||
|
std::cout << " unpadded " << ss<<" padded " << s<< " point "<<point <<" row " <<b<<std::endl;
|
||||||
|
auto nbr = coalescedRead(in_v[snbr]);
|
||||||
|
auto res = Aview_p[point][s](0,b)*nbr(0);
|
||||||
|
for(int bb=1;bb<nbasis;bb++) {
|
||||||
|
res = res + Aview_p[point][s](bb,b)*nbr(bb);
|
||||||
|
}
|
||||||
|
coalescedWrite(Vview_p[point][ss](b),res);
|
||||||
|
});
|
||||||
|
tmult2-=usecond();
|
||||||
|
accelerator_for(sb, osites*nbasis, Nsimd, {
|
||||||
|
int ss = sb/nbasis;
|
||||||
|
int b = sb%nbasis;
|
||||||
|
auto res = coalescedRead(Vview_p[0][ss](b));
|
||||||
|
for(int point=1;point<npoint;point++){
|
||||||
|
res = res + coalescedRead(Vview_p[point][ss](b));
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
tmult2+=usecond();
|
||||||
|
tmult+=usecond();
|
||||||
|
for(int p=0;p<npoint;p++) {
|
||||||
|
AcceleratorVecViewContainer_h[p].ViewClose();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ttot+=usecond();
|
||||||
|
#endif
|
||||||
|
|
||||||
std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
|
||||||
std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
||||||
|
@ -34,7 +34,7 @@ struct GeneralStencilEntry {
|
|||||||
uint8_t _permute; // 1 bytes // Horrible alignment properties
|
uint8_t _permute; // 1 bytes // Horrible alignment properties
|
||||||
};
|
};
|
||||||
struct GeneralStencilEntryReordered : public GeneralStencilEntry {
|
struct GeneralStencilEntryReordered : public GeneralStencilEntry {
|
||||||
uint64_t _output;
|
uint64_t _input;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Could pack to 8 + 4 + 4 = 128 bit and use
|
// Could pack to 8 + 4 + 4 = 128 bit and use
|
||||||
|
@ -107,7 +107,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
const int nbasis = 32;
|
const int nbasis = 8;
|
||||||
const int cb = 0 ;
|
const int cb = 0 ;
|
||||||
LatticeFermion prom(FGrid);
|
LatticeFermion prom(FGrid);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user