1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Use thread loops for now; work out later which of these loops can be GPU accelerated

This commit is contained in:
paboyle 2018-01-24 13:40:30 +00:00
parent e9c8ba5ef7
commit 43f244badf

View File

@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.checkerboard = cb; half.checkerboard = cb;
parallel_for(int ss=0;ss<full._grid->oSites();ss++){ thread_loop( (int ss=0;ss<full._grid->oSites();ss++),{
int cbos; int cbos;
std::vector<int> coor; std::vector<int> coor;
full._grid->oCoorFromOindex(coor,ss); full._grid->oCoorFromOindex(coor,ss);
@ -61,11 +61,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
int ssh=half._grid->oIndex(coor); int ssh=half._grid->oIndex(coor);
half._odata[ssh] = full._odata[ss]; half._odata[ssh] = full._odata[ss];
} }
} });
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.checkerboard; int cb = half.checkerboard;
parallel_for(int ss=0;ss<full._grid->oSites();ss++){ thread_loop( (int ss=0;ss<full._grid->oSites();ss++), {
std::vector<int> coor; std::vector<int> coor;
int cbos; int cbos;
@ -76,7 +76,7 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
int ssh=half._grid->oIndex(coor); int ssh=half._grid->oIndex(coor);
full._odata[ss]=half._odata[ssh]; full._odata[ss]=half._odata[ssh];
} }
} });
} }
@ -106,7 +106,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
coarseData=zero; coarseData=zero;
// Loop over fine sites in parallel; each fine site accumulates into its associated coarse site. // Loop over fine sites in parallel; each fine site accumulates into its associated coarse site.
parallel_for(int sf=0;sf<fine->oSites();sf++){ thread_loop( (int sf=0;sf<fine->oSites();sf++),{
int sc; int sc;
std::vector<int> coor_c(_ndimension); std::vector<int> coor_c(_ndimension);
@ -115,14 +115,14 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL thread_critical {
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
coarseData._odata[sc](i)=coarseData._odata[sc](i) coarseData._odata[sc](i)=coarseData._odata[sc](i)
+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]); + innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
} }
} }
});
return; return;
} }
@ -151,7 +151,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
parallel_for(int sf=0;sf<fine->oSites();sf++){ thread_loop( (int sf=0;sf<fine->oSites();sf++),{
int sc; int sc;
std::vector<int> coor_c(_ndimension); std::vector<int> coor_c(_ndimension);
@ -164,7 +164,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
// z = A x + y // z = A x + y
fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf]; fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
} });
return; return;
} }
@ -184,9 +184,9 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
// Precision promotion? // Precision promotion?
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
parallel_for(int ss=0;ss<coarse->oSites();ss++){ thread_loop( (int ss=0;ss<coarse->oSites();ss++),{
CoarseInner._odata[ss] = coarse_inner._odata[ss]; CoarseInner._odata[ss] = coarse_inner._odata[ss];
} });
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX) inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
@ -218,22 +218,23 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
// Turn this around to loop threaded over sc and interior loop // Turn this around to loop threaded over sc and interior loop
// over sf would thread better // over sf would thread better
coarseData=zero; coarseData=zero;
parallel_region { thread_region {
int sc; int sc;
std::vector<int> coor_c(_ndimension); std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); std::vector<int> coor_f(_ndimension);
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){ thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL thread_critical {
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf]; coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
}
} });
} }
return; return;
} }
@ -266,7 +267,6 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
GridBase *fine = Basis[0]._grid; GridBase *fine = Basis[0]._grid;
int nbasis = Basis.size() ; int nbasis = Basis.size() ;
int _ndimension = coarse->_ndimension;
// checks // checks
subdivides(coarse,fine); subdivides(coarse,fine);
@ -308,12 +308,12 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
parallel_region { thread_region {
int sc; int sc;
std::vector<int> coor_c(_ndimension); std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); std::vector<int> coor_f(_ndimension);
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){ thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
@ -323,7 +323,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf]; if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf]; else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
} }
} });
} }
return; return;
@ -351,7 +351,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->lSites() == og->lSites()); assert(ig->lSites() == og->lSites());
} }
parallel_for(int idx=0;idx<ig->lSites();idx++){ thread_loop( (int idx=0;idx<ig->lSites();idx++),{
sobj s; sobj s;
ssobj ss; ssobj ss;
@ -360,7 +360,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
peekLocalSite(s,in,lcoor); peekLocalSite(s,in,lcoor);
ss=s; ss=s;
pokeLocalSite(ss,out,lcoor); pokeLocalSite(ss,out,lcoor);
} });
} }
@ -389,7 +389,7 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_loop( (int idx=0;idx<lg->lSites();idx++),{
sobj s; sobj s;
std::vector<int> lcoor(nl); std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh); std::vector<int> hcoor(nh);
@ -403,7 +403,7 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDim,hcoor);
} });
} }
template<class vobj> template<class vobj>
@ -430,7 +430,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_loop((int idx=0;idx<lg->lSites();idx++),{
sobj s; sobj s;
std::vector<int> lcoor(nl); std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh); std::vector<int> hcoor(nh);
@ -444,7 +444,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDim,lcoor);
} });
} }
@ -469,7 +469,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_loop( (int idx=0;idx<lg->lSites();idx++),{
sobj s; sobj s;
std::vector<int> lcoor(nl); std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh); std::vector<int> hcoor(nh);
@ -480,7 +480,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDim,hcoor);
} }
} });
} }
@ -504,7 +504,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_loop( (int idx=0;idx<lg->lSites();idx++),{
sobj s; sobj s;
std::vector<int> lcoor(nl); std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh); std::vector<int> hcoor(nh);
@ -515,7 +515,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDim,lcoor);
} }
} });
} }
@ -575,8 +575,9 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
in_icoor[lane].resize(ndim); in_icoor[lane].resize(ndim);
in_grid->iCoorFromIindex(in_icoor[lane], lane); in_grid->iCoorFromIindex(in_icoor[lane], lane);
} }
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index //loop over outer index
thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd); std::vector<sobj*> out_ptrs(in_nsimd);
@ -597,7 +598,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
//Unpack into those ptrs //Unpack into those ptrs
const vobj & in_vobj = in._odata[in_oidx]; const vobj & in_vobj = in._odata[in_oidx];
extract1(in_vobj, out_ptrs, 0); extract1(in_vobj, out_ptrs, 0);
} });
} }
//Copy array of scalar objects in lexicographic order into a SIMD-vectorized lattice //Copy array of scalar objects in lexicographic order into a SIMD-vectorized lattice
template<typename vobj, typename sobj> template<typename vobj, typename sobj>
@ -621,7 +622,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
std::vector<sobj*> ptrs(nsimd); std::vector<sobj*> ptrs(nsimd);
@ -645,12 +646,13 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
vobj vecobj; vobj vecobj;
merge1(vecobj, ptrs, 0); merge1(vecobj, ptrs, 0);
out._odata[oidx] = vecobj; out._odata[oidx] = vecobj;
} });
} }
//Convert a Lattice from one precision to another //Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
assert(out._grid->Nd() == in._grid->Nd()); assert(out._grid->Nd() == in._grid->Nd());
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
GridBase *in_grid=in._grid; GridBase *in_grid=in._grid;
@ -672,7 +674,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
std::vector<SobjOut> in_slex_conv(in_grid->lSites()); std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in); unvectorizeToLexOrdArray(in_slex_conv, in);
parallel_for(uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){ thread_loop( (uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++),{
std::vector<int> out_ocoor(ndim); std::vector<int> out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx); out_grid->oCoorFromOindex(out_ocoor, out_oidx);
@ -688,7 +690,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
ptrs[lane] = &in_slex_conv[llex]; ptrs[lane] = &in_slex_conv[llex];
} }
merge(out._odata[out_oidx], ptrs, 0); merge(out._odata[out_oidx], ptrs, 0);
} });
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -789,9 +791,9 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
for(int v=0;v<nvector;v++){ for(int v=0;v<nvector;v++){
unvectorizeToLexOrdArray(scalardata,full[v]); unvectorizeToLexOrdArray(scalardata,full[v]);
parallel_for(int site=0;site<lsites;site++){ thread_loop( (int site=0;site<lsites;site++),{
alldata[v*lsites+site] = scalardata[site]; alldata[v*lsites+site] = scalardata[site];
} });
} }
int nvec = nvector; // Counts down to 1 as we collapse dims int nvec = nvector; // Counts down to 1 as we collapse dims
@ -821,7 +823,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol); int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
// Loop over reordered data post A2A // Loop over reordered data post A2A
parallel_for(int c=0;c<chunk;c++){ thread_loop( (int c=0;c<chunk;c++),{
std::vector<int> coor(ndim); std::vector<int> coor(ndim);
for(int m=0;m<M;m++){ for(int m=0;m<M;m++){
for(int s=0;s<sP;s++){ for(int s=0;s<sP;s++){
@ -844,7 +846,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
} }
} }
} });
ldims[d]*= ratio[d]; ldims[d]*= ratio[d];
lsites *= ratio[d]; lsites *= ratio[d];
@ -940,7 +942,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
{ {
// Loop over reordered data post A2A // Loop over reordered data post A2A
parallel_for(int c=0;c<chunk;c++){ thread_loop( (int c=0;c<chunk;c++),{
std::vector<int> coor(ndim); std::vector<int> coor(ndim);
for(int m=0;m<M;m++){ for(int m=0;m<M;m++){
for(int s=0;s<sP;s++){ for(int s=0;s<sP;s++){
@ -962,7 +964,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
tmpdata[lex_c] = alldata[lex_r]; tmpdata[lex_c] = alldata[lex_r];
} }
} }
} });
} }
if ( split_grid->_processors[d] > 1 ) { if ( split_grid->_processors[d] > 1 ) {
@ -979,10 +981,10 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
lsites = full_grid->lSites(); lsites = full_grid->lSites();
for(int v=0;v<nvector;v++){ for(int v=0;v<nvector;v++){
// assert(v<full.size()); // assert(v<full.size());
parallel_for(int site=0;site<lsites;site++){ thread_loop( (int site=0;site<lsites;site++),{
// assert(v*lsites+site < alldata.size()); // assert(v*lsites+site < alldata.size());
scalardata[site] = alldata[v*lsites+site]; scalardata[site] = alldata[v*lsites+site];
} });
vectorizeFromLexOrdArray(scalardata,full[v]); vectorizeFromLexOrdArray(scalardata,full[v]);
} }
} }