mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Thread loops for now; figure out what can be GPU accelerated later here
This commit is contained in:
parent
e9c8ba5ef7
commit
43f244badf
@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
|||||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
||||||
half.checkerboard = cb;
|
half.checkerboard = cb;
|
||||||
|
|
||||||
parallel_for(int ss=0;ss<full._grid->oSites();ss++){
|
thread_loop( (int ss=0;ss<full._grid->oSites();ss++),{
|
||||||
int cbos;
|
int cbos;
|
||||||
std::vector<int> coor;
|
std::vector<int> coor;
|
||||||
full._grid->oCoorFromOindex(coor,ss);
|
full._grid->oCoorFromOindex(coor,ss);
|
||||||
@ -61,11 +61,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
|||||||
int ssh=half._grid->oIndex(coor);
|
int ssh=half._grid->oIndex(coor);
|
||||||
half._odata[ssh] = full._odata[ss];
|
half._odata[ssh] = full._odata[ss];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
||||||
int cb = half.checkerboard;
|
int cb = half.checkerboard;
|
||||||
parallel_for(int ss=0;ss<full._grid->oSites();ss++){
|
thread_loop( (int ss=0;ss<full._grid->oSites();ss++), {
|
||||||
std::vector<int> coor;
|
std::vector<int> coor;
|
||||||
int cbos;
|
int cbos;
|
||||||
|
|
||||||
@ -76,7 +76,7 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
|
|||||||
int ssh=half._grid->oIndex(coor);
|
int ssh=half._grid->oIndex(coor);
|
||||||
full._odata[ss]=half._odata[ssh];
|
full._odata[ss]=half._odata[ssh];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -106,7 +106,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
coarseData=zero;
|
coarseData=zero;
|
||||||
|
|
||||||
// Loop over coars parallel, and then loop over fine associated with coarse.
|
// Loop over coars parallel, and then loop over fine associated with coarse.
|
||||||
parallel_for(int sf=0;sf<fine->oSites();sf++){
|
thread_loop( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
std::vector<int> coor_c(_ndimension);
|
||||||
@ -115,14 +115,14 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
PARALLEL_CRITICAL
|
thread_critical {
|
||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
|
|
||||||
coarseData._odata[sc](i)=coarseData._odata[sc](i)
|
coarseData._odata[sc](i)=coarseData._odata[sc](i)
|
||||||
+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
|
+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -151,7 +151,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(int sf=0;sf<fine->oSites();sf++){
|
thread_loop( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
std::vector<int> coor_c(_ndimension);
|
||||||
@ -164,7 +164,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
// z = A x + y
|
// z = A x + y
|
||||||
fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
|
fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
|
||||||
|
|
||||||
}
|
});
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -184,9 +184,9 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
// Precision promotion?
|
// Precision promotion?
|
||||||
fine_inner = localInnerProduct(fineX,fineY);
|
fine_inner = localInnerProduct(fineX,fineY);
|
||||||
blockSum(coarse_inner,fine_inner);
|
blockSum(coarse_inner,fine_inner);
|
||||||
parallel_for(int ss=0;ss<coarse->oSites();ss++){
|
thread_loop( (int ss=0;ss<coarse->oSites();ss++),{
|
||||||
CoarseInner._odata[ss] = coarse_inner._odata[ss];
|
CoarseInner._odata[ss] = coarse_inner._odata[ss];
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class CComplex>
|
||||||
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
||||||
@ -218,22 +218,23 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
// Turn this around to loop threaded over sc and interior loop
|
// Turn this around to loop threaded over sc and interior loop
|
||||||
// over sf would thread better
|
// over sf would thread better
|
||||||
coarseData=zero;
|
coarseData=zero;
|
||||||
parallel_region {
|
thread_region {
|
||||||
|
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
std::vector<int> coor_c(_ndimension);
|
||||||
std::vector<int> coor_f(_ndimension);
|
std::vector<int> coor_f(_ndimension);
|
||||||
|
|
||||||
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
|
thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
PARALLEL_CRITICAL
|
thread_critical {
|
||||||
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
|
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
|
||||||
|
}
|
||||||
|
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -266,7 +267,6 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
|
|||||||
GridBase *fine = Basis[0]._grid;
|
GridBase *fine = Basis[0]._grid;
|
||||||
|
|
||||||
int nbasis = Basis.size() ;
|
int nbasis = Basis.size() ;
|
||||||
int _ndimension = coarse->_ndimension;
|
|
||||||
|
|
||||||
// checks
|
// checks
|
||||||
subdivides(coarse,fine);
|
subdivides(coarse,fine);
|
||||||
@ -308,12 +308,12 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Loop with a cache friendly loop ordering
|
// Loop with a cache friendly loop ordering
|
||||||
parallel_region {
|
thread_region {
|
||||||
int sc;
|
int sc;
|
||||||
std::vector<int> coor_c(_ndimension);
|
std::vector<int> coor_c(_ndimension);
|
||||||
std::vector<int> coor_f(_ndimension);
|
std::vector<int> coor_f(_ndimension);
|
||||||
|
|
||||||
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
|
thread_loop_in_region( (int sf=0;sf<fine->oSites();sf++),{
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
@ -323,7 +323,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
|
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
|
||||||
else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
|
else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -351,7 +351,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
|
|||||||
assert(ig->lSites() == og->lSites());
|
assert(ig->lSites() == og->lSites());
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(int idx=0;idx<ig->lSites();idx++){
|
thread_loop( (int idx=0;idx<ig->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
ssobj ss;
|
ssobj ss;
|
||||||
|
|
||||||
@ -360,7 +360,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
|
|||||||
peekLocalSite(s,in,lcoor);
|
peekLocalSite(s,in,lcoor);
|
||||||
ss=s;
|
ss=s;
|
||||||
pokeLocalSite(ss,out,lcoor);
|
pokeLocalSite(ss,out,lcoor);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -389,7 +389,7 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
parallel_for(int idx=0;idx<lg->lSites();idx++){
|
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
std::vector<int> lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
std::vector<int> hcoor(nh);
|
||||||
@ -403,7 +403,7 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
}
|
}
|
||||||
peekLocalSite(s,lowDim,lcoor);
|
peekLocalSite(s,lowDim,lcoor);
|
||||||
pokeLocalSite(s,higherDim,hcoor);
|
pokeLocalSite(s,higherDim,hcoor);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
@ -430,7 +430,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
parallel_for(int idx=0;idx<lg->lSites();idx++){
|
thread_loop((int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
std::vector<int> lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
std::vector<int> hcoor(nh);
|
||||||
@ -444,7 +444,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
}
|
}
|
||||||
peekLocalSite(s,higherDim,hcoor);
|
peekLocalSite(s,higherDim,hcoor);
|
||||||
pokeLocalSite(s,lowDim,lcoor);
|
pokeLocalSite(s,lowDim,lcoor);
|
||||||
}
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -469,7 +469,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
parallel_for(int idx=0;idx<lg->lSites();idx++){
|
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
std::vector<int> lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
std::vector<int> hcoor(nh);
|
||||||
@ -480,7 +480,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
peekLocalSite(s,lowDim,lcoor);
|
peekLocalSite(s,lowDim,lcoor);
|
||||||
pokeLocalSite(s,higherDim,hcoor);
|
pokeLocalSite(s,higherDim,hcoor);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -504,7 +504,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
parallel_for(int idx=0;idx<lg->lSites();idx++){
|
thread_loop( (int idx=0;idx<lg->lSites();idx++),{
|
||||||
sobj s;
|
sobj s;
|
||||||
std::vector<int> lcoor(nl);
|
std::vector<int> lcoor(nl);
|
||||||
std::vector<int> hcoor(nh);
|
std::vector<int> hcoor(nh);
|
||||||
@ -515,7 +515,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
|
|||||||
peekLocalSite(s,higherDim,hcoor);
|
peekLocalSite(s,higherDim,hcoor);
|
||||||
pokeLocalSite(s,lowDim,lcoor);
|
pokeLocalSite(s,lowDim,lcoor);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -575,8 +575,9 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
in_icoor[lane].resize(ndim);
|
in_icoor[lane].resize(ndim);
|
||||||
in_grid->iCoorFromIindex(in_icoor[lane], lane);
|
in_grid->iCoorFromIindex(in_icoor[lane], lane);
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
|
//loop over outer index
|
||||||
|
thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
std::vector<sobj*> out_ptrs(in_nsimd);
|
std::vector<sobj*> out_ptrs(in_nsimd);
|
||||||
|
|
||||||
@ -597,7 +598,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
//Unpack into those ptrs
|
//Unpack into those ptrs
|
||||||
const vobj & in_vobj = in._odata[in_oidx];
|
const vobj & in_vobj = in._odata[in_oidx];
|
||||||
extract1(in_vobj, out_ptrs, 0);
|
extract1(in_vobj, out_ptrs, 0);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
|
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
|
||||||
template<typename vobj, typename sobj>
|
template<typename vobj, typename sobj>
|
||||||
@ -621,7 +622,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
grid->iCoorFromIindex(icoor[lane],lane);
|
grid->iCoorFromIindex(icoor[lane],lane);
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
|
thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
std::vector<sobj*> ptrs(nsimd);
|
std::vector<sobj*> ptrs(nsimd);
|
||||||
|
|
||||||
@ -645,12 +646,13 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
vobj vecobj;
|
vobj vecobj;
|
||||||
merge1(vecobj, ptrs, 0);
|
merge1(vecobj, ptrs, 0);
|
||||||
out._odata[oidx] = vecobj;
|
out._odata[oidx] = vecobj;
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
//Convert a Lattice from one precision to another
|
//Convert a Lattice from one precision to another
|
||||||
template<class VobjOut, class VobjIn>
|
template<class VobjOut, class VobjIn>
|
||||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
||||||
|
|
||||||
assert(out._grid->Nd() == in._grid->Nd());
|
assert(out._grid->Nd() == in._grid->Nd());
|
||||||
out.checkerboard = in.checkerboard;
|
out.checkerboard = in.checkerboard;
|
||||||
GridBase *in_grid=in._grid;
|
GridBase *in_grid=in._grid;
|
||||||
@ -672,7 +674,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
|||||||
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
||||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
unvectorizeToLexOrdArray(in_slex_conv, in);
|
||||||
|
|
||||||
parallel_for(uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
|
thread_loop( (uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++),{
|
||||||
std::vector<int> out_ocoor(ndim);
|
std::vector<int> out_ocoor(ndim);
|
||||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
||||||
|
|
||||||
@ -688,7 +690,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
|||||||
ptrs[lane] = &in_slex_conv[llex];
|
ptrs[lane] = &in_slex_conv[llex];
|
||||||
}
|
}
|
||||||
merge(out._odata[out_oidx], ptrs, 0);
|
merge(out._odata[out_oidx], ptrs, 0);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -789,9 +791,9 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
|
|
||||||
for(int v=0;v<nvector;v++){
|
for(int v=0;v<nvector;v++){
|
||||||
unvectorizeToLexOrdArray(scalardata,full[v]);
|
unvectorizeToLexOrdArray(scalardata,full[v]);
|
||||||
parallel_for(int site=0;site<lsites;site++){
|
thread_loop( (int site=0;site<lsites;site++),{
|
||||||
alldata[v*lsites+site] = scalardata[site];
|
alldata[v*lsites+site] = scalardata[site];
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
int nvec = nvector; // Counts down to 1 as we collapse dims
|
int nvec = nvector; // Counts down to 1 as we collapse dims
|
||||||
@ -821,7 +823,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
|
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
|
||||||
|
|
||||||
// Loop over reordered data post A2A
|
// Loop over reordered data post A2A
|
||||||
parallel_for(int c=0;c<chunk;c++){
|
thread_loop( (int c=0;c<chunk;c++),{
|
||||||
std::vector<int> coor(ndim);
|
std::vector<int> coor(ndim);
|
||||||
for(int m=0;m<M;m++){
|
for(int m=0;m<M;m++){
|
||||||
for(int s=0;s<sP;s++){
|
for(int s=0;s<sP;s++){
|
||||||
@ -844,7 +846,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
ldims[d]*= ratio[d];
|
ldims[d]*= ratio[d];
|
||||||
lsites *= ratio[d];
|
lsites *= ratio[d];
|
||||||
|
|
||||||
@ -940,7 +942,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
|
|
||||||
{
|
{
|
||||||
// Loop over reordered data post A2A
|
// Loop over reordered data post A2A
|
||||||
parallel_for(int c=0;c<chunk;c++){
|
thread_loop( (int c=0;c<chunk;c++),{
|
||||||
std::vector<int> coor(ndim);
|
std::vector<int> coor(ndim);
|
||||||
for(int m=0;m<M;m++){
|
for(int m=0;m<M;m++){
|
||||||
for(int s=0;s<sP;s++){
|
for(int s=0;s<sP;s++){
|
||||||
@ -962,7 +964,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
tmpdata[lex_c] = alldata[lex_r];
|
tmpdata[lex_c] = alldata[lex_r];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( split_grid->_processors[d] > 1 ) {
|
if ( split_grid->_processors[d] > 1 ) {
|
||||||
@ -979,10 +981,10 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
|
|||||||
lsites = full_grid->lSites();
|
lsites = full_grid->lSites();
|
||||||
for(int v=0;v<nvector;v++){
|
for(int v=0;v<nvector;v++){
|
||||||
// assert(v<full.size());
|
// assert(v<full.size());
|
||||||
parallel_for(int site=0;site<lsites;site++){
|
thread_loop( (int site=0;site<lsites;site++),{
|
||||||
// assert(v*lsites+site < alldata.size());
|
// assert(v*lsites+site < alldata.size());
|
||||||
scalardata[site] = alldata[v*lsites+site];
|
scalardata[site] = alldata[v*lsites+site];
|
||||||
}
|
});
|
||||||
vectorizeFromLexOrdArray(scalardata,full[v]);
|
vectorizeFromLexOrdArray(scalardata,full[v]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user