1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Red black friendly coarsening

This commit is contained in:
paboyle 2017-10-25 23:51:18 +01:00
parent b395a312af
commit 08583afaff

View File

@ -109,8 +109,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
coarseData=zero; coarseData=zero;
// Loop with a cache friendly loop ordering // Loop over coars parallel, and then loop over fine associated with coarse.
for(int sf=0;sf<fine->oSites();sf++){ parallel_for(int sf=0;sf<fine->oSites();sf++){
int sc; int sc;
std::vector<int> coor_c(_ndimension); std::vector<int> coor_c(_ndimension);
@ -119,8 +119,9 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
coarseData._odata[sc](i)=coarseData._odata[sc](i) coarseData._odata[sc](i)=coarseData._odata[sc](i)
+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]); + innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
@ -139,6 +140,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
GridBase * coarse= coarseA._grid; GridBase * coarse= coarseA._grid;
fineZ.checkerboard=fineX.checkerboard; fineZ.checkerboard=fineX.checkerboard;
assert(fineX.checkerboard==fineY.checkerboard);
subdivides(coarse,fine); // require they map subdivides(coarse,fine); // require they map
conformable(fineX,fineY); conformable(fineX,fineY);
conformable(fineX,fineZ); conformable(fineX,fineZ);
@ -180,9 +182,10 @@ template<class vobj,class CComplex>
GridBase *coarse(CoarseInner._grid); GridBase *coarse(CoarseInner._grid);
GridBase *fine (fineX._grid); GridBase *fine (fineX._grid);
Lattice<dotp> fine_inner(fine); Lattice<dotp> fine_inner(fine); fine_inner.checkerboard = fineX.checkerboard;
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion?
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
parallel_for(int ss=0;ss<coarse->oSites();ss++){ parallel_for(int ss=0;ss<coarse->oSites();ss++){
@ -193,7 +196,7 @@ template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX) inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
{ {
GridBase *coarse = ip._grid; GridBase *coarse = ip._grid;
Lattice<vobj> zz(fineX._grid); zz=zero; Lattice<vobj> zz(fineX._grid); zz=zero; zz.checkerboard=fineX.checkerboard;
blockInnerProduct(ip,fineX,fineX); blockInnerProduct(ip,fineX,fineX);
ip = pow(ip,-0.5); ip = pow(ip,-0.5);
blockZAXPY(fineX,ip,fineX,zz); blockZAXPY(fineX,ip,fineX,zz);
@ -216,19 +219,25 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
// Turn this around to loop threaded over sc and interior loop
// over sf would thread better
coarseData=zero; coarseData=zero;
for(int sf=0;sf<fine->oSites();sf++){ parallel_region {
int sc; int sc;
std::vector<int> coor_c(_ndimension); std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); std::vector<int> coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf]; Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
}
} }
return; return;
} }
@ -238,7 +247,7 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
{ {
GridBase * fine = unpicked._grid; GridBase * fine = unpicked._grid;
Lattice<vobj> zz(fine); Lattice<vobj> zz(fine); zz.checkerboard = unpicked.checkerboard;
Lattice<iScalar<vInteger> > fcoor(fine); Lattice<iScalar<vInteger> > fcoor(fine);
zz = zero; zz = zero;
@ -303,20 +312,21 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
for(int sf=0;sf<fine->oSites();sf++){ parallel_region {
int sc; int sc;
std::vector<int> coor_c(_ndimension); std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); std::vector<int> coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) {
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) {
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
}
} }
} }
return; return;