1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Revert to working.

This commit is contained in:
Peter Boyle 2024-01-17 16:32:05 -05:00
parent d967eb53de
commit ca5ae8a2e6

View File

@ -300,38 +300,6 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
// std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl; // std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl; // std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
} }
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
Lattice<iScalar<CComplex>> ip(coarse);
Lattice<vobj> fineDataRed = fineData;
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineData); // ip = <basis|fine>
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
t_co+=usecond();
}
// std::cout << GridLogPerformance << " blockProjectFast : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProjectFast : conv : "<<t_co<<" us"<<std::endl;
}
// This only minimises data motion from CPU to GPU // This only minimises data motion from CPU to GPU
// there is chance of better implementation that does a vxk loop of inner products to data share // there is chance of better implementation that does a vxk loop of inner products to data share
// at the GPU thread level // at the GPU thread level
@ -776,9 +744,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
//////////////////////////////////////////////////////////////////////////////////////////////// static const int words=sizeof(vobj)/sizeof(vector_type);
// the checks should guarantee that the operations are local
////////////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid(); GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid(); GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded); assert(!Fg->_isCheckerBoarded);
@ -792,18 +759,38 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]); assert(Fg->_processors[d] == Tg->_processors[d]);
} }
// the above should guarantee that the operations are local
#if 1
size_t nsite = 1; size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i]; for(int i=0;i<nd;i++) nsite *= RegionSize[i];
//////////////////////////////////////////////////////////////////////////////////////////////// size_t tbytes = 4*nsite*sizeof(int);
// do the index calc on the GPU int *table = (int*)malloc(tbytes);
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride; thread_for(idx, nsite, {
Coordinate f_istride = Fg->_istride; Coordinate from_coor, to_coor;
Coordinate f_rdimensions = Fg->_rdimensions; size_t rem = idx;
Coordinate t_ostride = Tg->_ostride; for(int i=0;i<nd;i++){
Coordinate t_istride = Tg->_istride; size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
Coordinate t_rdimensions = Tg->_rdimensions; from_coor[i] = base_i + FromLowerLeft[i];
to_coor[i] = base_i + ToLowerLeft[i];
}
int foidx = Fg->oIndex(from_coor);
int fiidx = Fg->iIndex(from_coor);
int toidx = Tg->oIndex(to_coor);
int tiidx = Tg->iIndex(to_coor);
int* tt = table + 4*idx;
tt[0] = foidx;
tt[1] = fiidx;
tt[2] = toidx;
tt[3] = tiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
@ -811,19 +798,13 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
autoView(from_v,From,AcceleratorRead); autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite); autoView(to_v,To,AcceleratorWrite);
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{ accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
Coordinate from_coor, to_coor, base; int* tt = table_d + 4*idx;
Lexicographic::CoorFromIndex(base,idx,RegionSize); int from_oidx = *tt++;
for(int i=0;i<nd;i++){ int from_lane = *tt++;
from_coor[i] = base[i] + FromLowerLeft[i]; int to_oidx = *tt++;
to_coor[i] = base[i] + ToLowerLeft[i]; int to_lane = *tt;
}
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx]; const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx]; vector_type* to = (vector_type *)&to_v[to_oidx];
@ -834,6 +815,53 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
putlane(to[w], stmp, to_lane); putlane(to[w], stmp, to_lane);
} }
}); });
acceleratorFreeDevice(table_d);
free(table);
#else
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
Coordinate osf = Fg->_ostride;
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
autoView( t_v , To, CpuWrite);
autoView( f_v , From, CpuRead);
thread_for(idx,Fg->lSites(),{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
int in_region=1;
for(int d=0;d<nd;d++){
if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
in_region=0;
}
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
#if 0
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){
tp[w].putlane(fp[w].getlane(idx_f),idx_t);
}
#else
peekLocalSite(s,f_v,Fcoor);
pokeLocalSite(s,t_v,Tcoor);
#endif
}
});
#endif
} }
@ -925,7 +953,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
//FIXME: make this run entirely on GPU
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim //Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same //The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
template<class vobj> template<class vobj>
@ -1774,32 +1802,5 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
} }
} }
//////////////////////////////////////////////////////
// MultiRHS interface support for coarse space
// -- Simplest possible implementation to begin with
//////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProjectMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,
Lattice<iVector<CComplex,nbasis > > &coarseTMP,
const VLattice &fineData, // Basis and fineData necessarily same type
const VLattice &Basis)
{
for(int r=0;r<fineData.size();r++){
blockProject(coarseTMP,fineData[r],Basis);
InsertSliceLocal(coarseTMP, coarseIP,r,r,0);
}
}
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromoteMany(Lattice<iVector<CComplex,nbasis > > &coarseIP,
Lattice<iVector<CComplex,nbasis > > &coarseTMP,
const VLattice &fineData, // Basis and fineData necessarily same type
const VLattice &Basis)
{
for(int r=0;r<fineData.size();r++){
ExtractSliceLocal(coarseTMP, coarseIP,r,r,0);
blockPromote(coarseTMP,fineData[r],Basis);
}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);