mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
MultiRHS work
This commit is contained in:
parent
59abaeb5cd
commit
0a3682ad0b
@ -62,6 +62,7 @@ public:
|
|||||||
|
|
||||||
std::vector<deviceVector<calcMatrix> > _A;
|
std::vector<deviceVector<calcMatrix> > _A;
|
||||||
std::vector<CoarseVector> MultTemporaries;
|
std::vector<CoarseVector> MultTemporaries;
|
||||||
|
deviceVector<GeneralStencilEntryReordered> StencilMasked;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
@ -78,9 +79,40 @@ public:
|
|||||||
Stencil(Cell.grids.back(),geom.shifts)
|
Stencil(Cell.grids.back(),geom.shifts)
|
||||||
{
|
{
|
||||||
_A.resize(geom.npoint);
|
_A.resize(geom.npoint);
|
||||||
|
int32_t padded_sites = _Op._A[0].Grid()->lSites();
|
||||||
for(int p=0;p<geom.npoint;p++){
|
for(int p=0;p<geom.npoint;p++){
|
||||||
_A[p].resize(_CoarseGrid->lSites());
|
_A[p].resize(padded_sites);
|
||||||
}
|
}
|
||||||
|
std::cout << GridLogMessage<<"MultiGeneralCoarsenedMatrix "<<_CoarseGrid->lSites()<<" coarse sites "<<_Op._A[0].Grid()->lSites() <<std::endl;
|
||||||
|
|
||||||
|
StencilMasked.resize(_CoarseGridMulti->oSites());
|
||||||
|
std::vector<GeneralStencilEntryReordered> StencilTmp;
|
||||||
|
|
||||||
|
int32_t j=0;
|
||||||
|
int32_t sites = Stencil._entries.size()/geom.npoint;
|
||||||
|
for(int32_t s=0;s<sites;s++){
|
||||||
|
int ghost_zone=0;
|
||||||
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
||||||
|
int i=s*geom.npoint+point;
|
||||||
|
if( Stencil._entries[i]._permute ) {
|
||||||
|
ghost_zone=1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
GeneralStencilEntryReordered tmp;
|
||||||
|
if( ghost_zone==0) {
|
||||||
|
for(int32_t point = 0 ; point < geom.npoint; point++){
|
||||||
|
int i=s*geom.npoint+point;
|
||||||
|
tmp._offset = Stencil._entries[i]._offset;
|
||||||
|
tmp._permute= Stencil._entries[i]._permute;
|
||||||
|
tmp._output = j;
|
||||||
|
StencilTmp.push_back(tmp);
|
||||||
|
}
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout << "coarse osites x npoint "<<_CoarseGridMulti->oSites()*geom.npoint<< " stencil interior size "<< StencilTmp.size()<<std::endl;
|
||||||
|
assert(_CoarseGridMulti->lSites()*geom.npoint==StencilTmp.size());
|
||||||
|
acceleratorCopyToDevice(&StencilTmp[0],&StencilMasked[0],sizeof(GeneralStencilEntryReordered)*StencilTmp.size());
|
||||||
CopyMatrix();
|
CopyMatrix();
|
||||||
}
|
}
|
||||||
void CopyMatrix (void)
|
void CopyMatrix (void)
|
||||||
@ -100,12 +132,18 @@ public:
|
|||||||
}
|
}
|
||||||
void M (const CoarseVector &in, CoarseVector &out)
|
void M (const CoarseVector &in, CoarseVector &out)
|
||||||
{
|
{
|
||||||
|
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
|
||||||
|
RealD tmult2=0;
|
||||||
|
|
||||||
|
ttot=-usecond();
|
||||||
conformable(CoarseGrid(),in.Grid());
|
conformable(CoarseGrid(),in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
CoarseVector tin=in;
|
CoarseVector tin=in;
|
||||||
|
|
||||||
|
texch-=usecond();
|
||||||
CoarseVector pin = Cell.ExchangePeriodic(tin);
|
CoarseVector pin = Cell.ExchangePeriodic(tin);
|
||||||
|
texch+=usecond();
|
||||||
CoarseVector pout(pin.Grid());
|
CoarseVector pout(pin.Grid());
|
||||||
|
|
||||||
int npoint = geom.npoint;
|
int npoint = geom.npoint;
|
||||||
@ -116,22 +154,33 @@ public:
|
|||||||
|
|
||||||
int64_t osites=pin.Grid()->oSites();
|
int64_t osites=pin.Grid()->oSites();
|
||||||
int64_t nrhs =pin.Grid()->GlobalDimensions()[0]/Nsimd;
|
int64_t nrhs =pin.Grid()->GlobalDimensions()[0]/Nsimd;
|
||||||
|
assert(nrhs>=1);
|
||||||
|
|
||||||
|
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
|
||||||
|
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
|
||||||
|
+ 2.0*osites*sizeof(siteVector)*npoint;
|
||||||
|
|
||||||
{
|
{
|
||||||
|
tviews-=usecond();
|
||||||
autoView( in_v , pin, AcceleratorRead);
|
autoView( in_v , pin, AcceleratorRead);
|
||||||
autoView( out_v , pout, AcceleratorWriteDiscard);
|
autoView( out_v , pout, AcceleratorWriteDiscard);
|
||||||
autoView( Stencil_v , Stencil, AcceleratorRead);
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
tviews+=usecond();
|
||||||
|
|
||||||
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
// Static and prereserve to keep UVM region live and not resized across multiple calls
|
||||||
|
ttemps-=usecond();
|
||||||
MultTemporaries.resize(npoint,pin.Grid());
|
MultTemporaries.resize(npoint,pin.Grid());
|
||||||
|
ttemps+=usecond();
|
||||||
|
|
||||||
std::vector<Aview> AcceleratorViewContainer_h;
|
std::vector<Aview> AcceleratorViewContainer_h;
|
||||||
std::vector<Vview> AcceleratorVecViewContainer_h;
|
std::vector<Vview> AcceleratorVecViewContainer_h;
|
||||||
|
|
||||||
|
tviews-=usecond();
|
||||||
for(int p=0;p<npoint;p++) {
|
for(int p=0;p<npoint;p++) {
|
||||||
AcceleratorViewContainer_h.push_back( &_A[p][0]);
|
AcceleratorViewContainer_h.push_back( &_A[p][0]);
|
||||||
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
|
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
|
||||||
}
|
}
|
||||||
|
tviews+=usecond();
|
||||||
|
|
||||||
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
|
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
|
||||||
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
|
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
|
||||||
@ -139,15 +188,23 @@ public:
|
|||||||
auto Aview_p = &AcceleratorViewContainer[0];
|
auto Aview_p = &AcceleratorViewContainer[0];
|
||||||
auto Vview_p = &AcceleratorVecViewContainer[0];
|
auto Vview_p = &AcceleratorVecViewContainer[0];
|
||||||
|
|
||||||
|
tcopy-=usecond();
|
||||||
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
|
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
|
||||||
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
|
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
|
||||||
|
tcopy+=usecond();
|
||||||
|
|
||||||
|
int32_t bound = _A[0].size();
|
||||||
|
std::cout << " osites "<<osites <<" bound "<<bound<<std::endl;
|
||||||
|
std::cout << " padded local dims "<<pin.Grid()->LocalDimensions()<<std::endl;
|
||||||
|
std::cout << " unpadded local dims "<<in.Grid()->LocalDimensions()<<std::endl;
|
||||||
|
tmult-=usecond();
|
||||||
accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
|
accelerator_for(rspb, osites*nbasis*npoint, Nsimd, {
|
||||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
int32_t ss = rspb/(nbasis*npoint);
|
int32_t ss = rspb/(nbasis*npoint);
|
||||||
int32_t bp = rspb%(nbasis*npoint);
|
int32_t bp = rspb%(nbasis*npoint);
|
||||||
int32_t point= bp/nbasis;
|
int32_t point= bp/nbasis;
|
||||||
int32_t b = bp%nbasis;
|
int32_t b = bp%nbasis;
|
||||||
|
assert(ss<bound);
|
||||||
auto SE = Stencil_v.GetEntry(point,ss);
|
auto SE = Stencil_v.GetEntry(point,ss);
|
||||||
if ( SE->_permute == 0 ) {
|
if ( SE->_permute == 0 ) {
|
||||||
int32_t snbr= SE->_offset;
|
int32_t snbr= SE->_offset;
|
||||||
@ -159,6 +216,7 @@ public:
|
|||||||
coalescedWrite(Vview_p[point][ss](b),res);
|
coalescedWrite(Vview_p[point][ss](b),res);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
tmult2-=usecond();
|
||||||
accelerator_for(sb, osites*nbasis, Nsimd, {
|
accelerator_for(sb, osites*nbasis, Nsimd, {
|
||||||
int ss = sb/nbasis;
|
int ss = sb/nbasis;
|
||||||
int b = sb%nbasis;
|
int b = sb%nbasis;
|
||||||
@ -168,12 +226,31 @@ public:
|
|||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
|
tmult2+=usecond();
|
||||||
|
tmult+=usecond();
|
||||||
for(int p=0;p<npoint;p++) {
|
for(int p=0;p<npoint;p++) {
|
||||||
AcceleratorVecViewContainer_h[p].ViewClose();
|
AcceleratorVecViewContainer_h[p].ViewClose();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
text-=usecond();
|
||||||
out = Cell.Extract(pout);
|
out = Cell.Extract(pout);
|
||||||
|
text+=usecond();
|
||||||
|
ttot+=usecond();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<" of which mult2 "<<tmult2<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult ext "<<text<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
|
||||||
|
// std::cout << GridLogMessage<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
|
||||||
|
std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
|
||||||
|
|
||||||
};
|
};
|
||||||
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
|
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
|
||||||
|
@ -745,8 +745,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
|
||||||
|
|
||||||
GridBase *Fg = From.Grid();
|
GridBase *Fg = From.Grid();
|
||||||
GridBase *Tg = To.Grid();
|
GridBase *Tg = To.Grid();
|
||||||
assert(!Fg->_isCheckerBoarded);
|
assert(!Fg->_isCheckerBoarded);
|
||||||
@ -763,13 +761,14 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
|
|
||||||
size_t nsite = 1;
|
size_t nsite = 1;
|
||||||
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
|
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
|
||||||
|
|
||||||
size_t tbytes = 4*nsite*sizeof(int);
|
size_t tbytes = 4*nsite*sizeof(int);
|
||||||
int *table = (int*)malloc(tbytes);
|
int *table = (int*)malloc(tbytes);
|
||||||
|
|
||||||
|
RealD t_cpu=-usecond();
|
||||||
|
#if 0
|
||||||
thread_for(idx, nsite, {
|
thread_for(idx, nsite, {
|
||||||
Coordinate from_coor, to_coor;
|
Coordinate from_coor, to_coor;
|
||||||
size_t rem = idx;
|
size_t rem = idx;
|
||||||
@ -792,15 +791,44 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
|
|
||||||
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||||
acceleratorCopyToDevice(table,table_d,tbytes);
|
acceleratorCopyToDevice(table,table_d,tbytes);
|
||||||
|
#else
|
||||||
|
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||||
|
Coordinate f_ostride = Fg->_ostride;
|
||||||
|
Coordinate f_istride = Fg->_istride;
|
||||||
|
Coordinate f_rdimensions = Fg->_rdimensions;
|
||||||
|
Coordinate t_ostride = Tg->_ostride;
|
||||||
|
Coordinate t_istride = Tg->_istride;
|
||||||
|
Coordinate t_rdimensions = Tg->_rdimensions;
|
||||||
|
|
||||||
|
accelerator_for(idx, nsite, 1, {
|
||||||
|
Coordinate from_coor, to_coor;
|
||||||
|
size_t rem = idx;
|
||||||
|
for(int i=0;i<nd;i++){
|
||||||
|
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
|
||||||
|
from_coor[i] = base_i + FromLowerLeft[i];
|
||||||
|
to_coor[i] = base_i + ToLowerLeft[i];
|
||||||
|
}
|
||||||
|
int foidx = 0; for(int d=0;d<nd;d++) foidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
|
||||||
|
int fiidx = 0; for(int d=0;d<nd;d++) fiidx+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
|
||||||
|
int toidx = 0; for(int d=0;d<nd;d++) toidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
|
||||||
|
int tiidx = 0; for(int d=0;d<nd;d++) tiidx+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
|
||||||
|
int* tt = table_d + 4*idx;
|
||||||
|
tt[0] = foidx;
|
||||||
|
tt[1] = fiidx;
|
||||||
|
tt[2] = toidx;
|
||||||
|
tt[3] = tiidx;
|
||||||
|
});
|
||||||
|
#endif
|
||||||
|
t_cpu+=usecond();
|
||||||
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
autoView(from_v,From,AcceleratorRead);
|
autoView(from_v,From,AcceleratorRead);
|
||||||
autoView(to_v,To,AcceleratorWrite);
|
autoView(to_v,To,AcceleratorWrite);
|
||||||
|
RealD t_acc=-usecond();
|
||||||
accelerator_for(idx,nsite,1,{
|
const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
accelerator_for(idx,nsite,words,{
|
||||||
int* tt = table_d + 4*idx;
|
int* tt = table_d + 4*idx;
|
||||||
int from_oidx = *tt++;
|
int from_oidx = *tt++;
|
||||||
int from_lane = *tt++;
|
int from_lane = *tt++;
|
||||||
@ -811,12 +839,20 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
vector_type* to = (vector_type *)&to_v[to_oidx];
|
vector_type* to = (vector_type *)&to_v[to_oidx];
|
||||||
|
|
||||||
scalar_type stmp;
|
scalar_type stmp;
|
||||||
|
#ifdef GRID_SIMT
|
||||||
|
int w = acceleratorSIMTlane(words);
|
||||||
|
stmp = getlane(from[w], from_lane);
|
||||||
|
putlane(to[w], stmp, to_lane);
|
||||||
|
#else
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
stmp = getlane(from[w], from_lane);
|
stmp = getlane(from[w], from_lane);
|
||||||
putlane(to[w], stmp, to_lane);
|
putlane(to[w], stmp, to_lane);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
});
|
});
|
||||||
|
t_acc+=usecond();
|
||||||
|
std::cout << " localCopyRegion cpu " <<t_cpu/1000<<" ms"<<std::endl;
|
||||||
|
std::cout << " localCopyRegion acc " <<t_acc/1000<<" ms"<<std::endl;
|
||||||
acceleratorFreeDevice(table_d);
|
acceleratorFreeDevice(table_d);
|
||||||
free(table);
|
free(table);
|
||||||
|
|
||||||
|
@ -403,18 +403,8 @@ public:
|
|||||||
double t = usecond();
|
double t = usecond();
|
||||||
padded = in;
|
padded = in;
|
||||||
tins += usecond() - t;
|
tins += usecond() - t;
|
||||||
|
// return in; ?
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
|
||||||
// Replace sequence with
|
|
||||||
// ---------------------
|
|
||||||
// (i) Gather high face(s); start comms
|
|
||||||
// (ii) Gather low face(s); start comms
|
|
||||||
// (iii) Copy middle bit with localCopyRegion
|
|
||||||
// (iv) Complete high face(s), insert slice(s)
|
|
||||||
// (iv) Complete low face(s), insert slice(s)
|
|
||||||
//////////////////////////////////////////////
|
|
||||||
Face_exchange(in,padded,dim,depth);
|
Face_exchange(in,padded,dim,depth);
|
||||||
}
|
}
|
||||||
return padded;
|
return padded;
|
||||||
@ -482,6 +472,7 @@ public:
|
|||||||
// Gather all surface terms up to depth "d"
|
// Gather all surface terms up to depth "d"
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
RealD t;
|
RealD t;
|
||||||
|
RealD t_tot=-usecond();
|
||||||
int plane=0;
|
int plane=0;
|
||||||
for ( int d=0;d < depth ; d ++ ) {
|
for ( int d=0;d < depth ; d ++ ) {
|
||||||
int tag = d*1024 + dimension*2+0;
|
int tag = d*1024 + dimension*2+0;
|
||||||
@ -549,6 +540,7 @@ public:
|
|||||||
}
|
}
|
||||||
t_scatter+= usecond() - t;
|
t_scatter+= usecond() - t;
|
||||||
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
|
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
|
||||||
|
t_tot+=usecond();
|
||||||
|
|
||||||
//DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
|
//DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
|
||||||
@ -557,6 +549,7 @@ public:
|
|||||||
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
|
||||||
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -90,7 +90,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
|
|||||||
|
|
||||||
for (int i = 0; i < logstreams.size(); i++) {
|
for (int i = 0; i < logstreams.size(); i++) {
|
||||||
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
|
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
|
||||||
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
|
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(0);
|
||||||
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
|
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
|
||||||
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
|
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
|
||||||
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
|
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
|
||||||
|
@ -33,6 +33,10 @@ struct GeneralStencilEntry {
|
|||||||
uint64_t _offset; // 4 bytes
|
uint64_t _offset; // 4 bytes
|
||||||
uint8_t _permute; // 1 bytes // Horrible alignment properties
|
uint8_t _permute; // 1 bytes // Horrible alignment properties
|
||||||
};
|
};
|
||||||
|
struct GeneralStencilEntryReordered : public GeneralStencilEntry {
|
||||||
|
uint64_t _output;
|
||||||
|
};
|
||||||
|
|
||||||
// Could pack to 8 + 4 + 4 = 128 bit and use
|
// Could pack to 8 + 4 + 4 = 128 bit and use
|
||||||
|
|
||||||
class GeneralLocalStencilView {
|
class GeneralLocalStencilView {
|
||||||
|
@ -6,6 +6,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
|||||||
--enable-tracing=timer \
|
--enable-tracing=timer \
|
||||||
--enable-accelerator=hip \
|
--enable-accelerator=hip \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
|
--enable-tracing=roctx \
|
||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
|
@ -78,7 +78,7 @@ int main (int argc, char ** argv)
|
|||||||
// Construct a coarsened grid
|
// Construct a coarsened grid
|
||||||
Coordinate clatt = GridDefaultLatt();
|
Coordinate clatt = GridDefaultLatt();
|
||||||
for(int d=0;d<clatt.size();d++){
|
for(int d=0;d<clatt.size();d++){
|
||||||
clatt[d] = clatt[d]/2;
|
clatt[d] = clatt[d]/4;
|
||||||
}
|
}
|
||||||
|
|
||||||
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
|
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
|
||||||
@ -107,7 +107,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||||
|
|
||||||
const int nbasis = 16;
|
const int nbasis = 32;
|
||||||
const int cb = 0 ;
|
const int cb = 0 ;
|
||||||
LatticeFermion prom(FGrid);
|
LatticeFermion prom(FGrid);
|
||||||
|
|
||||||
@ -265,8 +265,8 @@ int main (int argc, char ** argv)
|
|||||||
LittleDiracOp.M(phi,Aphi);
|
LittleDiracOp.M(phi,Aphi);
|
||||||
}
|
}
|
||||||
t1+=usecond();
|
t1+=usecond();
|
||||||
std::cout << r << " mrhs " << norm2(chi)<<std::endl;
|
std::cout << " mrhs [" <<r <<"] "<< norm2(chi)<<std::endl;
|
||||||
std::cout << r << " srhs " << norm2(Aphi)<<std::endl;
|
std::cout << " srhs [" <<r <<"] "<< norm2(Aphi)<<std::endl;
|
||||||
chi=chi-Aphi;
|
chi=chi-Aphi;
|
||||||
std::cout << r << " diff " << norm2(chi)<<std::endl;
|
std::cout << r << " diff " << norm2(chi)<<std::endl;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user