Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00)

Compare commits


10 Commits

Author | SHA1 | Message | Date
Peter Boyle | 3bc2da5321 | Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1 | 2023-10-05 16:57:59 -04:00
Peter Boyle | 2d710d6bfd | Optimised parameters for 16^3 | 2023-10-05 16:56:55 -04:00
Peter Boyle | 6532b7f32b | Eliminate older inefficient coarsening implementation | 2023-10-05 16:56:15 -04:00
Peter Boyle | 7b41b92d99 | Only need to bad non-local dimensions | 2023-10-05 16:55:48 -04:00
Peter Boyle | dd557af84b | ADEF1 and ADEF2 2 level CG | 2023-10-05 16:55:19 -04:00
Peter Boyle | 59b9d0e030 | coalesceRead the blockSum | 2023-10-05 16:54:48 -04:00
Peter Boyle | b82eee4733 | Hermitian dealing with | 2023-10-05 16:54:14 -04:00
Peter Boyle | 6a87487544 | Running on Frontier, fix RNG big volume y2k, affecting 5D RNG | 2023-10-05 16:50:59 -04:00
Peter Boyle | fcf5023845 | Running on Frontier | 2023-10-05 16:50:59 -04:00
Peter Boyle | c8adad6d8b | First runs on Summit. PopulateAdag needs work | 2023-10-05 16:50:54 -04:00
15 changed files with 432 additions and 183 deletions

View File

@ -226,6 +226,7 @@ public:
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << " ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
@ -233,12 +234,12 @@ public:
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
_Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
ExchangeCoarseLinks();
nfound++;
}
}
}
assert(nfound==geom.npoint);
ExchangeCoarseLinks();
}
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
@ -271,7 +272,8 @@ public:
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
Mult(_Adag,in,out);
if ( hermitian ) M(in,out);
else Mult(_Adag,in,out);
}
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
@ -298,10 +300,10 @@ public:
const int Nsimd = CComplex::Nsimd();
int osites=pin.Grid()->oSites();
int gsites=pin.Grid()->gSites();
// int gsites=pin.Grid()->gSites();
RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites;
RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint;
RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
// for(int point=0;point<npoint;point++){
// conformable(A[point],pin);
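For orientation, a rough per-site operation count for the stencil application timed here; the nine-point stencil is purely illustrative and nbasis = 40 is taken from the test program later in this diff:

\[
\text{flops per coarse site} \;\approx\; 8 \times \text{npoint} \times \text{nbasis}^2 ,
\]

each complex multiply-add costing 8 real flops, so with npoint = 9 and nbasis = 40 this is 8*9*1600 = 115,200 flops per site. The corrected counter above multiplies this per-site count by the local osites rather than the global gsites.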
@ -343,20 +345,20 @@ public:
text+=usecond();
ttot+=usecond();
std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse bytes "<< bytes/1e6<<" MB"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse bytes "<< bytes/1e6<<" MB"<<std::endl;
};
void PopulateAdag(void)
{
for(int bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
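The switch of the loop index to int64_t (and of gSites()/GlobalIndexToGlobalCoor below) matters once the global 5D volume exceeds the 32-bit range. A minimal standalone illustration, with a hypothetical volume chosen only for the arithmetic:

// Illustrative only: why a 32-bit global site index overflows on large 5D RNG grids.
#include <cstdint>
#include <climits>
#include <cstdio>

int main() {
  // Hypothetical lattice: 96^3 x 192 with Ls = 16 (not a volume quoted in this diff).
  int64_t L = 96, T = 192, Ls = 16;
  int64_t gsites = L * L * L * T * Ls;   // 2,717,908,992 global sites
  std::printf("gSites  = %lld\n", (long long)gsites);
  std::printf("INT_MAX = %d  -> a plain int index would wrap negative\n", INT_MAX);
  return 0;
}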
@ -412,10 +414,10 @@ public:
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
void CoarsenOperatorColoured(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "CoarsenMatrixColoured "<< std::endl;
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
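Reading the surviving comment fragment above, the probing phases used by this coarsening appear to be, per shift point k and direction mu (a hedged reconstruction from the comment, not verified against the full source):

\[
q_{k,\mu} \;=\; \delta_{k,\mu}\,\frac{2\pi}{\text{global\_nb}[\mu]} ,
\]

i.e. one plane wave per momentum point applied to the subspace vectors, from which the npoint stencil coefficients are presumably recovered by inverting the phase matrix \(e^{\,i\,q_k\cdot\delta_p}\).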
@ -540,9 +542,14 @@ public:
auto sval = peekSite(_A[p],coor);
}
PopulateAdag();
// Only needed if nonhermitian
if ( ! hermitian ) {
std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
PopulateAdag();
}
// Need to write something to populate Adag from A
std::cout << GridLogMessage<<"ExchangeCoarseLinks "<<std::endl;
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
@ -552,6 +559,7 @@ public:
}
void ExchangeCoarseLinks(void){
for(int p=0;p<geom.npoint;p++){
std::cout << "Exchange "<<p<<std::endl;
_A[p] = Cell.Exchange(_A[p]);
_Adag[p]= Cell.Exchange(_Adag[p]);
}

View File

@ -42,46 +42,30 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*/
NAMESPACE_BEGIN(Grid);
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public LinearFunction<Field>
template<class Field>
class TwoLevelCG : public LinearFunction<Field>
{
public:
RealD Tolerance;
Integer MaxIterations;
const int mmax = 1;
GridBase *grid;
GridBase *coarsegrid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
Aggregation &_Aggregates;
// more most operator functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
_Smoother(Smoother)
{
coarsegrid = Aggregates.CoarseGrid;
grid = Aggregates.FineGrid;
grid = fine;
};
virtual void operator() (const Field &src, Field &psi)
@ -195,47 +179,8 @@ class TwoLevelADEF2 : public LinearFunction<Field>
public:
virtual void PcgM1(Field & in, Field & out)
{
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
CoarseField PleftProj(coarsegrid);
CoarseField PleftMss_proj(coarsegrid);
GridStopWatch SmootherTimer;
GridStopWatch MatrixTimer;
SmootherTimer.Start();
_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer;
GridStopWatch CoarseTimer;
GridStopWatch PromTimer;
ProjTimer.Start();
_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogMessage << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogMessage << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp
}
virtual void PcgM1(Field & in, Field & out) =0;
virtual void Vstart(Field & x,const Field & src)=0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
@ -249,6 +194,89 @@ class TwoLevelADEF2 : public LinearFunction<Field>
return dd;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
}
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// more most operator functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(this->grid);
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
GridStopWatch SmootherTimer;
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer;
GridStopWatch CoarseTimer;
GridStopWatch PromTimer;
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp
}
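In the notation of the comment at the top of PcgM1, the ADEF2 preconditioner being timed here can be summarised as follows (a restatement of the in-code comments, not an independent derivation):

\[
M_1\,r \;=\; \underbrace{M\,r}_{\text{Min}} \;+\; Q\,\big(r - A\,M\,r\big),
\qquad
Q \;=\; P\,A_{ss}^{-1}\,P^{\dagger},
\]

where \(M\) is the smoother, \(A\) the fine Hermitian operator, \(P^{\dagger}\)/\(P\) the ProjectToSubspace/PromoteFromSubspace maps, and \(A_{ss}^{-1}\) the coarse solve supplied by _CoarseSolver.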
virtual void Vstart(Field & x,const Field & src)
{
///////////////////////////////////
@ -262,65 +290,69 @@ class TwoLevelADEF2 : public LinearFunction<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
Field r(grid);
Field mmp(grid);
CoarseField PleftProj(coarsegrid);
CoarseField PleftMss_proj(coarsegrid);
Field r(this->grid);
Field mmp(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
_Aggregates.ProjectToSubspace(PleftProj,src);
_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,x);
this->_Aggregates.ProjectToSubspace(PleftProj,src);
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);
}
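The comment above sketches why this start vector is consistent; spelled out in the same notation (assuming the coarse operator is exactly \(A_{ss}=P^{\dagger}AP\) and the coarse solve is exact):

\[
x_0 = P\,A_{ss}^{-1}P^{\dagger}\,\text{src}
\;\;\Rightarrow\;\;
P^{\dagger}\big(\text{src} - A\,x_0\big)
= P^{\dagger}\text{src} - \big(P^{\dagger} A P\big)\,A_{ss}^{-1}P^{\dagger}\text{src} = 0 .
\]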
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
}
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF1ev : public TwoLevelADEF2<Field,CoarseField,Aggregation>
template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field>
{
TwoLevelADEF1ev(RealD tol,
public:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates ) :
TwoLevelADEF2<Field,CoarseField,Aggregation>(tol,maxit,FineLinop,Smoother,CoarseSolver,CoarseSolverPrecise,Aggregates)
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing Vstart
// Can just inherit existing Vout
// Can just inherit existing M2
// Can just inherit existing M3
// Simple vstart - do nothing
virtual void Vstart(Field & x,const Field & src){};
// Override PcgM1
virtual void PcgM1(Field & in, Field & out)
{
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
Field tmp(this->grid);
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M
// // If we are eigenvector deflating in coarse space
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i|
// // A Q = Sum_i |phi_i> <phi_i|
// // M(1-AQ) = M(1-proj) + Q
this->_Aggregates.ProjectToSubspace(PleftProj,in);
this->_Aggregates.PromoteFromSubspace(PleftProj,tmp);// tmp = Qin
Qin.Checkerboard()=in.Checkerboard();
Qin = Zero();
Pin = in;
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
auto ip = TensorRemove(innerProduct(tmp,in));
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
}
Pin = in - tmp;
this->_Smoother(Pin,out);
this->_CoarseSolver(PleftProj,PleftMss_proj);
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Qin
out = out + tmp;
out = out + Qin;
}
};
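The deflated one-level preconditioner implemented in PcgM1 above can be written as follows, following the in-code comments, with \(\phi_i,\lambda_i\) the supplied evec/eval pairs:

\[
Q\,r=\sum_{i}\frac{\langle\phi_i,r\rangle}{\lambda_i}\,\phi_i,\qquad
P\,r=r-\sum_{i}\langle\phi_i,r\rangle\,\phi_i,\qquad
M_1\,r = M\,(P\,r)+Q\,r ,
\]

so that for exact eigenpairs \(A\,Q\,r=\sum_i\langle\phi_i,r\rangle\,\phi_i\) and \(M_1 = M(1-AQ)+Q\), as stated in the comment.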

View File

@ -70,8 +70,8 @@ public:
Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
int64_t _fsites; // _isites*_osites = product(dimensions).
int64_t _gsites;
Coordinate _slice_block;// subslice information
Coordinate _slice_stride;
Coordinate _slice_nblock;
@ -183,7 +183,7 @@ public:
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
@ -214,7 +214,7 @@ public:
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
@ -222,7 +222,7 @@ public:
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {

View File

@ -360,7 +360,7 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){
for(int64_t g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);

View File

@ -432,7 +432,7 @@ public:
#if 1
thread_for( lidx, _grid->lSites(), {
int gidx;
int64_t gidx;
int o_idx;
int i_idx;
int rank;

View File

@ -471,13 +471,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
vobj cd = zz;
auto cd = coalescedRead(zz);
for(int sb=0;sb<blockVol;sb++){
@ -488,10 +488,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+fineData_p[sf];
cd=cd+coalescedRead(fineData_p[sf]);
}
coarseData_p[sc] = cd;
coalescedWrite(coarseData_p[sc],cd);
});
return;
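For reference, the kernel above computes the same per-block reduction before and after the coalescedRead change; in coordinates:

\[
\text{coarseData}(y) \;=\; \sum_{b\,\in\,\text{block}} \text{fineData}(x),
\qquad
x_d = y_d\cdot \text{block\_r}_d + b_d ,
\]

and the rewrite is intended only to change how each SIMD lane accumulates (coalescedRead into a per-lane cd, coalescedWrite back), not the result.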
@ -1054,7 +1054,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
Coordinate fcoor(nd);
Coordinate ccoor(nd);
for(int g=0;g<fg->gSites();g++){
for(int64_t g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor);
for(int d=0;d<nd;d++){

View File

@ -86,8 +86,10 @@ public:
// expand up one dim at a time
for(int d=0;d<dims;d++){
plocal[d] += 2*depth;
if ( processors[d] > 1 ) {
plocal[d] += 2*depth;
}
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
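A minimal standalone sketch of the padding rule introduced here (pad a dimension only when it is split across ranks); the depth and decomposition values are made up for illustration:

// Illustrative only: PaddedCell-style local volume, padding non-local dimensions.
#include <array>
#include <cstdio>

int main() {
  const int dims = 4, depth = 1;
  std::array<int,4> plocal     = {16, 16, 16, 16}; // local lattice per rank (example)
  std::array<int,4> processors = { 2,  2,  1,  1}; // MPI decomposition (example)
  std::array<int,4> global{};

  for (int d = 0; d < dims; d++) {
    if (processors[d] > 1) plocal[d] += 2 * depth; // halo only where neighbours are remote
  }
  for (int d = 0; d < dims; d++) {
    global[d] = plocal[d] * processors[d];
  }
  for (int d = 0; d < dims; d++) {
    std::printf("dim %d: padded local %d, padded global %d\n", d, plocal[d], global[d]);
  }
  return 0;
}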
@ -98,11 +100,17 @@ public:
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate fll(dims,depth); // depends on the MPI spread
// depends on the MPI spread
Coordinate fll(dims,depth);
Coordinate tll(dims,0); // depends on the MPI spread
for(int d=0;d<dims;d++){
if( processors[d]==1 ) fll[d]=0;
}
localCopyRegion(in,out,fll,tll,local);
return out;
}
@ -121,6 +129,7 @@ public:
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
@ -133,36 +142,48 @@ public:
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,x,dim);
}
tins += usecond() - t;
} else {
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
}
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;

View File

@ -8,7 +8,7 @@ namespace Grid{
public:
template<class coor_t>
static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){
static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){
int nd= dims.size();
coor.resize(nd);
for(int d=0;d<nd;d++){
@ -18,28 +18,45 @@ namespace Grid{
}
template<class coor_t>
static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){
int nd=dims.size();
int stride=1;
index=0;
for(int d=0;d<nd;d++){
index = index+stride*coor[d];
index = index+(int64_t)stride*coor[d];
stride=stride*dims[d];
}
}
template<class coor_t>
static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
int64_t index64;
IndexFromCoor(coor,index64,dims);
assert(index64<2*1024*1024*1024LL);
index = (int) index64;
}
template<class coor_t>
static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){
int nd=dims.size();
int stride=1;
index=0;
for(int d=nd-1;d>=0;d--){
index = index+stride*coor[d];
index = index+(int64_t)stride*coor[d];
stride=stride*dims[d];
}
}
template<class coor_t>
static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){
static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
assert(index64<2*1024*1024*1024LL);
index = (int) index64;
}
template<class coor_t>
static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){
int nd= dims.size();
coor.resize(nd);
for(int d=nd-1;d>=0;d--){

View File

@ -0,0 +1,43 @@
#!/bin/bash -l
#SBATCH --job-name=bench
##SBATCH --partition=small-g
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:10:00
#SBATCH --account=phy157_dwf
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
root=$HOME/Frontier/Grid/systems/Frontier/
source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done

View File

@ -0,0 +1,23 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "

systems/Frontier/mpiwrapper.sh (new executable file, 13 lines)
View File

@ -0,0 +1,13 @@
#!/bin/bash
lrank=$SLURM_LOCALID
lgpu=(0 1 2 3 7 6 5 4)
export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]}
echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES "
$*

View File

@ -0,0 +1,13 @@
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH

systems/Frontier/wrap.sh (new executable file, 9 lines)
View File

@ -0,0 +1,9 @@
#!/bin/sh
export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES
unset ROCR_VISIBLE_DEVICES
#rank=$SLURM_PROCID
#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@
$@

View File

@ -144,12 +144,7 @@ int main (int argc, char ** argv)
HermOpAdaptor<LatticeFermionD> HOA(HermDefOp);
int pp=16;
// LittleDiracOpCol.CoarsenOperator(HOA,Aggregates);
// std::cout << "LittleDiracOp old " << LittleDiracOpCol._A[pp]<<std::endl;
LittleDiracOp.CoarsenOperatorColoured(HOA,Aggregates);
// LittleDiracOp.ExchangeCoarseLinks();
// std::cout << "LittleDiracOp new " << LittleDiracOp._A[pp]<<std::endl;
LittleDiracOp.CoarsenOperator(HOA,Aggregates);
///////////////////////////////////////////////////
// Test the operator

View File

@ -86,6 +86,12 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
const int Ls=16;
const int nbasis = 40;
const int cb = 0 ;
RealD mass=0.01;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
@ -121,10 +127,6 @@ int main (int argc, char ** argv)
NerscIO::readConfiguration(Umu,header,file);
//////////////////////// Fermion action //////////////////////////////////
RealD mass=0.01;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
@ -143,8 +145,6 @@ int main (int argc, char ** argv)
////////////////////////////////////////////////////////////
///////////// Coarse basis and Little Dirac Operator ///////
////////////////////////////////////////////////////////////
const int nbasis = 40;
const int cb = 0 ;
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
@ -154,6 +154,18 @@ int main (int argc, char ** argv)
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse5d,FrbGrid,cb);
#if 1
Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
95.0,0.1,
// 400,200,200 -- 48 iters
// 600,200,200 -- 38 iters, 162s
// 600,200,100 -- 38 iters, 169s
// 600,200,50 -- 88 iters. 370s
600,
200,
100,
0.0);
#else
Aggregates.CreateSubspaceChebyshev(RNG5,
HermOpEO,
nbasis,
@ -172,19 +184,22 @@ int main (int argc, char ** argv)
// 0.1, // nbasis 16 -- 142; sloppy solve
0.1, // nbasis 24
300);
#endif
////////////////////////////////////////////////////////////
// Need to check about red-black grid coarsening
////////////////////////////////////////////////////////////
LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
LittleDiracOp.CoarsenOperatorColoured(FineHermOp,Aggregates);
LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
// Try projecting to one hop only
std::cout << " projecting coarse matrix "<<std::endl;
LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
LittleDiracOpProj.ProjectNearestNeighbour(0.5,LittleDiracOp);
typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
HermMatrix CoarseOp (LittleDiracOp);
HermMatrix CoarseOpProj (LittleDiracOpProj);
//////////////////////////////////////////
// Build a coarse lanczos
//////////////////////////////////////////
@ -200,11 +215,13 @@ int main (int argc, char ** argv)
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm,Coarse5d);
CoarseVector c_src(Coarse5d); c_src=1.0;
CoarseVector c_res(Coarse5d);
PowerMethod<CoarseVector> cPM; cPM(CoarseOp,c_src);
IRL.calc(eval,evec,c_src,Nconv);
DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
//////////////////////////////////////////
// Build a coarse space solver
@ -216,7 +233,53 @@ int main (int argc, char ** argv)
// HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
c_res=Zero();
HPDSolve(c_src,c_res);
//////////////////////////////////////////////////////////////////////////
// Deflated (with real op EV's) solve for the projected coarse op
// Work towards ADEF1 in the coarse space
//////////////////////////////////////////////////////////////////////////
HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
c_res=Zero();
HPDSolveProj(c_src,c_res);
//////////////////////////////////////////////////////////////////////
// Coarse ADEF1 with deflation space
//////////////////////////////////////////////////////////////////////
// ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(2.0,40.,8,CoarseOpProj); // 311
ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(2.0,36.,12,CoarseOpProj); // 311
////////////////////////////////////////////////////////
// CG, Cheby mode spacing 200,200
// Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
// Unprojected Coarse CG solve to 4e-2 : 33 iters, 0.8s
// Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
////////////////////////////////////////////////////////
// CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs
////////////////////////////////////////////////////////
// ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s 2.1x gain
// ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
// HDCG 38 iters 162s
//
// CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs
// ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s 2.1x gain
// ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
// HDCG 38 iters 169s
TwoLevelADEF1defl<CoarseVector>
cADEF1(1.0e-8, 100,
CoarseOp,
CoarseSmoother,
evec,eval);
c_res=Zero();
cADEF1(c_src,c_res);
cADEF1.Tolerance = 4.0e-2;
c_res=Zero();
cADEF1(c_src,c_res);
//////////////////////////////////////////
// Build a smoother
//////////////////////////////////////////
@ -242,10 +305,6 @@ int main (int argc, char ** argv)
// std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
// Standard CG
// result=Zero();
// CGfine(HermOpEO, src, result);
for(int l=0;l<los.size();l++){
RealD lo = los[l];
@ -271,8 +330,24 @@ int main (int argc, char ** argv)
result=Zero();
HDCG(src,result);
TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
HDCGdefl(1.0e-8, 3000,
FineHermOp,
Smoother,
cADEF1,
HPDSolve,
Aggregates);
result=Zero();
HDCGdefl(src,result);
}
}
// Standard CG
result=Zero();
CGfine(HermOpEO, src, result);
Grid_finalize();
return 0;