mirror of
https://github.com/paboyle/Grid.git
synced 2025-07-13 03:27:07 +01:00
Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc
Aim to reduce the amount of CUDA and other code variations floating around all over the place. Will move GpuInit into Accelerator.cc from Init.cc. Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows.
This commit is contained in:
@ -1,14 +1,3 @@
|
||||
// blockZaxpy in blockPromote - 3s, 5%
|
||||
// noncoalesced linalg in Preconditioner ~ 3s 5%
|
||||
// Lanczos tuning or replace 10-20s ~ 25%, open ended
|
||||
// setup tuning 5s ~ 8%
|
||||
// -- e.g. ordermin, orderstep tunables.
|
||||
// MdagM path without norm in LinOp code. few seconds
|
||||
|
||||
// Mdir calc blocking kernels
|
||||
// Fuse kernels in blockMaskedInnerProduct
|
||||
// preallocate Vectors in Cayley 5D ~ few percent few seconds
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -91,34 +80,7 @@ public:
|
||||
}
|
||||
directions [2*_d]=0;
|
||||
displacements[2*_d]=0;
|
||||
|
||||
//// report back
|
||||
std::cout<<GridLogMessage<<"directions :";
|
||||
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
|
||||
std::cout<<std::endl;
|
||||
std::cout<<GridLogMessage<<"displacements :";
|
||||
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
|
||||
/*
|
||||
// Original cleaner code
|
||||
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
|
||||
for(int d=0;d<dimension;d++){
|
||||
directions[2*d ] = d;
|
||||
directions[2*d+1] = d;
|
||||
displacements[2*d ] = +1;
|
||||
displacements[2*d+1] = -1;
|
||||
}
|
||||
directions [2*dimension]=0;
|
||||
displacements[2*dimension]=0;
|
||||
}
|
||||
std::vector<int> GetDelta(int point) {
|
||||
std::vector<int> delta(dimension,0);
|
||||
delta[directions[point]] = displacements[point];
|
||||
return delta;
|
||||
};
|
||||
*/
|
||||
|
||||
};
|
||||
|
||||
@ -149,25 +111,7 @@ public:
|
||||
CoarseScalar InnerProd(CoarseGrid);
|
||||
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
|
||||
blockOrthogonalise(InnerProd,subspace);
|
||||
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
|
||||
// blockOrthogonalise(InnerProd,subspace);
|
||||
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
|
||||
// CheckOrthogonal();
|
||||
}
|
||||
// Diagnostic check of the stored basis. For each basis index i it
// block-projects subspace[i] onto the whole subspace, subtracts that
// projection from a coarse vector whose i-th component is set to 1 on every
// outer site, and logs norm2 of the difference.
// NOTE(review): the logged value is presumably ~0 when the basis is
// block-orthonormal -- confirm against blockProject's definition.
void CheckOrthogonal(void){
|
||||
CoarseVector iProj(CoarseGrid);
|
||||
CoarseVector eProj(CoarseGrid);
|
||||
for(int i=0;i<nbasis;i++){
|
||||
// iProj receives the projection of the i-th basis vector onto the subspace.
blockProject(iProj,subspace[i],subspace);
|
||||
// Build the reference vector: zero everywhere, then component i set to 1.
eProj=Zero();
|
||||
accelerator_for(ss, CoarseGrid->oSites(),1,{
|
||||
eProj[ss](i)=CComplex(1.0);
|
||||
});
|
||||
// Difference between the reference vector and the actual projection.
eProj=eProj - iProj;
|
||||
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
|
||||
}
|
||||
// Forwards to blockProject using the stored subspace as the basis.
// FineVec is read-only; CoarseVec is taken by non-const reference and
// presumably receives the projected result -- confirm against blockProject.
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
||||
blockProject(CoarseVec,FineVec,subspace);
|
||||
}
|
||||
@ -175,50 +119,12 @@ public:
|
||||
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
||||
blockPromote(CoarseVec,FineVec,subspace);
|
||||
}
|
||||
// Calls random(RNG, ...) on each of the nbasis subspace vectors in index
// order; no normalisation or orthogonalisation is done here.
void CreateSubspaceRandom(GridParallelRNG &RNG){
|
||||
for(int i=0;i<nbasis;i++){
|
||||
random(RNG,subspace[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Fills subspace[0 .. nn-1]. For each vector b: draw gaussian noise, rescale
// it, run one ConjugateGradient call with hermop using the noise as source
// and subspace[b] (zeroed) as the starting guess, rescale the result and
// store it back in subspace[b]. norm2(hermop.Op(v)) is logged before and
// after the CG step.
//   RNG    - parallel random number generator used for the gaussian noise.
//   hermop - operator passed to CG and used for the logged matrix elements.
//   nn     - number of subspace vectors to fill (defaults to nbasis).
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
||||
|
||||
RealD scale;
|
||||
|
||||
// NOTE(review): arguments are presumably (tolerance, max iterations,
// error-on-non-convergence flag) -- confirm against ConjugateGradient.
ConjugateGradient<FineField> CG(1.0e-2,100,false);
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
for(int b=0;b<nn;b++){
|
||||
|
||||
subspace[b] = Zero();
|
||||
gaussian(RNG,noise);
|
||||
// Rescale by norm2(noise)^(-1/2); presumably norm2 is the squared norm, so
// this gives unit normalisation -- confirm.
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
// Log norm2 of the operator applied to the unfiltered noise.
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
// Loop bound is 1, so exactly one CG pass is performed per vector.
for(int i=0;i<1;i++){
|
||||
|
||||
CG(hermop,noise,subspace[b]);
|
||||
|
||||
// Take the CG output as the new vector and rescale it the same way.
noise = subspace[b];
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
}
|
||||
|
||||
// Log the same quantity for the filtered vector, then store it.
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
|
||||
subspace[b] = noise;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||
// and this is the best I found
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#if 1
|
||||
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
@ -313,201 +219,6 @@ public:
|
||||
}
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo,
|
||||
int orderfilter,
|
||||
int ordermin,
|
||||
int orderstep,
|
||||
double filterlo
|
||||
) {
|
||||
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
FineField combined(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
#define FILTERb(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
Cheb(hermop,noise,Mn); \
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||
subspace[b] = Mn; \
|
||||
hermop.Op(Mn,tmp); \
|
||||
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
|
||||
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
|
||||
|
||||
RealD alpha=-0.8;
|
||||
RealD beta =-0.8;
|
||||
#define FILTER(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
|
||||
Cheb(hermop,noise,Mn); \
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
||||
subspace[b] = Mn; \
|
||||
hermop.Op(Mn,tmp); \
|
||||
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
|
||||
#define FILTERc(llo,hhi,oorder) \
|
||||
{ \
|
||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
||||
Cheb(hermop,noise,combined); \
|
||||
}
|
||||
|
||||
double node = 0.000;
|
||||
FILTERb(lo,hi,orderfilter);// 0
|
||||
// FILTERc(node,hi,51);// 0
|
||||
noise = Mn;
|
||||
int base = 0;
|
||||
int mult = 100;
|
||||
FILTER(node,hi,base+1*mult);
|
||||
FILTER(node,hi,base+2*mult);
|
||||
FILTER(node,hi,base+3*mult);
|
||||
FILTER(node,hi,base+4*mult);
|
||||
FILTER(node,hi,base+5*mult);
|
||||
FILTER(node,hi,base+6*mult);
|
||||
FILTER(node,hi,base+7*mult);
|
||||
FILTER(node,hi,base+8*mult);
|
||||
FILTER(node,hi,base+9*mult);
|
||||
FILTER(node,hi,base+10*mult);
|
||||
FILTER(node,hi,base+11*mult);
|
||||
FILTER(node,hi,base+12*mult);
|
||||
FILTER(node,hi,base+13*mult);
|
||||
FILTER(node,hi,base+14*mult);
|
||||
FILTER(node,hi,base+15*mult);
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo,
|
||||
int orderfilter,
|
||||
int ordermin,
|
||||
int orderstep,
|
||||
double filterlo
|
||||
) {
|
||||
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
FineField combined(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
{
|
||||
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
|
||||
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
|
||||
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
|
||||
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
|
||||
JacobiPoly(hermop,noise,Mn);
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
b++;
|
||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||
// subspace[b] = tmp; b++;
|
||||
// }
|
||||
}
|
||||
|
||||
#define FILTER(lambda) \
|
||||
{ \
|
||||
hermop.HermOp(subspace[0],tmp); \
|
||||
tmp = tmp - lambda *subspace[0]; \
|
||||
scale = std::pow(norm2(tmp),-0.5); \
|
||||
tmp=tmp*scale; \
|
||||
subspace[b] = tmp; \
|
||||
hermop.Op(subspace[b],tmp); \
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
||||
b++; \
|
||||
}
|
||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
||||
// subspace[b] = tmp; b++;
|
||||
// }
|
||||
|
||||
FILTER(2.0e-5);
|
||||
FILTER(2.0e-4);
|
||||
FILTER(4.0e-4);
|
||||
FILTER(8.0e-4);
|
||||
FILTER(8.0e-4);
|
||||
|
||||
FILTER(2.0e-3);
|
||||
FILTER(3.0e-3);
|
||||
FILTER(4.0e-3);
|
||||
FILTER(5.0e-3);
|
||||
FILTER(6.0e-3);
|
||||
|
||||
FILTER(2.5e-3);
|
||||
FILTER(3.5e-3);
|
||||
FILTER(4.5e-3);
|
||||
FILTER(5.5e-3);
|
||||
FILTER(6.5e-3);
|
||||
|
||||
// FILTER(6.0e-5);//6
|
||||
// FILTER(7.0e-5);//8
|
||||
// FILTER(8.0e-5);//9
|
||||
// FILTER(9.0e-5);//3
|
||||
|
||||
/*
|
||||
// FILTER(1.0e-4);//10
|
||||
FILTER(2.0e-4);//11
|
||||
// FILTER(3.0e-4);//12
|
||||
// FILTER(4.0e-4);//13
|
||||
FILTER(5.0e-4);//14
|
||||
|
||||
FILTER(6.0e-3);//4
|
||||
FILTER(7.0e-4);//1
|
||||
FILTER(8.0e-4);//7
|
||||
FILTER(9.0e-4);//15
|
||||
FILTER(1.0e-3);//2
|
||||
|
||||
FILTER(2.0e-3);//2
|
||||
FILTER(3.0e-3);//2
|
||||
FILTER(4.0e-3);//2
|
||||
FILTER(5.0e-3);//2
|
||||
FILTER(6.0e-3);//2
|
||||
|
||||
FILTER(7.0e-3);//2
|
||||
FILTER(8.0e-3);//2
|
||||
FILTER(1.0e-2);//2
|
||||
*/
|
||||
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
|
||||
assert(b==nn);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
};
|
||||
|
||||
@ -580,23 +291,22 @@ public:
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
int lane=SIMTlane(Nsimd);
|
||||
for(int point=0;point<geom.npoint;point++){
|
||||
|
||||
SE=Stencil.GetEntry(ptype,point,ss);
|
||||
|
||||
if(SE->_is_local) {
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||
} else {
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
||||
}
|
||||
synchronise();
|
||||
acceleratorSynchronise();
|
||||
|
||||
for(int bb=0;bb<nbasis;bb++) {
|
||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||
}
|
||||
}
|
||||
coalescedWrite(out_v[ss](b),res,lane);
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
usecs +=usecond();
|
||||
|
||||
@ -604,13 +314,6 @@ public:
|
||||
RealD Nout= norm2(out);
|
||||
nrm_usec+=usecond();
|
||||
|
||||
/*
|
||||
std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" <<std::endl;
|
||||
std::cout << GridLogMessage << "\tHalo " << comms_usec << " us" <<std::endl;
|
||||
std::cout << GridLogMessage << "\tMatrix " << usecs << " us" <<std::endl;
|
||||
std::cout << GridLogMessage << "\t mflop/s " << flops/usecs<<std::endl;
|
||||
std::cout << GridLogMessage << "\t MB/s " << bytes/usecs<<std::endl;
|
||||
*/
|
||||
return Nout;
|
||||
};
|
||||
|
||||
@ -658,45 +361,20 @@ public:
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
int lane=SIMTlane(Nsimd);
|
||||
SE=Stencil.GetEntry(ptype,point,ss);
|
||||
|
||||
if(SE->_is_local) {
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||
} else {
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
||||
}
|
||||
synchronise();
|
||||
acceleratorSynchronise();
|
||||
|
||||
for(int bb=0;bb<nbasis;bb++) {
|
||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||
}
|
||||
coalescedWrite(out_v[ss](b),res,lane);
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
#if 0
|
||||
accelerator_for(ss,Grid()->oSites(),1,{
|
||||
|
||||
siteVector res = Zero();
|
||||
siteVector nbr;
|
||||
int ptype;
|
||||
StencilEntry *SE;
|
||||
|
||||
SE=Stencil.GetEntry(ptype,point,ss);
|
||||
|
||||
if(SE->_is_local&&SE->_permute) {
|
||||
permute(nbr,in_v[SE->_offset],ptype);
|
||||
} else if(SE->_is_local) {
|
||||
nbr = in_v[SE->_offset];
|
||||
} else {
|
||||
nbr = Stencil.CommBuf()[SE->_offset];
|
||||
}
|
||||
synchronise();
|
||||
|
||||
res = res + Aview_p[point][ss]*nbr;
|
||||
|
||||
out_v[ss]=res;
|
||||
});
|
||||
#endif
|
||||
}
|
||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||
{
|
||||
@ -912,33 +590,8 @@ public:
|
||||
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
||||
ForceHermitian();
|
||||
}
|
||||
// AssertHermitian();
|
||||
// ForceDiagonal();
|
||||
}
|
||||
|
||||
#if 0
|
||||
///////////////////////////
|
||||
// test code worth preserving in if block
|
||||
///////////////////////////
|
||||
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
|
||||
std::cout<<GridLogMessage<< A[p] << std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
|
||||
|
||||
phi=Subspace.subspace[0];
|
||||
std::vector<int> bc(FineGrid->_ndimension,0);
|
||||
|
||||
blockPick(Grid(),phi,tmp,bc); // Pick out a block
|
||||
linop.Op(tmp,Mphi); // Apply big dop
|
||||
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
|
||||
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
|
||||
std::cout<<GridLogMessage<< iProj <<std::endl;
|
||||
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
void ForceHermitian(void) {
|
||||
CoarseMatrix Diff (Grid());
|
||||
for(int p=0;p<geom.npoint;p++){
|
||||
@ -958,27 +611,6 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
// Diagnostic that only logs; despite the name it contains no assert.
// For d = 0..3 it compares A[2*d] with the adjoint of A[2*d+1] circularly
// shifted by 1 along dimension d+1 and prints norm2 of the difference and of
// A[2*d]. It then does the same for A[8] against its own adjoint.
// NOTE(review): the loop bound 4 and the index 8 are hard-coded; presumably
// they match a 4-direction stencil with the local term stored last, and the
// d+1 offset skips a leading dimension -- confirm against the Geometry in use.
void AssertHermitian(void) {
|
||||
CoarseMatrix AA (Grid());
|
||||
CoarseMatrix AAc (Grid());
|
||||
CoarseMatrix Diff (Grid());
|
||||
for(int d=0;d<4;d++){
|
||||
|
||||
int dd=d+1;
|
||||
// Shift the odd-indexed matrix so it can be compared with the even-indexed one.
AAc = Cshift(A[2*d+1],dd,1);
|
||||
AA = A[2*d];
|
||||
|
||||
Diff = AA - adj(AAc);
|
||||
|
||||
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
|
||||
|
||||
}
|
||||
// Term at index 8: difference from its own adjoint.
Diff = A[8] - adj(A[8]);
|
||||
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
Reference in New Issue
Block a user