mirror of https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00

Compare commits

4 commits: be18ffe3b4...36ae6e5aba

| SHA1 |
|---|
| 36ae6e5aba |
| 9db585cfeb |
| c564611ba7 |
| e187bcb85c |
@@ -193,6 +193,7 @@ class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,
public:

  typedef iVector<CComplex,nbasis >           siteVector;
  typedef iMatrix<CComplex,nbasis >           siteMatrix;
  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
  typedef Lattice<siteVector>                 CoarseVector;
  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
@@ -254,7 +255,6 @@ public:
  {
    {
      int npoint = _geom.npoint;
      StencilEntry *SE;
      autoView( Stencil_v , Stencil, AcceleratorRead);
      int osites=Stencil.Grid()->oSites();
      for(int ss=0;ss<osites;ss++){
@@ -279,6 +279,7 @@ public:
  }
  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
  {
    RealD tviews=0;
    RealD ttot=0;
    RealD tmult=0;
    RealD texch=0;
@@ -293,74 +294,83 @@ public:
    CoarseVector pin = Cell.Exchange(tin);
    texch+=usecond();

    CoarseVector pout(pin.Grid());

    autoView( in_v , pin, AcceleratorRead);
    autoView( out_v , pout, AcceleratorWrite);
    autoView( Stencil_v , Stencil, AcceleratorRead);
    CoarseVector pout(pin.Grid()); pout=Zero();

    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;

    Vector<Aview> AcceleratorViewContainer;

    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));

    Aview *Aview_p = & AcceleratorViewContainer[0];

    const int Nsimd = CComplex::Nsimd();
    typedef siteVector calcVector;
    typedef CComplex   calcComplex;

    int osites=pin.Grid()->oSites();
    int gsites=pin.Grid()->gSites();

    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites;
    RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint;

    for(int point=0;point<npoint;point++){
      conformable(A[point],pin);
    }
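    // Accounting assumed just above: one nbasis x nbasis complex matrix times vector
    // per stencil point and site, with each complex multiply-add counted as 6+2 = 8
    // flops, giving flops = 8 * npoint * nbasis^2 * gsites; the byte estimate charges
    // one siteMatrix read plus one siteVector read and one siteVector write per point.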
    tmult-=usecond();
    accelerator_for(sss, osites*nbasis, 1, {
        int ss = sss/nbasis;
        int b  = sss%nbasis;
        assert(ss<osites);
        calcComplex res;
        res = Zero();
        calcVector nbr;
        int ptype;
        StencilEntry *SE;
        // for(int point=0;point<npoint;point++){
        //   conformable(A[point],pin);
        // }

    {
      tviews-=usecond();
      autoView( in_v , pin, AcceleratorRead);
      autoView( out_v , pout, AcceleratorWrite);
      autoView( Stencil_v , Stencil, AcceleratorRead);
      tviews+=usecond();

      std::cout << "Calling accelerator for loop " <<std::endl;

      for(int point=0;point<npoint;point++){
        tviews-=usecond();
        autoView( A_v, A[point],AcceleratorRead);
        tviews+=usecond();
        tmult-=usecond();
#if 0
        prof_accelerator_for(ss, osites, Nsimd, {
            // Junk load is annoying -- need to sort out the types better.
            //////////////////////////////
            // GPU chokes on gpermute - want coalescedReadPermute()
            // gpermute(nbr,SE->_permute);
            //////////////////////////////
            auto SE = Stencil_v.GetEntry(point,ss);
            int o = SE->_offset;
            coalescedWrite(out_v[ss],out_v(ss) + A_v(ss)*in_v(o));
          });
#else
        prof_accelerator_for(sss, osites*nbasis, Nsimd, {

            auto SE = Stencil_v.GetEntry(point,ss);

            int o = SE->_offset;
            typedef decltype(coalescedRead(in_v[0])) calcVector;

            assert( o < osites);
            // gpermute etc..
            nbr = in_v[o];
            gpermute(nbr,SE->_permute);
            int ss = sss/nbasis;
            int b  = sss%nbasis;

            for(int bb=0;bb<nbasis;bb++) {
              res = res + Aview_p[point][ss](b,bb)*nbr(bb);
            }
            auto SE  = Stencil_v.GetEntry(point,ss);
            auto nbr = coalescedRead(in_v[SE->_offset]);
            auto res = out_v(ss)(b);
            for(int bb=0;bb<nbasis;bb++) {
              res = res + coalescedRead(A_v[ss](b,bb))*nbr(bb);
            }
            coalescedWrite(out_v[ss](b),res);
          });
#endif
        tmult+=usecond();
      }
        out_v[ss](b)=res;
      });
    tmult+=usecond();

    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
    std::cout << "Called accelerator for loop " <<std::endl;
    }
    text-=usecond();
    out = Cell.Extract(pout);
    text+=usecond();
    ttot+=usecond();

    std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult exch   "<<texch<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult mult   "<<tmult<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult ext    "<<text<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult tot    "<<ttot<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Kernel flops/s "<< flops/tmult<<" mflop/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse bytes   "<< bytes/1e6<<" MB"<<std::endl;
  };
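The disabled (#if 0) branch above notes that gpermute chokes on GPU and that a coalescedReadPermute() load is wanted instead. One possible shape for that inner body, sketched against the view and stencil names used in Mult(); the coalescedReadPermute argument order and the ptype input are assumptions, not part of this change:

  // Hedged sketch only -- not from this commit.  coalescedReadPermute(vec,ptype,doperm)
  // is the per-lane permuting load used elsewhere in Grid; ptype here is a placeholder
  // for a permutation type the stencil would have to supply.
  prof_accelerator_for(sss, osites*nbasis, Nsimd, {
      int ss   = sss/nbasis;
      int b    = sss%nbasis;
      auto SE  = Stencil_v.GetEntry(point,ss);
      auto nbr = coalescedReadPermute(in_v[SE->_offset], ptype, SE->_permute);
      auto res = out_v(ss)(b);
      for(int bb=0;bb<nbasis;bb++) {
        res = res + coalescedRead(A_v[ss](b,bb))*nbr(bb);
      }
      coalescedWrite(out_v[ss](b),res);
    });

The intent of the per-lane read is that each SIMT lane extracts only its own element, so no full SIMD vector has to be permuted in registers.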

  void PopulateAdag(void)
@@ -453,10 +463,7 @@ public:
      Coordinate coor({0,0,0,0,0});
      auto sval = peekSite(_A[p],coor);
    }
    for(int p=0;p<geom.npoint;p++){
      _A[p]    = Cell.Exchange(_A[p]);
      _Adag[p] = Cell.Exchange(_Adag[p]);
    }
    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator pick "<<tpick<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator mat  "<<tmat <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator projection "<<tproj<<" us"<<std::endl;
@@ -542,13 +549,13 @@ public:
     * Then A_{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
     */
    teigen-=usecond();
    ComplexD ci(0.0,1.0);
    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
    ComplexD ci(0.0,1.0);
    for(int k=0;k<npoint;k++){ // Loop over momenta

      for(int l=0;l<npoint;l++){ // Loop over nbr relative
        std::complex<double> phase(0.0,0.0);
        ComplexD phase(0.0,0.0);
        for(int mu=0;mu<Nd;mu++){
          RealD TwoPiL = M_PI * 2.0/ clatt[mu];
          phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
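For reference, the phase accumulated above feeds the momentum matrix that Eigen inverts. Assuming Mkl(k,l) is filled as the exponential of this phase (that line lies outside the hunk shown), the relation quoted in the comment block reads:

% Assumed form of the momentum/shift matrix built from the phase above, and the
% deconvolution stated in the comment of this hunk.
\[
  M_{kl} \;=\; \exp\!\Big( i \sum_{\mu} \frac{2\pi}{L_\mu}\, d_k^{\,\mu}\, d_l^{\,\mu} \Big),
  \qquad
  A_{ji}^{\,b,\,b+l} \;=\; \big(M^{-1}\big)_{lm}\, \mathrm{ComputeProj}_{m,b,i,j},
\]
where $d_k^{\,\mu}$ is geom.shifts[k][mu] and $L_\mu$ is clatt[mu].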
@@ -633,17 +640,19 @@ public:
    PopulateAdag();

    // Need to write something to populate Adag from A
    for(int p=0;p<geom.npoint;p++){
      _A[p]    = Cell.Exchange(_A[p]);
      _Adag[p] = Cell.Exchange(_Adag[p]);
    }
    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator mat   "<<tmat <<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator proj  "<<tproj<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator inv   "<<tinv<<" us"<<std::endl;
  }

  void ExchangeCoarseLinks(void){
    for(int p=0;p<geom.npoint;p++){
      _A[p]    = Cell.Exchange(_A[p]);
      _Adag[p] = Cell.Exchange(_Adag[p]);
    }
  }
  virtual void Mdiag   (const Field &in, Field &out){ assert(0);};
  virtual void Mdir    (const Field &in, Field &out,int dir, int disp){assert(0);};
  virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
@@ -137,6 +137,18 @@ inline void cuda_mem(void)
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
  }
#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
  { \
    int nt=acceleratorThreads(); \
    typedef uint64_t Iterator; \
    auto lambda = [=] accelerator \
      (Iterator iter1,Iterator iter2,Iterator lane) mutable { \
      __VA_ARGS__; \
    }; \
    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
    ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
  }

#define accelerator_for6dNB(iter1, num1, \
                            iter2, num2, \
@@ -157,6 +169,20 @@ inline void cuda_mem(void)
    Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }


#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
  { \
    int nt=acceleratorThreads(); \
    typedef uint64_t Iterator; \
    auto lambda = [=] accelerator \
      (Iterator iter1,Iterator iter2,Iterator lane) mutable { \
      __VA_ARGS__; \
    }; \
    dim3 cu_threads(nsimd,acceleratorThreads(),1); \
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
  }

template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
@@ -168,6 +194,17 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
    Lambda(x,y,z);
  }
}
template<typename lambda> __global__
void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
  // Weird permute is to make lane coalesce for large blocks
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < num1) && (y<num2) && (z<num3) ) {
    Lambda(x,y,z);
  }
}

template<typename lambda> __global__
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
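The "weird permute" comment refers to the launch geometry dim3 cu_threads(nsimd,acceleratorThreads(),1) used by the NB macros: threadIdx.x carries the SIMD lane, so adjacent hardware threads in a warp touch adjacent lanes of the same site. A small standalone illustration of that index mapping (the nsimd and nt values are invented for the example; this is not Grid code):

// Host-side sketch of ProfileLambdaApply's index permutation for one block.
#include <cstdint>
#include <cstdio>

int main(void) {
  const uint64_t nsimd = 4, nt = 2;            // stand-ins for Nsimd / acceleratorThreads()
  // cu_threads(nsimd, nt, 1): threadIdx.x = lane, threadIdx.y = site within the block.
  for (uint64_t ty = 0; ty < nt; ty++) {
    for (uint64_t tx = 0; tx < nsimd; tx++) {
      uint64_t x = ty;   // outer iterator (plus blockDim.y*blockIdx.x in the kernel)
      uint64_t z = tx;   // SIMD lane
      // Adjacent tx (adjacent hardware threads) hit adjacent lanes of one site,
      // which is what lets coalescedRead produce coalesced global loads.
      printf("thread(x=%llu,y=%llu) -> site %llu, lane %llu\n",
             (unsigned long long)tx, (unsigned long long)ty,
             (unsigned long long)x,  (unsigned long long)z);
    }
  }
  return 0;
}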
@@ -208,6 +245,7 @@ inline void *acceleratorAllocShared(size_t bytes)
    if( err != cudaSuccess ) {
      ptr = (void *) NULL;
      printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
      assert(0);
    }
    return ptr;
  };
@@ -460,6 +498,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);

#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
  prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} ); \
  accelerator_barrier(dummy);

#define accelerator_for( iter, num, nsimd, ... ) \
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
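prof_accelerator_for therefore has the same call shape as accelerator_for but launches through ProfileLambdaApply and barriers afterwards. A minimal usage sketch, mirroring the call in GeneralCoarsenedMatrix::Mult above (in_v, out_v, osites, nbasis and Nsimd stand for the quantities defined there; illustrative only, not a self-contained program):

  // Same call shape as the coarse mult kernel: one work item per (site, basis row).
  prof_accelerator_for(sss, osites*nbasis, Nsimd, {
      int ss = sss/nbasis;                 // coarse site
      int b  = sss%nbasis;                 // basis row handled by this work item
      coalescedWrite(out_v[ss](b), coalescedRead(in_v[ss](b)));  // trivial copy body
    });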
@@ -108,7 +108,7 @@ int main (int argc, char ** argv)

  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  const int nbasis = 4;
  const int nbasis = 16;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);

@@ -147,6 +147,8 @@ int main (int argc, char ** argv)
  // LittleDiracOpCol.CoarsenOperator(HOA,Aggregates);
  // std::cout << "LittleDiracOp old " << LittleDiracOpCol._A[pp]<<std::endl;
  LittleDiracOp.CoarsenOperatorColoured(HOA,Aggregates);
  // LittleDiracOp.ExchangeCoarseLinks();

  // std::cout << "LittleDiracOp new " << LittleDiracOp._A[pp]<<std::endl;

  ///////////////////////////////////////////////////
@@ -178,6 +180,7 @@ int main (int argc, char ** argv)
  blockProject(c_proj,tmp,subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;

  std::cout<<GridLogMessage<<" Calling little Dirac Op "<<std::endl;
  LittleDiracOp.M(c_src,c_res);
  LittleDiracOp.Mdag(c_src,c_res_dag);