1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Fastest GPU version.

Need to work on the PaddedCell now to make much faster
This commit is contained in:
Peter Boyle 2023-09-29 18:26:51 -04:00
parent 9db585cfeb
commit 36ae6e5aba

View File

@ -279,6 +279,7 @@ public:
} }
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out) void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{ {
RealD tviews=0;
RealD ttot=0; RealD ttot=0;
RealD tmult=0; RealD tmult=0;
RealD texch=0; RealD texch=0;
@ -306,45 +307,62 @@ public:
RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites; RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites;
RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint; RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint;
for(int point=0;point<npoint;point++){ // for(int point=0;point<npoint;point++){
conformable(A[point],pin); // conformable(A[point],pin);
} // }
{ {
tviews-=usecond();
autoView( in_v , pin, AcceleratorRead); autoView( in_v , pin, AcceleratorRead);
autoView( out_v , pout, AcceleratorWrite); autoView( out_v , pout, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead); autoView( Stencil_v , Stencil, AcceleratorRead);
tviews+=usecond();
std::cout << "Calling accelerator for loop " <<std::endl; std::cout << "Calling accelerator for loop " <<std::endl;
for(int point=0;point<npoint;point++){ for(int point=0;point<npoint;point++){
tviews-=usecond();
autoView( A_v, A[point],AcceleratorRead); autoView( A_v, A[point],AcceleratorRead);
tviews+=usecond();
tmult-=usecond(); tmult-=usecond();
#if 0
prof_accelerator_for(ss, osites, Nsimd, { prof_accelerator_for(ss, osites, Nsimd, {
auto SE = Stencil_v.GetEntry(point,ss);
int o = SE->_offset;
// Junk load is annoying -- need to sort out the types better. // Junk load is annoying -- need to sort out the types better.
////////////////////////////// //////////////////////////////
// GPU chokes on gpermute - want coalescedReadPermute() // GPU chokes on gpermute - want coalescedReadPermute()
// gpermute(nbr,SE->_permute); // gpermute(nbr,SE->_permute);
////////////////////////////// //////////////////////////////
auto SE = Stencil_v.GetEntry(point,ss);
int o = SE->_offset;
coalescedWrite(out_v[ss],out_v(ss) + A_v(ss)*in_v(o)); coalescedWrite(out_v[ss],out_v(ss) + A_v(ss)*in_v(o));
}); });
#else
prof_accelerator_for(sss, osites*nbasis, Nsimd, {
typedef decltype(coalescedRead(in_v[0])) calcVector;
int ss = sss/nbasis;
int b = sss%nbasis;
auto SE = Stencil_v.GetEntry(point,ss);
auto nbr = coalescedRead(in_v[SE->_offset]);
auto res = out_v(ss)(b);
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(A_v[ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
#endif
tmult+=usecond(); tmult+=usecond();
} }
std::cout << "Called accelerator for loop " <<std::endl; std::cout << "Called accelerator for loop " <<std::endl;
} }
std::cout << "out"<< norm2(pout)<<std::endl;
std::cout << "in"<< norm2(pin)<<std::endl;
std::cout << "A"<< norm2(A[0])<<std::endl;
text-=usecond(); text-=usecond();
out = Cell.Extract(pout); out = Cell.Extract(pout);
text+=usecond(); text+=usecond();
ttot+=usecond(); ttot+=usecond();
std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl; std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl; std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult ext "<<text<<" us"<<std::endl; std::cout << GridLogMessage<<"Coarse Mult ext "<<text<<" us"<<std::endl;
@ -352,6 +370,7 @@ public:
std::cout << GridLogMessage<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl; std::cout << GridLogMessage<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl; std::cout << GridLogMessage<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl; std::cout << GridLogMessage<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse bytes "<< bytes/1e6<<" MB"<<std::endl;
}; };
void PopulateAdag(void) void PopulateAdag(void)