From 36ae6e5aba68ae69b086fa556290bb8a3c9fcc2a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 29 Sep 2023 18:26:51 -0400 Subject: [PATCH] Fastest GPU version. Need to work on the PaddedCell now to make much faster --- Grid/algorithms/GeneralCoarsenedMatrix.h | 41 +++++++++++++++++------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/Grid/algorithms/GeneralCoarsenedMatrix.h b/Grid/algorithms/GeneralCoarsenedMatrix.h index a1fbebdf..3e4bc1df 100644 --- a/Grid/algorithms/GeneralCoarsenedMatrix.h +++ b/Grid/algorithms/GeneralCoarsenedMatrix.h @@ -279,6 +279,7 @@ public: } void Mult (std::vector &A,const CoarseVector &in, CoarseVector &out) { + RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; @@ -306,45 +307,62 @@ public: RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites; RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint; - for(int point=0;point_offset; - // Junk load is annoying -- need to sort out the types better. ////////////////////////////// // GPU chokes on gpermute - want coalescedReadPermute() // gpermute(nbr,SE->_permute); ////////////////////////////// + auto SE = Stencil_v.GetEntry(point,ss); + int o = SE->_offset; coalescedWrite(out_v[ss],out_v(ss) + A_v(ss)*in_v(o)); - }); +#else + prof_accelerator_for(sss, osites*nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + + int ss = sss/nbasis; + int b = sss%nbasis; + + auto SE = Stencil_v.GetEntry(point,ss); + auto nbr = coalescedRead(in_v[SE->_offset]); + auto res = out_v(ss)(b); + for(int bb=0;bb