diff --git a/Grid/algorithms/GeneralCoarsenedMatrix.h b/Grid/algorithms/GeneralCoarsenedMatrix.h index a1fbebdf..3e4bc1df 100644 --- a/Grid/algorithms/GeneralCoarsenedMatrix.h +++ b/Grid/algorithms/GeneralCoarsenedMatrix.h @@ -279,6 +279,7 @@ public: } void Mult (std::vector &A,const CoarseVector &in, CoarseVector &out) { + RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; @@ -306,45 +307,62 @@ public: RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites; RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint; - for(int point=0;point_offset; - // Junk load is annoying -- need to sort out the types better. ////////////////////////////// // GPU chokes on gpermute - want coalescedReadPermute() // gpermute(nbr,SE->_permute); ////////////////////////////// + auto SE = Stencil_v.GetEntry(point,ss); + int o = SE->_offset; coalescedWrite(out_v[ss],out_v(ss) + A_v(ss)*in_v(o)); - }); +#else + prof_accelerator_for(sss, osites*nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + + int ss = sss/nbasis; + int b = sss%nbasis; + + auto SE = Stencil_v.GetEntry(point,ss); + auto nbr = coalescedRead(in_v[SE->_offset]); + auto res = out_v(ss)(b); + for(int bb=0;bb