diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index e47137f9..5e5bcbfa 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -117,8 +117,8 @@ public: CoarseScalar InnerProd(CoarseGrid); std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"< "< "< siteVector; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; - + typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; @@ -336,36 +335,70 @@ public: conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - RealD Nin = norm2(in); + + // RealD Nin = norm2(in); SimpleCompressor compressor; + double comms_usec = -usecond(); Stencil.HaloExchange(in,compressor); + comms_usec += usecond(); auto in_v = in.View(); auto out_v = out.View(); - accelerator_for(ss,Grid()->oSites(),1,{ - siteVector res = Zero(); - siteVector nbr; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(); + double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; + double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); + double usecs =-usecond(); + + assert(geom.npoint==9); + + accelerator_for(ss, Grid()->oSites(), Nsimd, { + + calcVector res = Zero(); + calcVector nbr; int ptype; StencilEntry *SE; - for(int point=0;point_is_local&&SE->_permute) { - permute(nbr,in_v[SE->_offset],ptype); - } else if(SE->_is_local) { - nbr = in_v[SE->_offset]; + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); } else { - nbr = Stencil.CommBuf()[SE->_offset]; + nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); } - auto A_point = A[point].View(); - res = res + A_point[ss]*nbr; - } - vstream(out_v[ss],res); - }); + synchronise(); + auto A = coalescedRead(Aview_p[point][ss]); + res = res + A*nbr; + } + coalescedWrite(out_v[ss],res,lane); + }); + usecs +=usecond(); + + double nrm_usec=-usecond(); RealD Nout= norm2(out); + nrm_usec+=usecond(); + /* + std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" <