diff --git a/Grid/algorithms/GeneralCoarsenedMatrix.h b/Grid/algorithms/GeneralCoarsenedMatrix.h index e1b086fa..a1fbebdf 100644 --- a/Grid/algorithms/GeneralCoarsenedMatrix.h +++ b/Grid/algorithms/GeneralCoarsenedMatrix.h @@ -193,6 +193,7 @@ class GeneralCoarsenedMatrix : public SparseMatrixBase siteVector; + typedef iMatrix siteMatrix; typedef Lattice > CoarseComplexField; typedef Lattice CoarseVector; typedef Lattice > CoarseMatrix; @@ -254,7 +255,6 @@ public: { { int npoint = _geom.npoint; - StencilEntry *SE; autoView( Stencil_v , Stencil, AcceleratorRead); int osites=Stencil.Grid()->oSites(); for(int ss=0;ss Aview; - Vector AcceleratorViewContainer; - - for(int p=0;poSites(); int gsites=pin.Grid()->gSites(); RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites; + RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint; for(int point=0;point_offset; + auto SE = Stencil_v.GetEntry(point,ss); + int o = SE->_offset; + + // Junk load is annoying -- need to sort out the types better. + ////////////////////////////// + // GPU chokes on gpermute - want coalescedReadPermute() + // gpermute(nbr,SE->_permute); + ////////////////////////////// + coalescedWrite(out_v[ss],out_v(ss) + A_v(ss)*in_v(o)); - assert( o < osites); - // gpermute etc.. - nbr = in_v[o]; - gpermute(nbr,SE->_permute); - - for(int bb=0;bb phase(0.0,0.0); + ComplexD phase(0.0,0.0); for(int mu=0;mu &out){assert(0);};