From 5fac47a26d26925df33c34d116a383aa59a26643 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 Oct 2023 18:16:47 -0400 Subject: [PATCH] Faster halo exchange --- Grid/lattice/PaddedCell.h | 94 +++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 34 deletions(-) diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index b994dd11..568fa65d 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -47,16 +47,15 @@ struct CshiftImplGauge: public CshiftImplBase inline void ScatterSlice(const cshiftVector &buf, +template inline void ScatterSlice(const cshiftVector &buf, Lattice &lat, int x, int dim, int offset=0) { + const int Nsimd=vobj::Nsimd(); typedef typename vobj::scalar_object sobj; - autoView(lat_v, lat, AcceleratorRead); - GridBase *grid = lat.Grid(); Coordinate simd = grid->_simd_layout; int Nd = grid->Nd(); @@ -73,7 +72,8 @@ template inline void ScatterSlice(const cshiftVector inline void ScatterSlice(const cshiftVector inline void ScatterSlice(const cshiftVector inline void GatherSlice(cshiftVector &buf, +template inline void GatherSlice(cshiftVector &buf, const Lattice &lat, int x, int dim, int offset=0) { + const int Nsimd=vobj::Nsimd(); typedef typename vobj::scalar_object sobj; autoView(lat_v, lat, AcceleratorRead); @@ -149,20 +155,29 @@ template inline void GatherSlice(cshiftVector inline void GatherSlice(cshiftVector tbuf(words); + acceleratorCopyFromDevice((void *)&buf[offset],(void *)&tbuf[0],words*sizeof(vobj)); + typedef typename vobj::scalar_type scalar; + scalar *sbuf = (scalar *)&tbuf[0]; + scalar tmp=0.0; + for(int w=0;w - inline Lattice ExchangeTest(const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const + inline Lattice ExchangePeriodic(const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { GridBase *old_grid = in.Grid(); int dims = old_grid->Nd(); Lattice tmp = in; for(int d=0;d - inline Lattice ExpandTest(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const + inline Lattice ExpandPeriodic(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { Coordinate processors=unpadded_grid->_processors; GridBase *old_grid = in.Grid(); @@ -412,7 +437,7 @@ public: Coordinate simd= from.Grid()->_simd_layout; int ld = lds[dimension]; int nld = to.Grid()->_ldimensions[dimension]; - + const int Nsimd = vobj::Nsimd(); assert(depth<=lds[dimension]); // A must be on neighbouring node assert(depth>0); // A caller bug if zero @@ -424,16 +449,17 @@ public: for(int d=0;d_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] *rNsimd); + buffer_size = buffer_size / Nsimd; + int rNsimd = Nsimd / simd[dimension]; + assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); - static cshiftVector send_buf; - static cshiftVector recv_buf; + static cshiftVector send_buf; + static cshiftVector recv_buf; send_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth); int words = buffer_size; - int bytes = words * sizeof(sobj); + int bytes = words * sizeof(vobj); //////////////////////////////////////////////////////////////////////////// // Gather all surface terms up to depth "d" //////////////////////////////////////////////////////////////////////////// @@ -491,14 +517,14 @@ public: ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; } t_scatter= usecond() - t; + // DumpSliceNorm(std::string("Face_exchange to done"),to,dimension); std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<