/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/lattice/PaddedCell.h Copyright (C) 2019 Author: Peter Boyle pboyle@bnl.gov This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ #pragma once #include NAMESPACE_BEGIN(Grid); //Allow the user to specify how the C-shift is performed, e.g. 
to respect the appropriate boundary conditions /* CshiftImplBase: abstract shift policy. Declares one pure-virtual Cshift(in, dir, shift) returning a Lattice, plus a virtual destructor. */ template struct CshiftImplBase{ virtual Lattice Cshift(const Lattice &in, int dir, int shift) const = 0; virtual ~CshiftImplBase(){} }; /* CshiftImplDefault: forwards the call unchanged to Grid::Cshift(in,dir,shift). */ template struct CshiftImplDefault: public CshiftImplBase{ Lattice Cshift(const Lattice &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); } }; /* CshiftImplGauge: works on Gimpl::GaugeLinkField and forwards to Gimpl::CshiftLink(in,dir,shift). */ template struct CshiftImplGauge: public CshiftImplBase{ typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); } }; /* * * TODO: * -- address elements of vobj via thread block in Scatter/Gather * -- overlap comms with motion in Face_exchange * */ /* ScatterSlice(buf, lat, x, dim, offset=0): buf is a read-only device buffer and lat the writable lattice. The visible prologue reads the slice block, stride and nblock for dim, takes rd = grid->_rdimensions[dim], and splits x into ox = x%rd and ix = x/rd. NOTE(review): the rest of the body is not visible in this view; presumably it copies buf into slice x of lat - confirm against the full file. */ template inline void ScatterSlice(const deviceVector &buf, Lattice &lat, int x, int dim, int offset=0) { const int Nsimd=vobj::Nsimd(); typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; GridBase *grid = lat.Grid(); Coordinate simd = grid->_simd_layout; int Nd = grid->Nd(); int block = grid->_slice_block[dim]; int stride = grid->_slice_stride[dim]; int nblock = grid->_slice_nblock[dim]; int rd = grid->_rdimensions[dim]; int ox = x%rd; int ix = x/rd; int isites = 1; for(int d=0;d /* GatherSlice(buf, lat, x, dim, offset=0): buf is the writable device buffer and lat is read-only, opened through an AcceleratorRead view (lat_v). The prologue repeats the ox = x%rd, ix = x/rd split. NOTE(review): the rest of the body is not visible here; presumably it copies slice x of lat into buf - confirm. */ inline void GatherSlice(deviceVector &buf, const Lattice &lat, int x, int dim, int offset=0) { const int Nsimd=vobj::Nsimd(); typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; autoView(lat_v, lat, AcceleratorRead); GridBase *grid = lat.Grid(); Coordinate simd = grid->_simd_layout; int Nd = grid->Nd(); int block = grid->_slice_block[dim]; int stride = grid->_slice_stride[dim]; int nblock = grid->_slice_nblock[dim]; int rd = grid->_rdimensions[dim]; int ox = x%rd; int ix = x/rd; int isites = 1; for(int d=0;d grids; /* Destructor: releases the entries of the grids vector by calling DeleteGrids(). */ ~PaddedCell() { DeleteGrids(); } /* Constructor: stores the unpadded grid and the depth, sets dims from _grid->Nd(), calls AllocateGrids(), then asserts local[d]>=depth inside a loop over d. NOTE(review): the loop bound and the guard on that assert are partly missing from this view. */ PaddedCell(int _depth,GridCartesian *_grid) { unpadded_grid = _grid; depth=_depth; 
dims=_grid->Nd(); AllocateGrids(); Coordinate local =unpadded_grid->LocalDimensions(); Coordinate procs =unpadded_grid->ProcessorGrid(); for(int d=0;d 1 ) GRID_ASSERT(local[d]>=depth); } } /* DeleteGrids: deletes grids[d] inside a loop over d and then resizes the vector to 0. NOTE(review): the loop bound and the condition guarding the delete are partly missing from this view. */ void DeleteGrids(void) { Coordinate processors=unpadded_grid->_processors; for(int d=0;d 1 ) { delete grids[d]; } } grids.resize(0); }; /* AllocateGrids: starts from the local dimensions, SIMD layout and processor layout of unpadded_grid; the visible fragment adds 2*depth to plocal[d]. NOTE(review): the grid construction that follows is not visible in this view. */ void AllocateGrids(void) { Coordinate local =unpadded_grid->LocalDimensions(); Coordinate simd =unpadded_grid->_simd_layout; Coordinate processors=unpadded_grid->_processors; Coordinate plocal =unpadded_grid->LocalDimensions(); Coordinate global(dims); GridCartesian *old_grid = unpadded_grid; // expand up one dim at a time for(int d=0;d 1 ) { plocal[d] += 2*depth; for(int d=0;d /* Extract(in): creates the result on unpadded_grid; fll is filled with depth and tll with 0 in every dimension. NOTE(review): the per-dimension loop and the copy are not visible here. */ inline Lattice Extract(const Lattice &in) const { Coordinate processors=unpadded_grid->_processors; Lattice out(unpadded_grid); Coordinate local =unpadded_grid->LocalDimensions(); // depends on the MPI spread Coordinate fll(dims,depth); Coordinate tll(dims,0); // depends on the MPI spread for(int d=0;d /* Exchange(in, cshift): cshift defaults to CshiftImplDefault(). Starts from a copy of in (tmp) and loops over d. NOTE(review): loop body not visible; the local dims presumably shadows a member of the same name - confirm. */ inline Lattice Exchange(const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { GridBase *old_grid = in.Grid(); int dims = old_grid->Nd(); Lattice tmp = in; for(int d=0;d /* ExchangePeriodic(in): same visible prologue as Exchange, without a cshift argument. NOTE(review): loop body not visible. */ inline Lattice ExchangePeriodic(const Lattice &in) const { GridBase *old_grid = in.Grid(); int dims = old_grid->Nd(); Lattice tmp = in; for(int d=0;d /* Expand(dim, in, cshift): allocates padded on grids[dim] and shifted on the grid of in. Checks that the grid of in is conformable with unpadded_grid when dim==0, otherwise with grids[dim-1]. When processors[dim]==1 the input is assigned straight to padded; the other branch is only partly visible. */ inline Lattice Expand(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { Coordinate processors=unpadded_grid->_processors; GridBase *old_grid = in.Grid(); GridCartesian *new_grid = grids[dim];//These are new grids Lattice padded(new_grid); Lattice shifted(old_grid); Coordinate local =old_grid->LocalDimensions(); Coordinate plocal =new_grid->LocalDimensions(); if(dim==0) conformable(old_grid,unpadded_grid); else conformable(old_grid,grids[dim-1]); double tins=0, tshift=0; int islocal = 0 ; if ( processors[dim] == 1 ) islocal = 1; if ( islocal ) { // replace with a copy and maybe grid swizzle // return 
in;?? double t = usecond(); padded = in; tins += usecond() - t; } else { ////////////////////////////////////////////// // Replace sequence with // --------------------- // (i) Gather high face(s); start comms // (ii) Gather low face(s); start comms // (iii) Copy middle bit with localCopyRegion // (iv) Complete high face(s), insert slice(s) // (iv) Complete low face(s), insert slice(s) ////////////////////////////////////////////// // Middle bit double t = usecond(); for(int x=0;x /* ExpandPeriodic(dim, in): allocates padded on grids[dim] and applies the same conformable check as Expand. NOTE(review): the remainder of the body is not visible here. */ inline Lattice ExpandPeriodic(int dim, const Lattice &in) const { Coordinate processors=unpadded_grid->_processors; GridBase *old_grid = in.Grid(); GridCartesian *new_grid = grids[dim];//These are new grids Lattice padded(new_grid); // Lattice shifted(old_grid); Coordinate local =old_grid->LocalDimensions(); Coordinate plocal =new_grid->LocalDimensions(); if(dim==0) conformable(old_grid,unpadded_grid); else conformable(old_grid,grids[dim-1]); // std::cout << " dim "< /* Face_exchange(from, to, dimension, depth): asserts depth<=lds[dimension], depth>0 and ld+2*depth==nld, with ld = lds[dimension] and nld = to.Grid()->_ldimensions[dimension]. NOTE(review): the declarations of grid, new_grid and lds are missing from this view. */ void Face_exchange(const Lattice &from, Lattice &to, int dimension,int depth) const { typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_object sobj; RealD t_gather=0.0; RealD t_scatter=0.0; RealD t_comms=0.0; RealD t_copy=0.0; // std::cout << GridLogMessage << "dimension " <_ldimensions; Coordinate nlds= to.Grid()->_ldimensions; Coordinate simd= from.Grid()->_simd_layout; int ld = lds[dimension]; int nld = to.Grid()->_ldimensions[dimension]; const int Nsimd = vobj::Nsimd(); GRID_ASSERT(depth<=lds[dimension]); // A must be on neighbouring node GRID_ASSERT(depth>0); // A caller bug if zero GRID_ASSERT(ld+2*depth==nld); //////////////////////////////////////////////////////////////////////////// // Face size and byte calculations //////////////////////////////////////////////////////////////////////////// int buffer_size = 1; for(int d=0;d_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); static deviceVector send_buf; 
static deviceVector recv_buf; send_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth); #ifndef ACCELERATOR_AWARE_MPI static hostVector hsend_buf; static hostVector hrecv_buf; hsend_buf.resize(buffer_size*2*depth); hrecv_buf.resize(buffer_size*2*depth); #endif std::vector fwd_req; std::vector bwd_req; int words = buffer_size; int bytes = words * sizeof(vobj); /* send_buf and recv_buf are function-local statics resized on every call to hold 2*depth planes of buffer_size elements; the host mirrors exist only when ACCELERATOR_AWARE_MPI is not defined. NOTE(review): bytes is an int initialised from a size_t product - confirm a face can never exceed INT_MAX bytes. NOTE(review): static storage is shared by every call of a given instantiation - confirm there are no concurrent callers. */ //////////////////////////////////////////////////////////////////////////// // Communication coords //////////////////////////////////////////////////////////////////////////// int comm_proc = 1; int xmit_to_rank; int recv_from_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); //////////////////////////////////////////////////////////////////////////// // Gather all surface terms up to depth "d" //////////////////////////////////////////////////////////////////////////// RealD t; RealD t_tot=-usecond(); int plane=0; /* First loop: gathers planes 0..depth-1 of from into slots 0..depth-1 of send_buf and starts a transfer that sends to xmit_to_rank and receives from recv_from_rank, tagged d*1024 + dimension*2+0. */ for ( int d=0;d < depth ; d ++ ) { int tag = d*1024 + dimension*2+0; t=usecond(); GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++; t_gather+=usecond()-t; t=usecond(); #ifdef ACCELERATOR_AWARE_MPI grid->SendToRecvFromBegin(fwd_req, (void *)&send_buf[d*buffer_size], xmit_to_rank, (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); #else acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes); grid->SendToRecvFromBegin(fwd_req, (void *)&hsend_buf[d*buffer_size], xmit_to_rank, (void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag); #endif t_comms+=usecond()-t; } /* Second loop: gathers planes ld-depth..ld-1 into slots depth..2*depth-1 of send_buf and starts the opposite transfer (send to recv_from_rank, receive from xmit_to_rank), tagged d*1024 + dimension*2+1. */ for ( int d=0;d < depth ; d ++ ) { int tag = d*1024 + dimension*2+1; t=usecond(); GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++; t_gather+= usecond() - t; t=usecond(); #ifdef ACCELERATOR_AWARE_MPI grid->SendToRecvFromBegin(bwd_req, (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); #else 
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes); grid->SendToRecvFromBegin(bwd_req, (void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank, (void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); #endif t_comms+=usecond()-t; } //////////////////////////////////////////////////////////////////////////// // Copy interior -- overlap this with comms //////////////////////////////////////////////////////////////////////////// /* Copies a region of size grid->_ldimensions starting at the origin of from into to, placed at offset depth along dimension. */ int Nd = new_grid->Nd(); Coordinate LL(Nd,0); Coordinate sz = grid->_ldimensions; Coordinate toLL(Nd,0); toLL[dimension]=depth; t=usecond(); localCopyRegion(from,to,LL,toLL,sz); t_copy= usecond() - t; //////////////////////////////////////////////////////////////////////////// // Scatter all faces //////////////////////////////////////////////////////////////////////////// /* Data from the forward requests is scattered into planes nld-depth..nld-1 of to; data from the backward requests into planes 0..depth-1. Without ACCELERATOR_AWARE_MPI each received plane is first copied from the host buffer to the device buffer. */ plane=0; t=usecond(); grid->CommsComplete(fwd_req); #ifndef ACCELERATOR_AWARE_MPI for ( int d=0;d < depth ; d ++ ) { acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes); } #endif t_comms+= usecond() - t; t=usecond(); for ( int d=0;d < depth ; d ++ ) { ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; } t_scatter= usecond() - t; t=usecond(); grid->CommsComplete(bwd_req); #ifndef ACCELERATOR_AWARE_MPI for ( int d=0;d < depth ; d ++ ) { acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes); } #endif t_comms+= usecond() - t; t=usecond(); for ( int d=0;d < depth ; d ++ ) { ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++; } t_scatter+= usecond() - t; t_tot+=usecond(); std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<