mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-24 17:54:47 +01:00 
			
		
		
		
	GPU friendly Stencil needs a view
This commit is contained in:
		| @@ -28,6 +28,8 @@ | |||||||
| #ifndef GRID_STENCIL_H | #ifndef GRID_STENCIL_H | ||||||
| #define GRID_STENCIL_H | #define GRID_STENCIL_H | ||||||
|  |  | ||||||
|  | #define STENCIL_MAX (16) | ||||||
|  |  | ||||||
| #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate | #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate | ||||||
| #include <Grid/stencil/Lebesgue.h>   // subdir aggregate | #include <Grid/stencil/Lebesgue.h>   // subdir aggregate | ||||||
|  |  | ||||||
| @@ -99,18 +101,74 @@ struct StencilEntry { | |||||||
|   uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline |   uint16_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline | ||||||
|   uint16_t _pad; |   uint16_t _pad; | ||||||
| }; | }; | ||||||
|  | //////////////////////////////////////// | ||||||
|  | // Copyable view of the stencil tables for use inside accelerator kernels. | ||||||
|  | // Holds only plain data (per-point tables, SIMD layout, raw pointers) plus | ||||||
|  | // accelerator_inline accessors. CartesianStencil derives from this class and | ||||||
|  | // fills the members in; this class itself neither allocates nor frees the | ||||||
|  | // memory behind _entries_p / u_recv_buf_p / u_send_buf_p. | ||||||
|  | //////////////////////////////////////// | ||||||
|  | template<class vobj,class cobj> | ||||||
|  | class CartesianStencilView { | ||||||
|  |  public: | ||||||
|  |   // Per-point integer table; presumably holds at most STENCIL_MAX entries - TODO confirm | ||||||
|  |   typedef AcceleratorVector<int,STENCIL_MAX> StencilVector; | ||||||
|  |  | ||||||
|  |   // Stencil runs along coordinate axes only; NO diagonal fill in. | ||||||
|  |   //////////////////////////////////////// | ||||||
|  |   // Basic Grid and stencil info | ||||||
|  |   //////////////////////////////////////// | ||||||
|  |   // Scalars and pointers carry default initialisers so that a default | ||||||
|  |   // constructed view never holds indeterminate values. | ||||||
|  |   int                               _checkerboard = 0; | ||||||
|  |   int                               _npoints = 0; // Move to template param? | ||||||
|  |   StencilVector _directions;     // indexed by stencil point | ||||||
|  |   StencilVector _distances;      // indexed by stencil point | ||||||
|  |   StencilVector _comm_buf_size;  // indexed by stencil point | ||||||
|  |   StencilVector _permute_type;   // indexed by stencil point | ||||||
|  |   StencilVector same_node;       // NOTE(review): presumably one flag per point - confirm against users | ||||||
|  |   Coordinate                         _simd_layout; | ||||||
|  |   Coordinate                         twists; | ||||||
|  |   StencilEntry*  _entries_p   = nullptr; // entry for (point,osite) lives at point + _npoints*osite | ||||||
|  |   cobj* u_recv_buf_p = nullptr; | ||||||
|  |   cobj* u_send_buf_p = nullptr; | ||||||
|  |  | ||||||
|  |   // Base of the receive buffer. | ||||||
|  |   accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; } | ||||||
|  |  | ||||||
|  |   // _is_local flag of the entry for stencil leg `point` at outer site `osite`. | ||||||
|  |   accelerator_inline int GetNodeLocal(int osite,int point) {  | ||||||
|  |     return this->_entries_p[point+this->_npoints*osite]._is_local; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Writes the permute type of `point` to ptype and returns the address of | ||||||
|  |   // the entry for (point,osite). | ||||||
|  |   accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) {  | ||||||
|  |     ptype = this->_permute_type[point]; | ||||||
|  |     return & this->_entries_p[point+this->_npoints*osite];  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Reports the _is_local and _permute flags of entry `ent` and returns the | ||||||
|  |   // byte address of its data (see GetPFInfo for how the address is formed). | ||||||
|  |   // ptype is written only when the permute flag is non-zero; otherwise the | ||||||
|  |   // caller's value is left untouched. | ||||||
|  |   accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { | ||||||
|  |     local = this->_entries_p[ent]._is_local; | ||||||
|  |     perm  = this->_entries_p[ent]._permute; | ||||||
|  |     if (perm)  ptype = this->_permute_type[point];  | ||||||
|  |     return GetPFInfo(ent,base); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Byte address of the data for entry `ent`: `base` + _byte_offset when the | ||||||
|  |   // entry is local, receive-buffer address + _byte_offset otherwise. | ||||||
|  |   accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) { | ||||||
|  |     uint64_t cbase = reinterpret_cast<uint64_t>(u_recv_buf_p); | ||||||
|  |     int local = this->_entries_p[ent]._is_local; | ||||||
|  |     if (local) return  base + this->_entries_p[ent]._byte_offset; | ||||||
|  |     else       return cbase + this->_entries_p[ent]._byte_offset; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Fills coor from the index `lane` via Lexicographic::CoorFromIndex, | ||||||
|  |   // using _simd_layout as the dimensions. | ||||||
|  |   accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane)  | ||||||
|  |   { | ||||||
|  |     Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | }; | ||||||
| //////////////////////////////////////// | //////////////////////////////////////// | ||||||
| // The Stencil Class itself | // The Stencil Class itself | ||||||
| //////////////////////////////////////// | //////////////////////////////////////// | ||||||
| template<class vobj,class cobj> | template<class vobj,class cobj> | ||||||
| class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. | class CartesianStencil : public CartesianStencilView<vobj,cobj> { // Stencil runs along coordinate axes only; NO diagonal fill in. | ||||||
| public: | public: | ||||||
|  |  | ||||||
|   typedef typename cobj::vector_type vector_type; |   typedef typename cobj::vector_type vector_type; | ||||||
|   typedef typename cobj::scalar_type scalar_type; |   typedef typename cobj::scalar_type scalar_type; | ||||||
|   typedef typename cobj::scalar_object scalar_object; |   typedef typename cobj::scalar_object scalar_object; | ||||||
|  |   typedef CartesianStencilView<vobj,cobj> View_type; | ||||||
|  |   typedef typename View_type::StencilVector StencilVector; | ||||||
|   /////////////////////////////////////////// |   /////////////////////////////////////////// | ||||||
|   // Helper structs |   // Helper structs | ||||||
|   /////////////////////////////////////////// |   /////////////////////////////////////////// | ||||||
| @@ -134,33 +192,23 @@ public: | |||||||
|     Integer buffer_size; |     Integer buffer_size; | ||||||
|   }; |   }; | ||||||
|    |    | ||||||
|   //////////////////////////////////////// |  | ||||||
|   // Basic Grid and stencil info | protected: | ||||||
|   //////////////////////////////////////// |   GridBase *                        _grid; | ||||||
|  |  | ||||||
|  | public:  | ||||||
|  |   // Grid this stencil was built on. | ||||||
|  |   GridBase *Grid(void) const { return _grid; } | ||||||
|  |  | ||||||
|  |   // Returns a by-value copy of the CartesianStencilView base sub-object. | ||||||
|  |   // The copy carries the same raw pointers (_entries_p, u_recv_buf_p, | ||||||
|  |   // u_send_buf_p) as this stencil. | ||||||
|  |   // static_cast to the const base keeps const-correctness; the previous | ||||||
|  |   // C-style pointer cast silently dropped the const of `this`. | ||||||
|  |   View_type View(void) const { | ||||||
|  |     View_type accessor(static_cast<const View_type &>(*this)); | ||||||
|  |     return accessor; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   int face_table_computed; |   int face_table_computed; | ||||||
|   std::vector<std::vector<std::pair<int,int> > > face_table ; |   std::vector<std::vector<std::pair<int,int> > > face_table ; | ||||||
|  |  | ||||||
|  |  | ||||||
|   int                               _checkerboard; |  | ||||||
|   int                               _npoints; // Move to template param? |  | ||||||
| protected: |  | ||||||
|   GridBase *                        _grid; |  | ||||||
| public:  |  | ||||||
|   GridBase *Grid(void) const { return _grid; } |  | ||||||
|   // npoints of these; make it a template param and std::array |  | ||||||
|   std::vector<int>                  _directions; |  | ||||||
|   std::vector<int>                  _distances; |  | ||||||
|   std::vector<int>                  _comm_buf_size; |  | ||||||
|   std::vector<int>                  _permute_type; |  | ||||||
|   Coordinate                        _simd_layout; |  | ||||||
|  |  | ||||||
|   accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane)  |  | ||||||
|   { |  | ||||||
|     Lexicographic::CoorFromIndex(coor,lane,_simd_layout); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   Vector<StencilEntry>  _entries; // Resident in managed memory |   Vector<StencilEntry>  _entries; // Resident in managed memory | ||||||
|   StencilEntry*  _entries_p; |  | ||||||
|   std::vector<Packet> Packets; |   std::vector<Packet> Packets; | ||||||
|   std::vector<Merge> Mergers; |   std::vector<Merge> Mergers; | ||||||
|   std::vector<Merge> MergersSHM; |   std::vector<Merge> MergersSHM; | ||||||
| @@ -173,14 +221,11 @@ public: | |||||||
|   // Vectors that live on the symmetric heap in case of SHMEM |   // Vectors that live on the symmetric heap in case of SHMEM | ||||||
|   // These are used; either SHM objects or refs to the above symmetric heap vectors |   // These are used; either SHM objects or refs to the above symmetric heap vectors | ||||||
|   // depending on comms target |   // depending on comms target | ||||||
|   cobj* u_recv_buf_p; |  | ||||||
|   cobj* u_send_buf_p; |  | ||||||
|   std::vector<cobj *> u_simd_send_buf; |   std::vector<cobj *> u_simd_send_buf; | ||||||
|   std::vector<cobj *> u_simd_recv_buf; |   std::vector<cobj *> u_simd_recv_buf; | ||||||
|  |  | ||||||
|   int u_comm_offset; |   int u_comm_offset; | ||||||
|   int _unified_buffer_size; |   int _unified_buffer_size; | ||||||
|   cobj *CommBuf(void) { return u_recv_buf_p; } |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////// |   ///////////////////////////////////////// | ||||||
|   // Timing info; ugly; possibly temporary |   // Timing info; ugly; possibly temporary | ||||||
| @@ -208,8 +253,8 @@ public: | |||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   inline int SameNode(int point) {  |   inline int SameNode(int point) {  | ||||||
|  |  | ||||||
|     int dimension    = _directions[point]; |     int dimension    = this->_directions[point]; | ||||||
|     int displacement = _distances[point]; |     int displacement = this->_distances[point]; | ||||||
|     assert( (displacement==1) || (displacement==-1)); |     assert( (displacement==1) || (displacement==-1)); | ||||||
|  |  | ||||||
|     int pd              = _grid->_processors[dimension]; |     int pd              = _grid->_processors[dimension]; | ||||||
| @@ -230,37 +275,12 @@ public: | |||||||
|  |  | ||||||
|     _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  |     _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|  |  | ||||||
|     void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p); |     void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); | ||||||
|  |  | ||||||
|     if ( shm==NULL ) return 0; |     if ( shm==NULL ) return 0; | ||||||
|  |  | ||||||
|     return 1; |     return 1; | ||||||
|   } |   } | ||||||
|   accelerator_inline int GetNodeLocal(int osite,int point) {  |  | ||||||
|     return _entries_p[point+_npoints*osite]._is_local; |  | ||||||
|   } |  | ||||||
|   accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) {  |  | ||||||
|     ptype = _permute_type[point]; return & _entries_p[point+_npoints*osite];  |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { |  | ||||||
|     uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; |  | ||||||
|     local = _entries_p[ent]._is_local; |  | ||||||
|     perm  = _entries_p[ent]._permute; |  | ||||||
|     if (perm)  ptype = _permute_type[point];  |  | ||||||
|     if (local) { |  | ||||||
|       return  base + _entries_p[ent]._byte_offset; |  | ||||||
|     } else { |  | ||||||
|       return cbase + _entries_p[ent]._byte_offset; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) { |  | ||||||
|     uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; |  | ||||||
|     int local = _entries_p[ent]._is_local; |  | ||||||
|     if (local) return  base + _entries_p[ent]._byte_offset; |  | ||||||
|     else       return cbase + _entries_p[ent]._byte_offset; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   // Comms packet queue for asynch thread |   // Comms packet queue for asynch thread | ||||||
| @@ -377,8 +397,8 @@ public: | |||||||
|    |    | ||||||
|   template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx) |   template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx) | ||||||
|   { |   { | ||||||
|     int dimension    = _directions[point]; |     int dimension    = this->_directions[point]; | ||||||
|     int displacement = _distances[point]; |     int displacement = this->_distances[point]; | ||||||
|      |      | ||||||
|     int fd = _grid->_fdimensions[dimension]; |     int fd = _grid->_fdimensions[dimension]; | ||||||
|     int rd = _grid->_rdimensions[dimension]; |     int rd = _grid->_rdimensions[dimension]; | ||||||
| @@ -386,29 +406,29 @@ public: | |||||||
|     // Map to always positive shift modulo global full dimension. |     // Map to always positive shift modulo global full dimension. | ||||||
|     int shift = (displacement+fd)%fd; |     int shift = (displacement+fd)%fd; | ||||||
|  |  | ||||||
|     assert (source.Checkerboard()== _checkerboard); |     assert (source.Checkerboard()== this->_checkerboard); | ||||||
|      |      | ||||||
|     // the permute type |     // the permute type | ||||||
|     int simd_layout     = _grid->_simd_layout[dimension]; |     int simd_layout     = _grid->_simd_layout[dimension]; | ||||||
|     int comm_dim        = _grid->_processors[dimension] >1 ; |     int comm_dim        = _grid->_processors[dimension] >1 ; | ||||||
|     int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim); |     int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim); | ||||||
|  |  | ||||||
|     int same_node = 1; |     int is_same_node = 1; | ||||||
|     // Gather phase |     // Gather phase | ||||||
|     int sshift [2]; |     int sshift [2]; | ||||||
|     if ( comm_dim ) { |     if ( comm_dim ) { | ||||||
|       sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); |       sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); | ||||||
|       sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); |       sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); | ||||||
|       if ( sshift[0] == sshift[1] ) { |       if ( sshift[0] == sshift[1] ) { | ||||||
| 	if (splice_dim) { | 	if (splice_dim) { | ||||||
| 	  splicetime-=usecond(); | 	  splicetime-=usecond(); | ||||||
| 	  auto tmp  = GatherSimd(source,dimension,shift,0x3,compress,face_idx); | 	  auto tmp  = GatherSimd(source,dimension,shift,0x3,compress,face_idx); | ||||||
| 	  same_node = same_node && tmp; | 	  is_same_node = is_same_node && tmp; | ||||||
| 	  splicetime+=usecond(); | 	  splicetime+=usecond(); | ||||||
| 	} else {  | 	} else {  | ||||||
| 	  nosplicetime-=usecond(); | 	  nosplicetime-=usecond(); | ||||||
| 	  auto tmp  = Gather(source,dimension,shift,0x3,compress,face_idx); | 	  auto tmp  = Gather(source,dimension,shift,0x3,compress,face_idx); | ||||||
| 	  same_node = same_node && tmp; | 	  is_same_node = is_same_node && tmp; | ||||||
| 	  nosplicetime+=usecond(); | 	  nosplicetime+=usecond(); | ||||||
| 	} | 	} | ||||||
|       } else { |       } else { | ||||||
| @@ -418,18 +438,18 @@ public: | |||||||
| 	  // both with block stride loop iteration | 	  // both with block stride loop iteration | ||||||
| 	  auto tmp1 =  GatherSimd(source,dimension,shift,0x1,compress,face_idx); | 	  auto tmp1 =  GatherSimd(source,dimension,shift,0x1,compress,face_idx); | ||||||
| 	  auto tmp2 =  GatherSimd(source,dimension,shift,0x2,compress,face_idx); | 	  auto tmp2 =  GatherSimd(source,dimension,shift,0x2,compress,face_idx); | ||||||
| 	  same_node = same_node && tmp1 && tmp2; | 	  is_same_node = is_same_node && tmp1 && tmp2; | ||||||
| 	  splicetime+=usecond(); | 	  splicetime+=usecond(); | ||||||
| 	} else { | 	} else { | ||||||
| 	  nosplicetime-=usecond(); | 	  nosplicetime-=usecond(); | ||||||
| 	  auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx); | 	  auto tmp1 = Gather(source,dimension,shift,0x1,compress,face_idx); | ||||||
| 	  auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx); | 	  auto tmp2 = Gather(source,dimension,shift,0x2,compress,face_idx); | ||||||
| 	  same_node = same_node && tmp1 && tmp2; | 	  is_same_node = is_same_node && tmp1 && tmp2; | ||||||
| 	  nosplicetime+=usecond(); | 	  nosplicetime+=usecond(); | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     return same_node; |     return is_same_node; | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   template<class compressor> |   template<class compressor> | ||||||
| @@ -447,7 +467,7 @@ public: | |||||||
|      |      | ||||||
|     // Gather all comms buffers |     // Gather all comms buffers | ||||||
|     int face_idx=0; |     int face_idx=0; | ||||||
|     for(int point = 0 ; point < _npoints; point++) { |     for(int point = 0 ; point < this->_npoints; point++) { | ||||||
|       compress.Point(point); |       compress.Point(point); | ||||||
|       HaloGatherDir(source,compress,point,face_idx); |       HaloGatherDir(source,compress,point,face_idx); | ||||||
|     } |     } | ||||||
| @@ -546,25 +566,30 @@ public: | |||||||
| 		   int checkerboard, | 		   int checkerboard, | ||||||
| 		   const std::vector<int> &directions, | 		   const std::vector<int> &directions, | ||||||
| 		   const std::vector<int> &distances)  | 		   const std::vector<int> &distances)  | ||||||
|     : _permute_type(npoints),  |     : comm_bytes_thr(npoints),  | ||||||
|       _comm_buf_size(npoints), |  | ||||||
|       comm_bytes_thr(npoints),  |  | ||||||
|       comm_enter_thr(npoints), |       comm_enter_thr(npoints), | ||||||
|       comm_leave_thr(npoints),  |       comm_leave_thr(npoints),  | ||||||
|       comm_time_thr(npoints) |       comm_time_thr(npoints) | ||||||
|   { |   { | ||||||
|     face_table_computed=0; |     face_table_computed=0; | ||||||
|     _npoints = npoints; |  | ||||||
|     _grid    = grid; |     _grid    = grid; | ||||||
|     _directions = directions; |  | ||||||
|     _distances  = distances; |     ///////////////////////////////////// | ||||||
|  |     // Initialise the base | ||||||
|  |     ///////////////////////////////////// | ||||||
|  |     this->_npoints = npoints; | ||||||
|  |     this->_comm_buf_size.resize(npoints), | ||||||
|  |     this->_permute_type.resize(npoints),  | ||||||
|  |     this->_simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels | ||||||
|  |     this->_directions = StencilVector(directions); | ||||||
|  |     this->_distances  = StencilVector(distances); | ||||||
|  |  | ||||||
|     _unified_buffer_size=0; |     _unified_buffer_size=0; | ||||||
|     _simd_layout = _grid->_simd_layout; // copy simd_layout to give access to Accelerator Kernels |  | ||||||
|  |  | ||||||
|     int osites  = _grid->oSites(); |     int osites  = _grid->oSites(); | ||||||
|      |      | ||||||
|     _entries.resize(_npoints* osites); |     _entries.resize(this->_npoints* osites); | ||||||
|     _entries_p = &_entries[0]; |     this->_entries_p = &_entries[0]; | ||||||
|     for(int ii=0;ii<npoints;ii++){ |     for(int ii=0;ii<npoints;ii++){ | ||||||
|        |        | ||||||
|       int i = ii; // reverse direction to get SIMD comms done first |       int i = ii; // reverse direction to get SIMD comms done first | ||||||
| @@ -576,9 +601,9 @@ public: | |||||||
|        |        | ||||||
|       int fd = _grid->_fdimensions[dimension]; |       int fd = _grid->_fdimensions[dimension]; | ||||||
|       int rd = _grid->_rdimensions[dimension]; |       int rd = _grid->_rdimensions[dimension]; | ||||||
|       _permute_type[point]=_grid->PermuteType(dimension); |       this->_permute_type[point]=_grid->PermuteType(dimension); | ||||||
|        |        | ||||||
|       _checkerboard = checkerboard; |       this->_checkerboard = checkerboard; | ||||||
|        |        | ||||||
|       ////////////////////////// |       ////////////////////////// | ||||||
|       // the permute type |       // the permute type | ||||||
| @@ -598,8 +623,8 @@ public: | |||||||
|       // live in lattice or a comms buffer. |       // live in lattice or a comms buffer. | ||||||
|       ////////////////////////// |       ////////////////////////// | ||||||
|       if ( !comm_dim ) { |       if ( !comm_dim ) { | ||||||
| 	sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); | 	sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); | ||||||
| 	sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); | 	sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); | ||||||
| 	 | 	 | ||||||
| 	if ( sshift[0] == sshift[1] ) { | 	if ( sshift[0] == sshift[1] ) { | ||||||
| 	  Local(point,dimension,shift,0x3); | 	  Local(point,dimension,shift,0x3); | ||||||
| @@ -610,8 +635,8 @@ public: | |||||||
|       } else {  |       } else {  | ||||||
| 	// All permute extract done in comms phase prior to Stencil application | 	// All permute extract done in comms phase prior to Stencil application | ||||||
| 	//        So tables are the same whether comm_dim or splice_dim | 	//        So tables are the same whether comm_dim or splice_dim | ||||||
| 	sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); | 	sshift[0] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Even); | ||||||
| 	sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); | 	sshift[1] = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,Odd); | ||||||
| 	if ( sshift[0] == sshift[1] ) { | 	if ( sshift[0] == sshift[1] ) { | ||||||
| 	  Comms(point,dimension,shift,0x3); | 	  Comms(point,dimension,shift,0x3); | ||||||
| 	} else { | 	} else { | ||||||
| @@ -630,8 +655,8 @@ public: | |||||||
|  |  | ||||||
|     u_simd_send_buf.resize(Nsimd); |     u_simd_send_buf.resize(Nsimd); | ||||||
|     u_simd_recv_buf.resize(Nsimd); |     u_simd_recv_buf.resize(Nsimd); | ||||||
|     u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |     this->u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); | ||||||
|     u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |     this->u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); | ||||||
|  |  | ||||||
|     for(int l=0;l<2;l++){ |     for(int l=0;l<2;l++){ | ||||||
|       u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |       u_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); | ||||||
| @@ -662,7 +687,7 @@ public: | |||||||
|        |        | ||||||
|       int cb= (cbmask==0x2)? Odd : Even; |       int cb= (cbmask==0x2)? Odd : Even; | ||||||
|        |        | ||||||
|       int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); |       int sshift = _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); | ||||||
|       int sx     = (x+sshift)%rd; |       int sx     = (x+sshift)%rd; | ||||||
|        |        | ||||||
|       int wraparound=0; |       int wraparound=0; | ||||||
| @@ -706,12 +731,12 @@ public: | |||||||
|     // done in reduced dims, so SIMD factored |     // done in reduced dims, so SIMD factored | ||||||
|     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];  |     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];  | ||||||
|  |  | ||||||
|     _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and |     this->_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and | ||||||
|  |  | ||||||
|     // sent to one or more remote nodes. |     // sent to one or more remote nodes. | ||||||
|      |      | ||||||
|     int cb= (cbmask==0x2)? Odd : Even; |     int cb= (cbmask==0x2)? Odd : Even; | ||||||
|     int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); |     int sshift= _grid->CheckerBoardShiftForCB(this->_checkerboard,dimension,shift,cb); | ||||||
|      |      | ||||||
|     for(int x=0;x<rd;x++){        |     for(int x=0;x<rd;x++){        | ||||||
|        |        | ||||||
| @@ -783,7 +808,7 @@ public: | |||||||
|       // Simple block stride gather of SIMD objects |       // Simple block stride gather of SIMD objects | ||||||
|       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ |       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ | ||||||
| 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | ||||||
| 	  int idx=point+(lo+o+b)*_npoints; | 	  int idx=point+(lo+o+b)*this->_npoints; | ||||||
| 	  _entries[idx]._offset  =ro+o+b; | 	  _entries[idx]._offset  =ro+o+b; | ||||||
| 	  _entries[idx]._permute=permute; | 	  _entries[idx]._permute=permute; | ||||||
| 	  _entries[idx]._is_local=1; | 	  _entries[idx]._is_local=1; | ||||||
| @@ -804,7 +829,7 @@ public: | |||||||
| 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b); | 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b); | ||||||
| 	   | 	   | ||||||
| 	  if ( ocb&cbmask ) { | 	  if ( ocb&cbmask ) { | ||||||
| 	    int idx = point+(lo+o+b)*_npoints; | 	    int idx = point+(lo+o+b)*this->_npoints; | ||||||
| 	    _entries[idx]._offset =ro+o+b; | 	    _entries[idx]._offset =ro+o+b; | ||||||
| 	    _entries[idx]._is_local=1; | 	    _entries[idx]._is_local=1; | ||||||
| 	    _entries[idx]._permute=permute; | 	    _entries[idx]._permute=permute; | ||||||
| @@ -831,7 +856,7 @@ public: | |||||||
|       // Simple block stride gather of SIMD objects |       // Simple block stride gather of SIMD objects | ||||||
|       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ |       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ | ||||||
| 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | ||||||
| 	  int idx=point+(so+o+b)*_npoints; | 	  int idx=point+(so+o+b)*this->_npoints; | ||||||
| 	  _entries[idx]._offset  =offset+(bo++); | 	  _entries[idx]._offset  =offset+(bo++); | ||||||
| 	  _entries[idx]._is_local=0; | 	  _entries[idx]._is_local=0; | ||||||
| 	  _entries[idx]._permute=0; | 	  _entries[idx]._permute=0; | ||||||
| @@ -851,7 +876,7 @@ public: | |||||||
| 	   | 	   | ||||||
| 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||||
| 	  if ( ocb & cbmask ) { | 	  if ( ocb & cbmask ) { | ||||||
| 	    int idx = point+(so+o+b)*_npoints; | 	    int idx = point+(so+o+b)*this->_npoints; | ||||||
| 	    _entries[idx]._offset  =offset+(bo++); | 	    _entries[idx]._offset  =offset+(bo++); | ||||||
| 	    _entries[idx]._is_local=0; | 	    _entries[idx]._is_local=0; | ||||||
| 	    _entries[idx]._permute =0; | 	    _entries[idx]._permute =0; | ||||||
| @@ -922,16 +947,16 @@ public: | |||||||
| 	if ( compress.DecompressionStep() ) { | 	if ( compress.DecompressionStep() ) { | ||||||
| 	  recv_buf=u_simd_recv_buf[0]; | 	  recv_buf=u_simd_recv_buf[0]; | ||||||
| 	} else { | 	} else { | ||||||
| 	  recv_buf=u_recv_buf_p; | 	  recv_buf=this->u_recv_buf_p; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); | 	send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); | ||||||
| 	if ( send_buf==NULL ) {  | 	if ( send_buf==NULL ) {  | ||||||
| 	  send_buf = u_send_buf_p; | 	  send_buf = this->u_send_buf_p; | ||||||
| 	}  | 	}  | ||||||
| 	 | 	 | ||||||
| 	// Find out if we get the direct copy. | 	// Find out if we get the direct copy. | ||||||
| 	void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_send_buf_p); | 	void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p); | ||||||
| 	if (success==NULL) {  | 	if (success==NULL) {  | ||||||
| 	  // we found a packet that comes from MPI and contributes to this leg of stencil | 	  // we found a packet that comes from MPI and contributes to this leg of stencil | ||||||
| 	  shm_receive_only = 0; | 	  shm_receive_only = 0; | ||||||
| @@ -945,11 +970,11 @@ public: | |||||||
| 	if ( compress.DecompressionStep() ) { | 	if ( compress.DecompressionStep() ) { | ||||||
| 	   | 	   | ||||||
| 	  if ( shm_receive_only ) { // Early decompress before MPI is finished is possible | 	  if ( shm_receive_only ) { // Early decompress before MPI is finished is possible | ||||||
| 	    AddDecompress(&u_recv_buf_p[u_comm_offset], | 	    AddDecompress(&this->u_recv_buf_p[u_comm_offset], | ||||||
| 			  &recv_buf[u_comm_offset], | 			  &recv_buf[u_comm_offset], | ||||||
| 			  words,DecompressionsSHM); | 			  words,DecompressionsSHM); | ||||||
| 	  } else { // Decompress after MPI is finished | 	  } else { // Decompress after MPI is finished | ||||||
| 	    AddDecompress(&u_recv_buf_p[u_comm_offset], | 	    AddDecompress(&this->u_recv_buf_p[u_comm_offset], | ||||||
| 			  &recv_buf[u_comm_offset], | 			  &recv_buf[u_comm_offset], | ||||||
| 			  words,Decompressions); | 			  words,Decompressions); | ||||||
| 	  } | 	  } | ||||||
| @@ -962,7 +987,7 @@ public: | |||||||
|  |  | ||||||
| 	} else { | 	} else { | ||||||
| 	  AddPacket((void *)&send_buf[u_comm_offset], | 	  AddPacket((void *)&send_buf[u_comm_offset], | ||||||
| 		    (void *)&u_recv_buf_p[u_comm_offset], | 		    (void *)&this->u_recv_buf_p[u_comm_offset], | ||||||
| 		    xmit_to_rank, | 		    xmit_to_rank, | ||||||
| 		    recv_from_rank, | 		    recv_from_rank, | ||||||
| 		    bytes); | 		    bytes); | ||||||
| @@ -1072,8 +1097,8 @@ public: | |||||||
| 	    if (shm==NULL) {  | 	    if (shm==NULL) {  | ||||||
| 	      shm = rp; | 	      shm = rp; | ||||||
| 	      // we found a packet that comes from MPI and contributes to this shift. | 	      // we found a packet that comes from MPI and contributes to this shift. | ||||||
| 	      // same_node is only used in the WilsonStencil, and gets set for this point in the stencil. | 	      // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil. | ||||||
| 	      // Kernel will add the exterior_terms except if same_node. | 	      // Kernel will add the exterior_terms except if is_same_node. | ||||||
| 	      shm_receive_only = 0; | 	      shm_receive_only = 0; | ||||||
| 	      // leg of stencil | 	      // leg of stencil | ||||||
| 	    } | 	    } | ||||||
| @@ -1092,9 +1117,9 @@ public: | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if ( shm_receive_only ) {  | 	if ( shm_receive_only ) {  | ||||||
| 	  AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); | 	  AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); | ||||||
| 	} else { | 	} else { | ||||||
| 	  AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); | 	  AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	u_comm_offset     +=buffer_size; | 	u_comm_offset     +=buffer_size; | ||||||
| @@ -1109,7 +1134,7 @@ public: | |||||||
|     mpi3synctime=0.; |     mpi3synctime=0.; | ||||||
|     mpi3synctime_g=0.; |     mpi3synctime_g=0.; | ||||||
|     shmmergetime=0.; |     shmmergetime=0.; | ||||||
|     for(int i=0;i<_npoints;i++){ |     for(int i=0;i<this->_npoints;i++){ | ||||||
|       comm_time_thr[i]=0; |       comm_time_thr[i]=0; | ||||||
|       comm_bytes_thr[i]=0; |       comm_bytes_thr[i]=0; | ||||||
|       comm_enter_thr[i]=0; |       comm_enter_thr[i]=0; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user