mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-31 03:54:33 +00:00 
			
		
		
		
	Remove partial dirichlet. Favour intro reduced prec comms options
This commit is contained in:
		| @@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid); | |||||||
| // These can move into a params header and be given MacroMagic serialisation | // These can move into a params header and be given MacroMagic serialisation | ||||||
| struct DefaultImplParams { | struct DefaultImplParams { | ||||||
|   Coordinate dirichlet; // Blocksize of dirichlet BCs |   Coordinate dirichlet; // Blocksize of dirichlet BCs | ||||||
|   int  partialDirichlet; |   //  int  partialDirichlet; | ||||||
|   DefaultImplParams()  { |   DefaultImplParams()  { | ||||||
|     dirichlet.resize(0); |     dirichlet.resize(0); | ||||||
|     partialDirichlet=0; |     //    partialDirichlet=0; | ||||||
|   }; |   }; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -113,8 +113,8 @@ class CartesianStencilAccelerator { | |||||||
|   /////////////////////////////////////////////////// |   /////////////////////////////////////////////////// | ||||||
|   // If true, this is partially communicated per face |   // If true, this is partially communicated per face | ||||||
|   /////////////////////////////////////////////////// |   /////////////////////////////////////////////////// | ||||||
|   StencilVector _comms_partial_send;  |   //  StencilVector _comms_partial_send;  | ||||||
|   StencilVector _comms_partial_recv; |   //  StencilVector _comms_partial_recv; | ||||||
|   // |   // | ||||||
|   StencilVector _comm_buf_size; |   StencilVector _comm_buf_size; | ||||||
|   StencilVector _permute_type; |   StencilVector _permute_type; | ||||||
| @@ -205,6 +205,8 @@ public: | |||||||
|   struct Packet { |   struct Packet { | ||||||
|     void * send_buf; |     void * send_buf; | ||||||
|     void * recv_buf; |     void * recv_buf; | ||||||
|  |     void * compressed_send_buf; | ||||||
|  |     void * compressed_recv_buf; | ||||||
| #ifndef ACCELERATOR_AWARE_MPI | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE |     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE | ||||||
|     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE |     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE | ||||||
| @@ -215,6 +217,8 @@ public: | |||||||
|     Integer do_recv; |     Integer do_recv; | ||||||
|     Integer xbytes; |     Integer xbytes; | ||||||
|     Integer rbytes; |     Integer rbytes; | ||||||
|  |     Integer xbytes_compressed; | ||||||
|  |     Integer rbytes_compressed; | ||||||
|   }; |   }; | ||||||
|   struct Merge { |   struct Merge { | ||||||
|     static constexpr int Nsimd = vobj::Nsimd(); |     static constexpr int Nsimd = vobj::Nsimd(); | ||||||
| @@ -223,7 +227,7 @@ public: | |||||||
|     std::vector<cobj *> vpointers; |     std::vector<cobj *> vpointers; | ||||||
|     Integer buffer_size; |     Integer buffer_size; | ||||||
|     Integer type; |     Integer type; | ||||||
|     Integer partial; // partial dirichlet BCs |     //    Integer partial; // partial dirichlet BCs | ||||||
|     Coordinate dims; |     Coordinate dims; | ||||||
|   }; |   }; | ||||||
|   struct Decompress { |   struct Decompress { | ||||||
| @@ -231,7 +235,7 @@ public: | |||||||
|     cobj * kernel_p; |     cobj * kernel_p; | ||||||
|     cobj * mpi_p; |     cobj * mpi_p; | ||||||
|     Integer buffer_size; |     Integer buffer_size; | ||||||
|     Integer partial; // partial dirichlet BCs |     //    Integer partial; // partial dirichlet BCs | ||||||
|     Coordinate dims; |     Coordinate dims; | ||||||
|   }; |   }; | ||||||
|   struct CopyReceiveBuffer { |   struct CopyReceiveBuffer { | ||||||
| @@ -255,6 +259,12 @@ protected: | |||||||
| public: | public: | ||||||
|   GridBase *Grid(void) const { return _grid; } |   GridBase *Grid(void) const { return _grid; } | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////// | ||||||
|  |   // Control reduced precision comms | ||||||
|  |   ///////////////////////////////////////////////////////// | ||||||
|  |   int SloppyComms; | ||||||
|  |   void SetSloppyComms(int sloppy) { SloppyComms = sloppy; }; | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Needed to conveniently communicate gparity parameters into GPU memory |   // Needed to conveniently communicate gparity parameters into GPU memory | ||||||
|   // without adding parameters. Perhaps a template parameter to StencilView is |   // without adding parameters. Perhaps a template parameter to StencilView is | ||||||
| @@ -268,7 +278,7 @@ public: | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   int face_table_computed; |   int face_table_computed; | ||||||
|   int partialDirichlet; |   //  int partialDirichlet; | ||||||
|   int fullDirichlet; |   int fullDirichlet; | ||||||
|   std::vector<deviceVector<std::pair<int,int> > > face_table ; |   std::vector<deviceVector<std::pair<int,int> > > face_table ; | ||||||
|   deviceVector<int> surface_list; |   deviceVector<int> surface_list; | ||||||
| @@ -361,6 +371,118 @@ public: | |||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Non blocking send and receive. Necessarily parallel. |   // Non blocking send and receive. Necessarily parallel. | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|  |   //////////////////////////////////////////////////////////////////////// | ||||||
|  |   // DecompressPacket: widen a received reduced precision packet in place. | ||||||
|  |   // | ||||||
|  |   // Does nothing unless SloppyComms is set and packet.do_recv is non-zero. | ||||||
|  |   // Otherwise the leading packet.rbytes/sizeof(word) half-width words of | ||||||
|  |   // packet.recv_buf are expanded to full-width words: each half word becomes | ||||||
|  |   // the upper half of the result and the lower half is zero filled. | ||||||
|  |   // "word" is getPrecision<cobj>::real_scalar_type; only 8 and 4 byte words | ||||||
|  |   // are handled, any other size asserts. | ||||||
|  |   //////////////////////////////////////////////////////////////////////// | ||||||
|  |   void DecompressPacket(Packet &packet) | ||||||
|  |   { | ||||||
|  |     if ( !SloppyComms ) return; | ||||||
|  |  | ||||||
|  |     if ( packet.do_recv ) { | ||||||
|  |  | ||||||
|  |       typedef typename getPrecision<cobj>::real_scalar_type word; | ||||||
|  |       uint64_t words = packet.rbytes/sizeof(word); | ||||||
|  |       const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word); | ||||||
|  |       const uint64_t outer = words/nsimd; | ||||||
|  |  | ||||||
|  |       if(sizeof(word)==8) { | ||||||
|  |  | ||||||
|  | 	// Can either choose to represent as float vs double and prec change | ||||||
|  | 	// OR | ||||||
|  | 	// truncate the mantissa bfp16 style | ||||||
|  |  | ||||||
|  | 	// Staging area, grown on demand and reused between calls | ||||||
|  | 	static deviceVector<uint32_t> compression_buffer; | ||||||
|  |  | ||||||
|  | 	if(words > compression_buffer.size() ) compression_buffer.resize(words); | ||||||
|  |  | ||||||
|  | 	// Both views alias the receive buffer itself: cast the pointer held in | ||||||
|  | 	// the packet, not the address of the Packet member that stores it. | ||||||
|  | 	uint64_t *fbuf =(uint64_t *) packet.recv_buf; | ||||||
|  | 	uint32_t *fhbuf=(uint32_t *) packet.recv_buf; | ||||||
|  | 	uint32_t *hbuf =(uint32_t *) &compression_buffer[0]; | ||||||
|  | 	// Stage the half words first: widening directly inside recv_buf would | ||||||
|  | 	// overwrite half words that have not been read yet. | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision | ||||||
|  | 	}); | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  fbuf[ss*nsimd+lane] = ((uint64_t)hbuf[ss*nsimd+lane])<<32; //copy back and pad each word with zeroes | ||||||
|  | 	}); | ||||||
|  |       } else if ( sizeof(word)==4){ | ||||||
|  | 	// Can either choose to represent as half vs float and prec change | ||||||
|  | 	// OR | ||||||
|  | 	// truncate the mantissa bfp16 style | ||||||
|  |  | ||||||
|  | 	// Staging area, grown on demand and reused between calls | ||||||
|  | 	static deviceVector<uint16_t> compression_buffer; | ||||||
|  |  | ||||||
|  | 	if(words > compression_buffer.size() ) compression_buffer.resize(words); | ||||||
|  |  | ||||||
|  | 	uint32_t *fbuf =(uint32_t *) packet.recv_buf; | ||||||
|  | 	uint16_t *fhbuf=(uint16_t *) packet.recv_buf; | ||||||
|  | 	uint16_t *hbuf =(uint16_t *) &compression_buffer[0]; | ||||||
|  | 	// Stage the half words first, as in the 8 byte branch above | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision | ||||||
|  | 	}); | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes | ||||||
|  | 	}); | ||||||
|  |       } else { | ||||||
|  | 	assert(0 && "unknown floating point precision"); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   //////////////////////////////////////////////////////////////////////// | ||||||
|  |   // CompressPacket: truncate an outgoing packet to half-width words in place | ||||||
|  |   // and record the byte counts to put on the wire. | ||||||
|  |   // | ||||||
|  |   // With SloppyComms unset the compressed byte counts simply equal | ||||||
|  |   // packet.xbytes / packet.rbytes and the buffers are untouched. | ||||||
|  |   // With SloppyComms set, and packet.do_send non-zero, every full-width word | ||||||
|  |   // of packet.send_buf is replaced by its upper half, packed contiguously at | ||||||
|  |   // the start of send_buf; the compressed byte counts are then half of | ||||||
|  |   // xbytes / rbytes (set whether or not do_send is non-zero). | ||||||
|  |   // "word" is getPrecision<cobj>::real_scalar_type; only 8 and 4 byte words | ||||||
|  |   // are handled, any other size asserts. | ||||||
|  |   //////////////////////////////////////////////////////////////////////// | ||||||
|  |   void CompressPacket(Packet &packet) | ||||||
|  |   { | ||||||
|  |     if ( !SloppyComms ) { | ||||||
|  |       packet.xbytes_compressed = packet.xbytes; | ||||||
|  |       packet.rbytes_compressed = packet.rbytes; | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     typedef typename getPrecision<cobj>::real_scalar_type word; | ||||||
|  |     uint64_t words = packet.xbytes/sizeof(word); | ||||||
|  |     const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word); | ||||||
|  |     const uint64_t outer = words/nsimd; | ||||||
|  |  | ||||||
|  |     if (packet.do_send) { | ||||||
|  |  | ||||||
|  |       if(sizeof(word)==8) { | ||||||
|  |  | ||||||
|  | 	// Staging area, grown on demand and reused between calls | ||||||
|  | 	static deviceVector<uint32_t> compression_buffer; | ||||||
|  |  | ||||||
|  | 	if(words > compression_buffer.size() ) compression_buffer.resize(words); | ||||||
|  |  | ||||||
|  | 	uint64_t *fbuf =(uint64_t *) packet.send_buf; | ||||||
|  | 	uint32_t *fhbuf=(uint32_t *) packet.send_buf; | ||||||
|  | 	uint32_t *hbuf =(uint32_t *) &compression_buffer[0]; | ||||||
|  |  | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>32; // truncate and copy | ||||||
|  | 	}); | ||||||
|  | 	// Pack the truncated words at the front of send_buf; lane is declared | ||||||
|  | 	// here exactly as in the truncation kernel above. | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back | ||||||
|  | 	}); | ||||||
|  |  | ||||||
|  |       } else if ( sizeof(word)==4){ | ||||||
|  |  | ||||||
|  | 	// Staging area, grown on demand and reused between calls | ||||||
|  | 	static deviceVector<uint16_t> compression_buffer; | ||||||
|  |  | ||||||
|  | 	if(words > compression_buffer.size() ) compression_buffer.resize(words); | ||||||
|  |  | ||||||
|  | 	uint32_t *fbuf =(uint32_t *) packet.send_buf; | ||||||
|  | 	uint16_t *fhbuf=(uint16_t *) packet.send_buf; | ||||||
|  | 	uint16_t *hbuf =(uint16_t *) &compression_buffer[0]; | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // truncate and copy | ||||||
|  | 	}); | ||||||
|  | 	accelerator_for(ss,outer,nsimd,{ | ||||||
|  | 	  int lane = acceleratorSIMTlane(nsimd); | ||||||
|  | 	  fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back | ||||||
|  | 	}); | ||||||
|  |       } else { | ||||||
|  | 	assert(0 && "unknown floating point precision"); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     // Half-width words on the wire: both directions carry half the bytes | ||||||
|  |     packet.xbytes_compressed = packet.xbytes/2; | ||||||
|  |     packet.rbytes_compressed = packet.rbytes/2; | ||||||
|  |  | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     //    std::cout << "Communicate Begin "<<std::endl; |     //    std::cout << "Communicate Begin "<<std::endl; | ||||||
| @@ -371,6 +493,9 @@ public: | |||||||
|     //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer |     //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer | ||||||
|                                // But the HaloGather had a barrier too. |                                // But the HaloGather had a barrier too. | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |  | ||||||
|  |       this->CompressPacket(Packets[i]); | ||||||
|  |  | ||||||
|       //      std::cout << "Communicate prepare "<<i<<std::endl; |       //      std::cout << "Communicate prepare "<<i<<std::endl; | ||||||
|       //      _grid->Barrier(); |       //      _grid->Barrier(); | ||||||
|       _grid->StencilSendToRecvFromPrepare(MpiReqs, |       _grid->StencilSendToRecvFromPrepare(MpiReqs, | ||||||
| @@ -378,7 +503,7 @@ public: | |||||||
| 					  Packets[i].to_rank,Packets[i].do_send, | 					  Packets[i].to_rank,Packets[i].do_send, | ||||||
| 					  Packets[i].recv_buf, | 					  Packets[i].recv_buf, | ||||||
| 					  Packets[i].from_rank,Packets[i].do_recv, | 					  Packets[i].from_rank,Packets[i].do_recv, | ||||||
| 					  Packets[i].xbytes,Packets[i].rbytes,i); | 					  Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); | ||||||
|     } |     } | ||||||
|     //    std::cout << "Communicate PollDtoH "<<std::endl; |     //    std::cout << "Communicate PollDtoH "<<std::endl; | ||||||
|     //    _grid->Barrier(); |     //    _grid->Barrier(); | ||||||
| @@ -394,14 +519,14 @@ public: | |||||||
| 					Packets[i].to_rank,Packets[i].do_send, | 					Packets[i].to_rank,Packets[i].do_send, | ||||||
| 					Packets[i].recv_buf, | 					Packets[i].recv_buf, | ||||||
| 					Packets[i].from_rank,Packets[i].do_recv, | 					Packets[i].from_rank,Packets[i].do_recv, | ||||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); | 					Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); | ||||||
|     } |     } | ||||||
|     FlightRecorder::StepLog("Communicate begin has finished"); |     FlightRecorder::StepLog("Communicate begin has finished"); | ||||||
|     // Get comms started then run checksums |     // Get comms started then run checksums | ||||||
|     // Having this PRIOR to the dslash seems to make Sunspot work... (!) |     // Having this PRIOR to the dslash seems to make Sunspot work... (!) | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       if ( Packets[i].do_send ) |       if ( Packets[i].do_send ) | ||||||
| 	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes); | 	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes_compressed); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -416,14 +541,15 @@ public: | |||||||
|     //    std::cout << "Communicate Complete Complete "<<std::endl; |     //    std::cout << "Communicate Complete Complete "<<std::endl; | ||||||
|     //    _grid->Barrier(); |     //    _grid->Barrier(); | ||||||
|     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done |     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done | ||||||
|     if   ( this->partialDirichlet ) DslashLogPartial(); |     //    if   ( this->partialDirichlet ) DslashLogPartial(); | ||||||
|     else if ( this->fullDirichlet ) DslashLogDirichlet(); |     if ( this->fullDirichlet ) DslashLogDirichlet(); | ||||||
|     else DslashLogFull(); |     else DslashLogFull(); | ||||||
|     //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete |     //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete | ||||||
|     //    accelerator_barrier();  |     //    accelerator_barrier();  | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       if ( Packets[i].do_recv ) |       if ( Packets[i].do_recv ) | ||||||
| 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); | 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank); | ||||||
|  |       this->DecompressPacket(Packets[i]); | ||||||
|     } |     } | ||||||
|     FlightRecorder::StepLog("Finish communicate complete"); |     FlightRecorder::StepLog("Finish communicate complete"); | ||||||
|   } |   } | ||||||
| @@ -618,7 +744,7 @@ public: | |||||||
|   } |   } | ||||||
|   void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) { |   void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) { | ||||||
|     Decompress d; |     Decompress d; | ||||||
|     d.partial  = this->partialDirichlet; |     //    d.partial  = this->partialDirichlet; | ||||||
|     d.dims     = _grid->_fdimensions; |     d.dims     = _grid->_fdimensions; | ||||||
|     d.kernel_p = k_p; |     d.kernel_p = k_p; | ||||||
|     d.mpi_p    = m_p; |     d.mpi_p    = m_p; | ||||||
| @@ -627,7 +753,7 @@ public: | |||||||
|   } |   } | ||||||
|   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { |   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { | ||||||
|     Merge m; |     Merge m; | ||||||
|     m.partial  = this->partialDirichlet; |     //    m.partial  = this->partialDirichlet; | ||||||
|     m.dims     = _grid->_fdimensions; |     m.dims     = _grid->_fdimensions; | ||||||
|     m.type     = type; |     m.type     = type; | ||||||
|     m.mpointer = merge_p; |     m.mpointer = merge_p; | ||||||
| @@ -732,8 +858,8 @@ public: | |||||||
|       int block = dirichlet_block[dimension]; |       int block = dirichlet_block[dimension]; | ||||||
|       this->_comms_send[ii] = comm_dim; |       this->_comms_send[ii] = comm_dim; | ||||||
|       this->_comms_recv[ii] = comm_dim; |       this->_comms_recv[ii] = comm_dim; | ||||||
|       this->_comms_partial_send[ii] = 0; |       //      this->_comms_partial_send[ii] = 0; | ||||||
|       this->_comms_partial_recv[ii] = 0; |       //      this->_comms_partial_recv[ii] = 0; | ||||||
|       if ( block && comm_dim ) { |       if ( block && comm_dim ) { | ||||||
| 	assert(abs(displacement) < ld ); | 	assert(abs(displacement) < ld ); | ||||||
| 	// Quiesce communication across block boundaries | 	// Quiesce communication across block boundaries | ||||||
| @@ -754,10 +880,10 @@ public: | |||||||
| 	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0; | 	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0; | ||||||
| 	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0; | 	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0; | ||||||
| 	} | 	} | ||||||
| 	if ( partialDirichlet ) { | 	//	if ( partialDirichlet ) { | ||||||
| 	  this->_comms_partial_send[ii] = !this->_comms_send[ii]; | 	//	  this->_comms_partial_send[ii] = !this->_comms_send[ii]; | ||||||
| 	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; | 	//	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; | ||||||
| 	} | 	//	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -769,6 +895,7 @@ public: | |||||||
| 		   Parameters p=Parameters(), | 		   Parameters p=Parameters(), | ||||||
| 		   bool preserve_shm=false) | 		   bool preserve_shm=false) | ||||||
|   { |   { | ||||||
|  |     SloppyComms = 0; | ||||||
|     face_table_computed=0; |     face_table_computed=0; | ||||||
|     _grid    = grid; |     _grid    = grid; | ||||||
|     this->parameters=p; |     this->parameters=p; | ||||||
| @@ -786,7 +913,7 @@ public: | |||||||
|     this->same_node.resize(npoints); |     this->same_node.resize(npoints); | ||||||
|  |  | ||||||
|     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); |     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); | ||||||
|     partialDirichlet = p.partialDirichlet; |     //    partialDirichlet = p.partialDirichlet; | ||||||
|     DirichletBlock(p.dirichlet); // comms send/recv set up |     DirichletBlock(p.dirichlet); // comms send/recv set up | ||||||
|     fullDirichlet=0; |     fullDirichlet=0; | ||||||
|     for(int d=0;d<p.dirichlet.size();d++){ |     for(int d=0;d<p.dirichlet.size();d++){ | ||||||
| @@ -935,7 +1062,8 @@ public: | |||||||
|     GridBase *grid=_grid; |     GridBase *grid=_grid; | ||||||
|     const int Nsimd = grid->Nsimd(); |     const int Nsimd = grid->Nsimd(); | ||||||
|  |  | ||||||
|     int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ; |     //    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ; | ||||||
|  |     int comms_recv      = this->_comms_recv[point]; | ||||||
|     int fd              = _grid->_fdimensions[dimension]; |     int fd              = _grid->_fdimensions[dimension]; | ||||||
|     int ld              = _grid->_ldimensions[dimension]; |     int ld              = _grid->_ldimensions[dimension]; | ||||||
|     int rd              = _grid->_rdimensions[dimension]; |     int rd              = _grid->_rdimensions[dimension]; | ||||||
| @@ -1124,8 +1252,8 @@ public: | |||||||
|  |  | ||||||
|     int comms_send   = this->_comms_send[point]; |     int comms_send   = this->_comms_send[point]; | ||||||
|     int comms_recv   = this->_comms_recv[point]; |     int comms_recv   = this->_comms_recv[point]; | ||||||
|     int comms_partial_send   = this->_comms_partial_send[point] ; |     //    int comms_partial_send   = this->_comms_partial_send[point] ; | ||||||
|     int comms_partial_recv   = this->_comms_partial_recv[point] ; |     //    int comms_partial_recv   = this->_comms_partial_recv[point] ; | ||||||
|      |      | ||||||
|     assert(rhs.Grid()==_grid); |     assert(rhs.Grid()==_grid); | ||||||
|     //	  conformable(_grid,rhs.Grid()); |     //	  conformable(_grid,rhs.Grid()); | ||||||
| @@ -1160,11 +1288,11 @@ public: | |||||||
| 	int rbytes; | 	int rbytes; | ||||||
|  |  | ||||||
| 	if ( comms_send ) xbytes = bytes; // Full send | 	if ( comms_send ) xbytes = bytes; // Full send | ||||||
| 	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); | 	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); | ||||||
| 	else xbytes = 0; // full dirichlet | 	else xbytes = 0; // full dirichlet | ||||||
|  |  | ||||||
| 	if ( comms_recv ) rbytes = bytes; | 	if ( comms_recv ) rbytes = bytes; | ||||||
| 	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); | 	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); | ||||||
| 	else rbytes = 0; | 	else rbytes = 0; | ||||||
| 	 | 	 | ||||||
| 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane | 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane | ||||||
| @@ -1191,7 +1319,8 @@ public: | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  |  | ||||||
| 	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) { | 	//	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) { | ||||||
|  | 	if ( compress.DecompressionStep()&&comms_recv) { | ||||||
| 	  recv_buf=u_simd_recv_buf[0]; | 	  recv_buf=u_simd_recv_buf[0]; | ||||||
| 	} else { | 	} else { | ||||||
| 	  recv_buf=this->u_recv_buf_p; | 	  recv_buf=this->u_recv_buf_p; | ||||||
| @@ -1225,7 +1354,8 @@ public: | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
| 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl; | 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl; | ||||||
| 	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send); | 	//	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send); | ||||||
|  | 	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0); | ||||||
|  |  | ||||||
|         int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask); |         int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask); | ||||||
| 	if ( !duplicate ) { // Force comms for now | 	if ( !duplicate ) { // Force comms for now | ||||||
| @@ -1234,8 +1364,8 @@ public: | |||||||
| 	  // Build a list of things to do after we synchronise GPUs | 	  // Build a list of things to do after we synchronise GPUs | ||||||
| 	  // Start comms now??? | 	  // Start comms now??? | ||||||
| 	  /////////////////////////////////////////////////////////// | 	  /////////////////////////////////////////////////////////// | ||||||
| 	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); | 	  int do_send = (comms_send) && (!shm_send ); | ||||||
| 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); | 	  int do_recv = (comms_send) && (!shm_recv ); | ||||||
| 	  AddPacket((void *)&send_buf[comm_off], | 	  AddPacket((void *)&send_buf[comm_off], | ||||||
| 		    (void *)&recv_buf[comm_off], | 		    (void *)&recv_buf[comm_off], | ||||||
| 		    xmit_to_rank, do_send, | 		    xmit_to_rank, do_send, | ||||||
| @@ -1243,7 +1373,7 @@ public: | |||||||
| 		    xbytes,rbytes); | 		    xbytes,rbytes); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) { | 	if ( (compress.DecompressionStep() && comms_recv) ) { | ||||||
| 	  AddDecompress(&this->u_recv_buf_p[comm_off], | 	  AddDecompress(&this->u_recv_buf_p[comm_off], | ||||||
| 			&recv_buf[comm_off], | 			&recv_buf[comm_off], | ||||||
| 			words,Decompressions); | 			words,Decompressions); | ||||||
| @@ -1265,8 +1395,8 @@ public: | |||||||
|  |  | ||||||
|     int comms_send   = this->_comms_send[point]; |     int comms_send   = this->_comms_send[point]; | ||||||
|     int comms_recv   = this->_comms_recv[point]; |     int comms_recv   = this->_comms_recv[point]; | ||||||
|     int comms_partial_send   = this->_comms_partial_send[point] ; |     //    int comms_partial_send   = this->_comms_partial_send[point] ; | ||||||
|     int comms_partial_recv   = this->_comms_partial_recv[point] ; |     //    int comms_partial_recv   = this->_comms_partial_recv[point] ; | ||||||
|  |  | ||||||
|     int fd = _grid->_fdimensions[dimension]; |     int fd = _grid->_fdimensions[dimension]; | ||||||
|     int rd = _grid->_rdimensions[dimension]; |     int rd = _grid->_rdimensions[dimension]; | ||||||
| @@ -1341,18 +1471,20 @@ public: | |||||||
|  |  | ||||||
| 	 | 	 | ||||||
| 	if ( comms_send ) xbytes = bytes; | 	if ( comms_send ) xbytes = bytes; | ||||||
| 	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); | 	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); | ||||||
| 	else xbytes = 0; | 	else xbytes = 0; | ||||||
|  |  | ||||||
| 	if ( comms_recv ) rbytes = bytes; | 	if ( comms_recv ) rbytes = bytes; | ||||||
| 	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); | 	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); | ||||||
| 	else rbytes = 0; | 	else rbytes = 0; | ||||||
|  |  | ||||||
| 	// Gathers SIMD lanes for send and merge | 	// Gathers SIMD lanes for send and merge | ||||||
| 	// Different faces can be full comms or partial comms with  multiple ranks per node | 	// Different faces can be full comms or partial comms with  multiple ranks per node | ||||||
| 	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) { | 	//	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) { | ||||||
|  | 	if ( comms_send || comms_recv ) { | ||||||
|  |  | ||||||
| 	  int partial = partialDirichlet; | 	  //	  int partial = partialDirichlet; | ||||||
|  | 	  int partial = 0; | ||||||
| 	  compressor::Gather_plane_exchange(face_table[face_idx],rhs, | 	  compressor::Gather_plane_exchange(face_table[face_idx],rhs, | ||||||
| 					    spointers,dimension,sx,cbmask, | 					    spointers,dimension,sx,cbmask, | ||||||
| 					    compress,permute_type,partial ); | 					    compress,permute_type,partial ); | ||||||
| @@ -1418,7 +1550,8 @@ public: | |||||||
| 	      if ( (bytes != rbytes) && (rbytes!=0) ){ | 	      if ( (bytes != rbytes) && (rbytes!=0) ){ | ||||||
| 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero | 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero | ||||||
| 	      } | 	      } | ||||||
| 	      int do_send = (comms_send|comms_partial_send) && (!shm_send ); | 	      //	      int do_send = (comms_send|comms_partial_send) && (!shm_send ); | ||||||
|  | 	      int do_send = (comms_send) && (!shm_send ); | ||||||
| 	      AddPacket((void *)sp,(void *)rp, | 	      AddPacket((void *)sp,(void *)rp, | ||||||
| 			xmit_to_rank,do_send, | 			xmit_to_rank,do_send, | ||||||
| 			recv_from_rank,do_send, | 			recv_from_rank,do_send, | ||||||
| @@ -1432,7 +1565,8 @@ public: | |||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
| 	// rpointer may be doing a remote read in the gather over SHM | 	// rpointer may be doing a remote read in the gather over SHM | ||||||
| 	if ( comms_recv|comms_partial_recv ) { | 	//	if ( comms_recv|comms_partial_recv ) { | ||||||
|  | 	if ( comms_recv ) { | ||||||
| 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); | 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user