From 821358eda75edf6cf928e47205ce28dae2c012ad Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 13 Jun 2025 05:08:45 +0200
Subject: [PATCH] Remove partial Dirichlet; favour introducing reduced-precision comms options

---
 Grid/stencil/Stencil.h | 214 +++++++++++++++++++++++++++++++++--------
 1 file changed, 174 insertions(+), 40 deletions(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 20237732..527e8624 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct DefaultImplParams {
   Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int partialDirichlet;
+  //  int partialDirichlet;
   DefaultImplParams() {
     dirichlet.resize(0);
-    partialDirichlet=0;
+    //    partialDirichlet=0;
   };
 };
@@ -113,8 +113,8 @@ class CartesianStencilAccelerator {
   ///////////////////////////////////////////////////
   // If true, this is partially communicated per face
   ///////////////////////////////////////////////////
-  StencilVector _comms_partial_send;
-  StencilVector _comms_partial_recv;
+  //  StencilVector _comms_partial_send;
+  //  StencilVector _comms_partial_recv;
   //
   StencilVector _comm_buf_size;
   StencilVector _permute_type;
@@ -205,6 +205,8 @@ public:
   struct Packet {
     void * send_buf;
     void * recv_buf;
+    void * compressed_send_buf;
+    void * compressed_recv_buf;
 #ifndef ACCELERATOR_AWARE_MPI
     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
@@ -215,6 +217,8 @@ public:
     Integer do_recv;
     Integer xbytes;
     Integer rbytes;
+    Integer xbytes_compressed;
+    Integer rbytes_compressed;
   };
   struct Merge {
     static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +227,7 @@ public:
     std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct Decompress {
@@ -231,7 +235,7 @@ public:
     cobj * kernel_p;
     cobj * mpi_p;
     Integer buffer_size;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct CopyReceiveBuffer {
@@ -255,6 +259,12 @@ protected:
 public:
   GridBase *Grid(void) const { return _grid; }
+  /////////////////////////////////////////////////////////
+  // Control reduced precision comms
+  /////////////////////////////////////////////////////////
+  int SloppyComms;
+  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
+
   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
   // without adding parameters. Perhaps a template parameter to StenciView is
@@ -268,7 +278,7 @@ public:
   }
   int face_table_computed;
-  int partialDirichlet;
+  //  int partialDirichlet;
   int fullDirichlet;
   std::vector<deviceVector<std::pair<int,int> > > face_table ;
   deviceVector<int> surface_list;
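The SloppyComms flag and its setter in the hunk above are the whole user-facing surface of the new option. A minimal usage sketch, assuming a stencil object St of this class (only the member and setter names come from the patch; the surrounding calls are illustrative):

  St.SetSloppyComms(1);  // subsequent halo exchanges send truncated, half-width payloads
  // ... halo exchange / Dslash calls ...
  St.SetSloppyComms(0);  // restore full-precision comms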
@@ -361,6 +371,118 @@ public:
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
+  void DecompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) return;
+
+    if ( packet.do_recv ) {
+
+      typedef typename getPrecision<cobj>::real_scalar_type word;
+      uint64_t words = packet.rbytes/sizeof(word);
+      const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+      const uint64_t outer = words/nsimd;
+
+      if(sizeof(word)==8) {
+
+        // Can either choose to represent as float vs double and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+
+        static deviceVector<uint32_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint64_t *fbuf =(uint64_t *) packet.recv_buf;
+        uint32_t *fhbuf=(uint32_t *) packet.recv_buf;
+        uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fbuf[ss*nsimd+lane] = ((uint64_t)hbuf[ss*nsimd+lane])<<32; // copy back and pad each word with zeroes
+        });
+
+      } else if ( sizeof(word)==4){
+
+        // Can either choose to represent as half vs float and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+
+        static deviceVector<uint16_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint32_t *fbuf =(uint32_t *) packet.recv_buf;
+        uint16_t *fhbuf=(uint16_t *) packet.recv_buf;
+        uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; // copy back and pad each word with zeroes
+        });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+    }
+  }
+  void CompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) {
+      packet.xbytes_compressed = packet.xbytes;
+      packet.rbytes_compressed = packet.rbytes;
+      return;
+    }
+
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    uint64_t words = packet.xbytes/sizeof(word);
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t outer = words/nsimd;
+
+    if (packet.do_send) {
+
+      if(sizeof(word)==8) {
+
+        static deviceVector<uint32_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint64_t *fbuf =(uint64_t *) packet.send_buf;
+        uint32_t *fhbuf=(uint32_t *) packet.send_buf;
+        uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>32; // truncate and copy
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back
+        });
+
+      } else if ( sizeof(word)==4){
+
+        static deviceVector<uint16_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint32_t *fbuf =(uint32_t *) packet.send_buf;
+        uint16_t *fhbuf=(uint16_t *) packet.send_buf;
+        uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // truncate and copy
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back
+        });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+    }
+    packet.xbytes_compressed = packet.xbytes/2;
+    packet.rbytes_compressed = packet.rbytes/2;
+
+    return;
+  }
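The kernels above implement bit truncation rather than a genuine precision conversion: a double word keeps only its upper 32 bits (sign, 11-bit exponent, top 20 mantissa bits) on send, and the lower half is zero-padded on receipt; the float path keeps the top 16 bits, which is exactly the bfloat16 layout the "bfp16 style" comment refers to. A self-contained host-side sketch of the same round trip, useful for gauging the error level (the helper names and the printed bound are illustrative, not part of the patch):

#include <cstdint>
#include <cstring>
#include <cstdio>

static uint32_t truncate64(double d) {          // keep the high 32 bits, as CompressPacket does
  uint64_t u; std::memcpy(&u, &d, sizeof(u));
  return (uint32_t)(u >> 32);
}
static double expand32(uint32_t h) {            // pad the low 32 bits with zeroes, as DecompressPacket does
  uint64_t u = ((uint64_t)h) << 32;
  double d; std::memcpy(&d, &u, sizeof(d));
  return d;
}
int main() {
  double x = 0.123456789012345;
  double y = expand32(truncate64(x));
  // 20 mantissa bits survive, so the relative error is at the 2^-21 ~ 5e-7 level
  std::printf("%.16g -> %.16g  rel err %.2g\n", x, y, (x - y) / x);
  return 0;
}

Keeping 20 mantissa bits is slightly worse than IEEE single precision, but it halves every message without any format conversion on the wire.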
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     //    std::cout << "Communicate Begin "<<std::endl;
     //    _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
     //                             // But the HaloGather had a barrier too.
     for(int i=0;i<Packets.size();i++){
+      this->CompressPacket(Packets[i]);
+      //      std::cout << "Communicate prepare "<<i<<std::endl;
       //      _grid->Barrier();
       _grid->StencilSendToRecvFromPrepare(MpiReqs,
@@ -378,7 +503,7 @@
                                           Packets[i].to_rank,Packets[i].do_send,
                                           Packets[i].recv_buf,
                                           Packets[i].from_rank,Packets[i].do_recv,
-                                          Packets[i].xbytes,Packets[i].rbytes,i);
+                                          Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
     }
     //    std::cout << "Communicate PollDtoH "<<std::endl;
     //    _grid->Barrier();
@@ -394,14 +519,14 @@
                                           Packets[i].to_rank,Packets[i].do_send,
                                           Packets[i].recv_buf,
                                           Packets[i].from_rank,Packets[i].do_recv,
-                                          Packets[i].xbytes,Packets[i].rbytes,i);
+                                          Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
     }
     FlightRecorder::StepLog("Communicate begin has finished");
     // Get comms started then run checksums
     // Having this PRIOR to the dslash seems to make Sunspot work... (!)
     for(int i=0;i<Packets.size();i++){
   }
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     //    _grid->Barrier();
     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    if ( this->partialDirichlet ) DslashLogPartial();
-    else if ( this->fullDirichlet ) DslashLogDirichlet();
+    //    if ( this->partialDirichlet ) DslashLogPartial();
+    if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
     //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     //    accelerator_barrier();
     for(int i=0;i<Packets.size();i++){
+      this->DecompressPacket(Packets[i]);
     }
     FlightRecorder::StepLog("Finish communicate complete");
   }
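Taken together, the hunks above give the following per-exchange ordering: compress before the sends are posted, hand MPI the halved byte counts, then re-expand the receive buffers in place once MPI completes. A linear paraphrase using only names that appear in the diff (the surrounding control flow is a sketch, not new code):

// Per-packet flow under SloppyComms
for (int i = 0; i < Packets.size(); i++) this->CompressPacket(Packets[i]);   // sets xbytes_compressed = xbytes/2
// StencilSendToRecvFromPrepare/Begin(..., Packets[i].xbytes_compressed, Packets[i].rbytes_compressed, i)
_grid->StencilSendToRecvFromComplete(MpiReqs, 0);                            // MPI is done
for (int i = 0; i < Packets.size(); i++) this->DecompressPacket(Packets[i]); // expand recv_buf in place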
@@ -618,7 +744,7 @@ public:
   }
   void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
     Decompress d;
-    d.partial  = this->partialDirichlet;
+    //    d.partial  = this->partialDirichlet;
     d.dims     = _grid->_fdimensions;
     d.kernel_p = k_p;
     d.mpi_p    = m_p;
@@ -627,7 +753,7 @@ public:
   }
   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
-    m.partial  = this->partialDirichlet;
+    //    m.partial  = this->partialDirichlet;
     m.dims     = _grid->_fdimensions;
     m.type     = type;
     m.mpointer = merge_p;
@@ -732,8 +858,8 @@ public:
       int block = dirichlet_block[dimension];
       this->_comms_send[ii] = comm_dim;
       this->_comms_recv[ii] = comm_dim;
-      this->_comms_partial_send[ii] = 0;
-      this->_comms_partial_recv[ii] = 0;
+      //      this->_comms_partial_send[ii] = 0;
+      //      this->_comms_partial_recv[ii] = 0;
       if ( block && comm_dim ) {
         assert(abs(displacement) < ld );
         // Quiesce communication across block boundaries
@@ -754,10 +880,10 @@ public:
           if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
           if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
         }
-        if ( partialDirichlet ) {
-          this->_comms_partial_send[ii] = !this->_comms_send[ii];
-          this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-        }
+        //        if ( partialDirichlet ) {
+        //          this->_comms_partial_send[ii] = !this->_comms_send[ii];
+        //          this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+        //        }
       }
     }
   }
@@ -769,6 +895,7 @@ public:
                    Parameters p=Parameters(),
                    bool preserve_shm=false)
   {
+    SloppyComms = 0;
     face_table_computed=0;
     _grid = grid;
     this->parameters=p;
@@ -786,7 +913,7 @@ public:
     this->same_node.resize(npoints);
     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    partialDirichlet = p.partialDirichlet;
+    //    partialDirichlet = p.partialDirichlet;
     DirichletBlock(p.dirichlet); // comms send/recv set up
     fullDirichlet=0;
     for(int d=0;d<p.dirichlet.size();d++){
-    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    int comms_recv = this->_comms_recv[point];
     int fd = _grid->_fdimensions[dimension];
     int ld = _grid->_ldimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1124,8 +1252,8 @@ public:
     int comms_send  = this->_comms_send[point];
     int comms_recv  = this->_comms_recv[point];
-    int comms_partial_send  = this->_comms_partial_send[point] ;
-    int comms_partial_recv  = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send  = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv  = this->_comms_partial_recv[point] ;
     assert(rhs.Grid()==_grid);
     //    conformable(_grid,rhs.Grid());
@@ -1160,11 +1288,11 @@ public:
     int rbytes;
     if ( comms_send ) xbytes = bytes; // Full send
-    else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    //    else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
     else xbytes = 0; // full dirichlet
     if ( comms_recv ) rbytes = bytes;
-    else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    //    else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
     else rbytes = 0;
     int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1191,7 +1319,8 @@ public:
     }
-    if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+    //    if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+    if ( compress.DecompressionStep()&&comms_recv) {
       recv_buf=u_simd_recv_buf[0];
     } else {
       recv_buf=this->u_recv_buf_p;
@@ -1225,7 +1354,8 @@ public:
 #endif
       //      std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
       AddDecompress(&this->u_recv_buf_p[comm_off],
                     &recv_buf[comm_off],
                     words,Decompressions);
@@ -1265,8 +1395,8 @@ public:
     int comms_send  = this->_comms_send[point];
     int comms_recv  = this->_comms_recv[point];
-    int comms_partial_send  = this->_comms_partial_send[point] ;
-    int comms_partial_recv  = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send  = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv  = this->_comms_partial_recv[point] ;
     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1341,18 +1471,20 @@ public:
         if ( comms_send ) xbytes = bytes;
-        else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+        //        else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
         else xbytes = 0;
         if ( comms_recv ) rbytes = bytes;
-        else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+        //        else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
         else rbytes = 0;
         // Gathers SIMD lanes for send and merge
         // Different faces can be full comms or partial comms with multiple ranks per node
-        if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+        //        if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+        if ( comms_send || comms_recv ) {
-          int partial = partialDirichlet;
+          //          int partial = partialDirichlet;
+          int partial = 0;
           compressor::Gather_plane_exchange(face_table[face_idx],rhs,
                                             spointers,dimension,sx,cbmask,
                                             compress,permute_type,partial );
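With the partial-Dirichlet branches removed, the per-face byte accounting in the gather routines above reduces to an all-or-nothing choice; any precision reduction now happens later, at pack time, rather than by shrinking the gathered face. Schematically (variable names from the diff, the snippet itself is only illustrative):

// Face byte counts after this patch: either the full face is communicated or none of it.
int xbytes = comms_send ? bytes : 0;   // 0 only for fully Dirichlet-quiesced directions
int rbytes = comms_recv ? bytes : 0;
// Any halving to xbytes_compressed/rbytes_compressed happens in CompressPacket, not here.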
@@ -1418,7 +1550,8 @@ public:
           if ( (bytes != rbytes) && (rbytes!=0) ){
             acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
           }
-          int do_send = (comms_send|comms_partial_send) && (!shm_send );
+          //          int do_send = (comms_send|comms_partial_send) && (!shm_send );
+          int do_send = (comms_send) && (!shm_send );
           AddPacket((void *)sp,(void *)rp,
                     xmit_to_rank,do_send,
                     recv_from_rank,do_send,
                     xbytes,rbytes);
@@ -1432,7 +1565,8 @@ public:
           }
         }
         // rpointer may be doing a remote read in the gather over SHM
-        if ( comms_recv|comms_partial_recv ) {
+        //        if ( comms_recv|comms_partial_recv ) {
+        if ( comms_recv ) {
           AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
         }
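One anticipated way to drive the new knob, assuming a fermion operator Dw that exposes its stencil member (only SetSloppyComms comes from this patch; the operator and solver names below are hypothetical): keep reduced-precision halos for the inner, sloppy solves of a mixed-precision or defect-correction scheme, and switch back to full precision before the true-residual step.

// Hypothetical driver code; Dw, InnerSolve and TrueResidual are placeholders.
Dw.Stencil.SetSloppyComms(1);   // inner iterations tolerate ~2^-21 halo error
InnerSolve(Dw, src, psi);
Dw.Stencil.SetSloppyComms(0);   // full-precision halos for the correction step
TrueResidual(Dw, src, psi);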