From 821358eda75edf6cf928e47205ce28dae2c012ad Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 13 Jun 2025 05:08:45 +0200
Subject: [PATCH] Remove partial Dirichlet; favour introducing reduced-precision comms options

---
 Grid/stencil/Stencil.h | 214 +++++++++++++++++++++++++++++++++--------
 1 file changed, 174 insertions(+), 40 deletions(-)

diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 20237732..527e8624 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct DefaultImplParams {
   Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int partialDirichlet;
+  //  int partialDirichlet;
   DefaultImplParams() {
     dirichlet.resize(0);
-    partialDirichlet=0;
+    //    partialDirichlet=0;
   };
 };
@@ -113,8 +113,8 @@ class CartesianStencilAccelerator {
   ///////////////////////////////////////////////////
   // If true, this is partially communicated per face
   ///////////////////////////////////////////////////
-  StencilVector _comms_partial_send;
-  StencilVector _comms_partial_recv;
+  //  StencilVector _comms_partial_send;
+  //  StencilVector _comms_partial_recv;
   //
   StencilVector _comm_buf_size;
   StencilVector _permute_type;
@@ -205,6 +205,8 @@ public:
   struct Packet {
     void * send_buf;
     void * recv_buf;
+    void * compressed_send_buf;
+    void * compressed_recv_buf;
 #ifndef ACCELERATOR_AWARE_MPI
     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
@@ -215,6 +217,8 @@ public:
     Integer do_recv;
     Integer xbytes;
     Integer rbytes;
+    Integer xbytes_compressed;
+    Integer rbytes_compressed;
   };
   struct Merge {
     static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +227,7 @@ public:
     std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct Decompress {
@@ -231,7 +235,7 @@ public:
     cobj * kernel_p;
     cobj * mpi_p;
     Integer buffer_size;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
     Coordinate dims;
   };
   struct CopyReceiveBuffer {
@@ -255,6 +259,12 @@ protected:
 public:
   GridBase *Grid(void) const { return _grid; }
+  /////////////////////////////////////////////////////////
+  // Control reduced precision comms
+  /////////////////////////////////////////////////////////
+  int SloppyComms;
+  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
+
   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
   // without adding parameters. Perhaps a template parameter to StenciView is
@@ -268,7 +278,7 @@ public:
   }
   int face_table_computed;
-  int partialDirichlet;
+  //  int partialDirichlet;
   int fullDirichlet;
   std::vector<deviceVector<std::pair<int,int> > > face_table ;
   deviceVector<int> surface_list;
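The SloppyComms flag and its setter in the hunk above are the whole user-facing surface of the new option. A minimal usage sketch, assuming a stencil object St of this class (only the member and setter names come from the patch; the surrounding calls are illustrative):

  St.SetSloppyComms(1);  // subsequent halo exchanges send truncated, half-width payloads
  // ... halo exchange / Dslash calls ...
  St.SetSloppyComms(0);  // restore full-precision comms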
@@ -361,6 +371,118 @@ public:
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
+  void DecompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) return;
+
+    if ( packet.do_recv ) {
+
+      typedef typename getPrecision<cobj>::real_scalar_type word;
+      uint64_t words = packet.rbytes/sizeof(word);
+      const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+      const uint64_t outer = words/nsimd;
+
+      if(sizeof(word)==8) {
+
+        // Can either choose to represent as float vs double and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+
+        static deviceVector<uint32_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint64_t *fbuf =(uint64_t *) packet.recv_buf;
+        uint32_t *fhbuf=(uint32_t *) packet.recv_buf;
+        uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fbuf[ss*nsimd+lane] = ((uint64_t)hbuf[ss*nsimd+lane])<<32; // copy back and pad each word with zeroes
+        });
+
+      } else if ( sizeof(word)==4){
+
+        // Can either choose to represent as half vs float and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+
+        static deviceVector<uint16_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint32_t *fbuf =(uint32_t *) packet.recv_buf;
+        uint16_t *fhbuf=(uint16_t *) packet.recv_buf;
+        uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; // copy back and pad each word with zeroes
+        });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+    }
+  }
+  void CompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) {
+      packet.xbytes_compressed = packet.xbytes;
+      packet.rbytes_compressed = packet.rbytes;
+      return;
+    }
+
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    uint64_t words = packet.xbytes/sizeof(word);
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t outer = words/nsimd;
+
+    if (packet.do_send) {
+
+      if(sizeof(word)==8) {
+
+        static deviceVector<uint32_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint64_t *fbuf =(uint64_t *) packet.send_buf;
+        uint32_t *fhbuf=(uint32_t *) packet.send_buf;
+        uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>32; // truncate and copy
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back
+        });
+
+      } else if ( sizeof(word)==4){
+
+        static deviceVector<uint16_t> compression_buffer;
+
+        if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+        uint32_t *fbuf =(uint32_t *) packet.send_buf;
+        uint16_t *fhbuf=(uint16_t *) packet.send_buf;
+        uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // truncate and copy
+        });
+        accelerator_for(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back
+        });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+    }
+    packet.xbytes_compressed = packet.xbytes/2;
+    packet.rbytes_compressed = packet.rbytes/2;
+
+    return;
+  }
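The kernels above implement bit truncation rather than a genuine precision conversion: a double word keeps only its upper 32 bits (sign, 11-bit exponent, top 20 mantissa bits) on send, and the lower half is zero-padded on receipt; the float path keeps the top 16 bits, which is exactly the bfloat16 layout the "bfp16 style" comment refers to. A self-contained host-side sketch of the same round trip, useful for gauging the error level (the helper names and the printed bound are illustrative, not part of the patch):

#include <cstdint>
#include <cstring>
#include <cstdio>

static uint32_t truncate64(double d) {          // keep the high 32 bits, as CompressPacket does
  uint64_t u; std::memcpy(&u, &d, sizeof(u));
  return (uint32_t)(u >> 32);
}
static double expand32(uint32_t h) {            // pad the low 32 bits with zeroes, as DecompressPacket does
  uint64_t u = ((uint64_t)h) << 32;
  double d; std::memcpy(&d, &u, sizeof(d));
  return d;
}
int main() {
  double x = 0.123456789012345;
  double y = expand32(truncate64(x));
  // 20 mantissa bits survive, so the relative error is at the 2^-21 ~ 5e-7 level
  std::printf("%.16g -> %.16g  rel err %.2g\n", x, y, (x - y) / x);
  return 0;
}

Keeping 20 mantissa bits is slightly worse than IEEE single precision, but it halves every message without any format conversion on the wire.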
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     //    std::cout << "Communicate Begin "<<std::endl;
     //    _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
     //                             // But the HaloGather had a barrier too.
     for(int i=0;i<Packets.size();i++){
+      this->CompressPacket(Packets[i]);
+      //      std::cout << "Communicate prepare "<<i<<std::endl;
       //      _grid->Barrier();
       _grid->StencilSendToRecvFromPrepare(MpiReqs,
@@ -378,7 +503,7 @@
                                           Packets[i].to_rank,Packets[i].do_send,
                                           Packets[i].recv_buf,
                                           Packets[i].from_rank,Packets[i].do_recv,
-                                          Packets[i].xbytes,Packets[i].rbytes,i);
+                                          Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
     }
     //    std::cout << "Communicate PollDtoH "<<std::endl;
     //    _grid->Barrier();
@@ -394,14 +519,14 @@
                                           Packets[i].to_rank,Packets[i].do_send,
                                           Packets[i].recv_buf,
                                           Packets[i].from_rank,Packets[i].do_recv,
-                                          Packets[i].xbytes,Packets[i].rbytes,i);
+                                          Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
     }
     FlightRecorder::StepLog("Communicate begin has finished");
     // Get comms started then run checksums
     // Having this PRIOR to the dslash seems to make Sunspot work... (!)
     for(int i=0;i<Packets.size();i++){
   }
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     //    _grid->Barrier();
     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    if ( this->partialDirichlet ) DslashLogPartial();
-    else if ( this->fullDirichlet ) DslashLogDirichlet();
+    //    if ( this->partialDirichlet ) DslashLogPartial();
+    if ( this->fullDirichlet ) DslashLogDirichlet();
     else DslashLogFull();
     //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
     //    accelerator_barrier();
     for(int i=0;i<Packets.size();i++){
+      this->DecompressPacket(Packets[i]);
     }
     FlightRecorder::StepLog("Finish communicate complete");
   }
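Taken together, the hunks above give the following per-exchange ordering: compress before the sends are posted, hand MPI the halved byte counts, then re-expand the receive buffers in place once MPI completes. A linear paraphrase using only names that appear in the diff (the surrounding control flow is a sketch, not new code):

// Per-packet flow under SloppyComms
for (int i = 0; i < Packets.size(); i++) this->CompressPacket(Packets[i]);   // sets xbytes_compressed = xbytes/2
// StencilSendToRecvFromPrepare/Begin(..., Packets[i].xbytes_compressed, Packets[i].rbytes_compressed, i)
_grid->StencilSendToRecvFromComplete(MpiReqs, 0);                            // MPI is done
for (int i = 0; i < Packets.size(); i++) this->DecompressPacket(Packets[i]); // expand recv_buf in place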
@@ -618,7 +744,7 @@ public:
   }
   void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
     Decompress d;
-    d.partial  = this->partialDirichlet;
+    //    d.partial  = this->partialDirichlet;
     d.dims     = _grid->_fdimensions;
     d.kernel_p = k_p;
     d.mpi_p    = m_p;
@@ -627,7 +753,7 @@ public:
   }
   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
-    m.partial  = this->partialDirichlet;
+    //    m.partial  = this->partialDirichlet;
     m.dims     = _grid->_fdimensions;
     m.type     = type;
     m.mpointer = merge_p;
@@ -732,8 +858,8 @@ public:
       int block = dirichlet_block[dimension];
       this->_comms_send[ii] = comm_dim;
       this->_comms_recv[ii] = comm_dim;
-      this->_comms_partial_send[ii] = 0;
-      this->_comms_partial_recv[ii] = 0;
+      //      this->_comms_partial_send[ii] = 0;
+      //      this->_comms_partial_recv[ii] = 0;
       if ( block && comm_dim ) {
         assert(abs(displacement) < ld );
         // Quiesce communication across block boundaries
@@ -754,10 +880,10 @@ public:
           if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
           if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
         }
-        if ( partialDirichlet ) {
-          this->_comms_partial_send[ii] = !this->_comms_send[ii];
-          this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-        }
+        //        if ( partialDirichlet ) {
+        //          this->_comms_partial_send[ii] = !this->_comms_send[ii];
+        //          this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+        //        }
       }
     }
   }
@@ -769,6 +895,7 @@ public:
                    Parameters p=Parameters(),
                    bool preserve_shm=false)
   {
+    SloppyComms = 0;
     face_table_computed=0;
     _grid = grid;
     this->parameters=p;
@@ -786,7 +913,7 @@ public:
     this->same_node.resize(npoints);
     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    partialDirichlet = p.partialDirichlet;
+    //    partialDirichlet = p.partialDirichlet;
     DirichletBlock(p.dirichlet); // comms send/recv set up
     fullDirichlet=0;
     for(int d=0;d<p.dirichlet.size();d++){
-    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    int comms_recv = this->_comms_recv[point];
     int fd = _grid->_fdimensions[dimension];
     int ld = _grid->_ldimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1124,8 +1252,8 @@ public:
     int comms_send  = this->_comms_send[point];
     int comms_recv  = this->_comms_recv[point];
-    int comms_partial_send  = this->_comms_partial_send[point] ;
-    int comms_partial_recv  = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send  = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv  = this->_comms_partial_recv[point] ;
     assert(rhs.Grid()==_grid);
     //    conformable(_grid,rhs.Grid());
@@ -1160,11 +1288,11 @@ public:
     int rbytes;
     if ( comms_send ) xbytes = bytes; // Full send
-    else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    //    else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
     else xbytes = 0; // full dirichlet
     if ( comms_recv ) rbytes = bytes;
-    else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    //    else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
     else rbytes = 0;
     int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1191,7 +1319,8 @@ public:
     }
-    if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+    //    if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+    if ( compress.DecompressionStep()&&comms_recv) {
       recv_buf=u_simd_recv_buf[0];
     } else {
       recv_buf=this->u_recv_buf_p;
@@ -1225,7 +1354,8 @@ public:
 #endif
       //      std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
       AddDecompress(&this->u_recv_buf_p[comm_off],
                     &recv_buf[comm_off],
                     words,Decompressions);
@@ -1265,8 +1395,8 @@ public:
     int comms_send  = this->_comms_send[point];
     int comms_recv  = this->_comms_recv[point];
-    int comms_partial_send  = this->_comms_partial_send[point] ;
-    int comms_partial_recv  = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send  = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv  = this->_comms_partial_recv[point] ;
     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1341,18 +1471,20 @@ public:
         if ( comms_send ) xbytes = bytes;
-        else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+        //        else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
         else xbytes = 0;
         if ( comms_recv ) rbytes = bytes;
-        else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+        //        else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
         else rbytes = 0;
         // Gathers SIMD lanes for send and merge
         // Different faces can be full comms or partial comms with multiple ranks per node
-        if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+        //        if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+        if ( comms_send || comms_recv ) {
-          int partial = partialDirichlet;
+          //          int partial = partialDirichlet;
+          int partial = 0;
           compressor::Gather_plane_exchange(face_table[face_idx],rhs,
                                             spointers,dimension,sx,cbmask,
                                             compress,permute_type,partial );
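With the partial-Dirichlet branches removed, the per-face byte accounting in the gather routines above reduces to an all-or-nothing choice; any precision reduction now happens later, at pack time, rather than by shrinking the gathered face. Schematically (variable names from the diff, the snippet itself is only illustrative):

// Face byte counts after this patch: either the full face is communicated or none of it.
int xbytes = comms_send ? bytes : 0;   // 0 only for fully Dirichlet-quiesced directions
int rbytes = comms_recv ? bytes : 0;
// Any halving to xbytes_compressed/rbytes_compressed happens in CompressPacket, not here.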
@@ -1418,7 +1550,8 @@ public:
           if ( (bytes != rbytes) && (rbytes!=0) ){
             acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
           }
-          int do_send = (comms_send|comms_partial_send) && (!shm_send );
+          //          int do_send = (comms_send|comms_partial_send) && (!shm_send );
+          int do_send = (comms_send) && (!shm_send );
           AddPacket((void *)sp,(void *)rp,
                     xmit_to_rank,do_send,
                     recv_from_rank,do_send,
                     xbytes,rbytes);
@@ -1432,7 +1565,8 @@ public:
           }
         }
         // rpointer may be doing a remote read in the gather over SHM
-        if ( comms_recv|comms_partial_recv ) {
+        //        if ( comms_recv|comms_partial_recv ) {
+        if ( comms_recv ) {
           AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
         }
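One anticipated way to drive the new knob, assuming a fermion operator Dw that exposes its stencil member (only SetSloppyComms comes from this patch; the operator and solver names below are hypothetical): keep reduced-precision halos for the inner, sloppy solves of a mixed-precision or defect-correction scheme, and switch back to full precision before the true-residual step.

// Hypothetical driver code; Dw, InnerSolve and TrueResidual are placeholders.
Dw.Stencil.SetSloppyComms(1);   // inner iterations tolerate ~2^-21 halo error
InnerSolve(Dw, src, psi);
Dw.Stencil.SetSloppyComms(0);   // full-precision halos for the correction step
TrueResidual(Dw, src, psi);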