Remove partial dirichlet. Favour intro reduced prec comms options

2025-08-02 20:57:06 +01:00 · 2025-06-13 05:08:45 +02:00
parent fce6e1f135
commit 821358eda7
1 changed files with 174 additions and 40 deletions
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct DefaultImplParams {
  Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int  partialDirichlet;
+  //  int  partialDirichlet;
  DefaultImplParams()  {
    dirichlet.resize(0);
-    partialDirichlet=0;
+    //    partialDirichlet=0;
  };
 };

@@ -113,8 +113,8 @@ class CartesianStencilAccelerator {
  ///////////////////////////////////////////////////
  // If true, this is partially communicated per face
  ///////////////////////////////////////////////////
-  StencilVector _comms_partial_send; 
-  StencilVector _comms_partial_recv;
+  //  StencilVector _comms_partial_send; 
+  //  StencilVector _comms_partial_recv;
  //
  StencilVector _comm_buf_size;
  StencilVector _permute_type;
@@ -205,6 +205,8 @@ public:
  struct Packet {
    void * send_buf;
    void * recv_buf;
+    void * compressed_send_buf;
+    void * compressed_recv_buf;
 #ifndef ACCELERATOR_AWARE_MPI
    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
@@ -215,6 +217,8 @@ public:
    Integer do_recv;
    Integer xbytes;
    Integer rbytes;
+    Integer xbytes_compressed;
+    Integer rbytes_compressed;
  };
  struct Merge {
    static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +227,7 @@ public:
    std::vector<cobj *> vpointers;
    Integer buffer_size;
    Integer type;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
    Coordinate dims;
  };
  struct Decompress {
@@ -231,7 +235,7 @@ public:
    cobj * kernel_p;
    cobj * mpi_p;
    Integer buffer_size;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
    Coordinate dims;
  };
  struct CopyReceiveBuffer {
@@ -255,6 +259,12 @@ protected:
 public:
  GridBase *Grid(void) const { return _grid; }

+  /////////////////////////////////////////////////////////
+  // Control reduced precision comms
+  /////////////////////////////////////////////////////////
+  int SloppyComms;
+  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
+
  ////////////////////////////////////////////////////////////////////////
  // Needed to conveniently communicate gparity parameters into GPU memory
  // without adding parameters. Perhaps a template parameter to StenciView is
@@ -268,7 +278,7 @@ public:
  }

  int face_table_computed;
-  int partialDirichlet;
+  //  int partialDirichlet;
  int fullDirichlet;
  std::vector<deviceVector<std::pair<int,int> > > face_table ;
  deviceVector<int> surface_list;
@@ -361,6 +371,118 @@ public:
  ////////////////////////////////////////////////////////////////////////
  // Non blocking send and receive. Necessarily parallel.
  ////////////////////////////////////////////////////////////////////////
+  void DecompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) return;
+
+    if ( packet.do_recv ) {
+
+      typedef typename getPrecision<cobj>::real_scalar_type word;
+      uint64_t words = packet.rbytes/sizeof(word);
+      const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+      const uint64_t outer = words/nsimd;
+
+      if(sizeof(word)==8) {
+
+	// Can either choose to represent as float vs double and prec change
+	// OR
+	// truncate the mantissa bfp16 style
+
+	static deviceVector<uint32_t> compression_buffer;
+
+	if(words > compression_buffer.size() ) compression_buffer.resize(words);
+	
+	uint64_t *fbuf =(uint64_t *) &packet.recv_buf;
+	uint32_t *fhbuf=(uint32_t *) &packet.recv_buf;
+	uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+	accelerator_for(ss,outer,nsimd,{
+          hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane]; // copy at half precision
+	});
+	accelerator_for(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  fbuf[ss*nsimd+lane] = ((uint64_t)hbuf[ss*nsimd+lane])<<32; //copy back and pad each word with zeroes
+	});
+      } else if ( sizeof(word)==4){
+	// Can either choose to represent as half vs float and prec change
+        // OR
+	// truncate the mantissa bfp16 style
+	static deviceVector<uint16_t> compression_buffer;
+
+	if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+	uint32_t *fbuf =(uint32_t *) packet.recv_buf;
+	uint16_t *fhbuf=(uint16_t *) packet.recv_buf;
+	uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+	accelerator_for(ss,outer,nsimd,{
+	  hbuf[ss*nsimd+lane] = fhbuf[ss*nsimd+lane];
+	});
+	accelerator_for(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
+	});
+      } else {
+	assert(0 && "unknown floating point precision");
+      }
+    }
+  }
+  void CompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) {
+      packet.xbytes_compressed = packet.xbytes;
+      packet.rbytes_compressed = packet.rbytes;
+      return;
+    }
+    
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    uint64_t words = packet.xbytes/sizeof(word);
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t outer = words/nsimd;
+    
+    if (packet.do_send) {
+
+      if(sizeof(word)==8) {
+
+	static deviceVector<uint32_t> compression_buffer;
+
+	if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+	uint64_t *fbuf =(uint64_t *) packet.send_buf;
+	uint32_t *fhbuf=(uint32_t *) packet.send_buf;
+	uint32_t *hbuf =(uint32_t *) &compression_buffer[0];
+
+	accelerator_for(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>32; // truncate and copy
+	});
+	accelerator_for(ss,outer,nsimd,{
+          fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane]; // copy back
+	});
+
+      } else if ( sizeof(word)==4){
+
+	static deviceVector<uint16_t> compression_buffer;
+
+	if(words > compression_buffer.size() ) compression_buffer.resize(words);
+
+	uint32_t *fbuf =(uint32_t *) packet.send_buf;
+	uint16_t *fhbuf=(uint16_t *) packet.send_buf;
+	uint16_t *hbuf =(uint16_t *) &compression_buffer[0];
+	accelerator_for(ss,outer,nsimd,{
+	  int lane = acceleratorSIMTlane(nsimd);
+	  hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16;
+	});
+	accelerator_for(ss,outer,nsimd,{
+	  fhbuf[ss*nsimd+lane] = hbuf[ss*nsimd+lane];
+	});
+      } else {
+	assert(0 && "unknown floating point precision");
+      }
+    }
+    packet.xbytes_compressed = packet.xbytes/2;
+    packet.rbytes_compressed = packet.rbytes/2;
+
+    return;
+  }
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    //    std::cout << "Communicate Begin "<<std::endl;
@@ -371,6 +493,9 @@ public:
    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
                               // But the HaloGather had a barrier too.
    for(int i=0;i<Packets.size();i++){
+
+      this->CompressPacket(Packets[i]);
+
      //      std::cout << "Communicate prepare "<<i<<std::endl;
      //      _grid->Barrier();
      _grid->StencilSendToRecvFromPrepare(MpiReqs,
@@ -378,7 +503,7 @@ public:
 					  Packets[i].to_rank,Packets[i].do_send,
 					  Packets[i].recv_buf,
 					  Packets[i].from_rank,Packets[i].do_recv,
-					  Packets[i].xbytes,Packets[i].rbytes,i);
+					  Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
    }
    //    std::cout << "Communicate PollDtoH "<<std::endl;
    //    _grid->Barrier();
@@ -394,14 +519,14 @@ public:
 					Packets[i].to_rank,Packets[i].do_send,
 					Packets[i].recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].xbytes,Packets[i].rbytes,i);
+					Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
    }
    FlightRecorder::StepLog("Communicate begin has finished");
    // Get comms started then run checksums
    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_send )
-	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes_compressed);
    }
  }

@@ -416,14 +541,15 @@ public:
    //    std::cout << "Communicate Complete Complete "<<std::endl;
    //    _grid->Barrier();
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    if   ( this->partialDirichlet ) DslashLogPartial();
-    else if ( this->fullDirichlet ) DslashLogDirichlet();
+    //    if   ( this->partialDirichlet ) DslashLogPartial();
+    if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
    //    accelerator_barrier(); 
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_recv )
-	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
+      this->DecompressPacket(Packets[i]);
    }
    FlightRecorder::StepLog("Finish communicate complete");
  }
@@ -618,7 +744,7 @@ public:
  }
  void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
    Decompress d;
-    d.partial  = this->partialDirichlet;
+    //    d.partial  = this->partialDirichlet;
    d.dims     = _grid->_fdimensions;
    d.kernel_p = k_p;
    d.mpi_p    = m_p;
@@ -627,7 +753,7 @@ public:
  }
  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
    Merge m;
-    m.partial  = this->partialDirichlet;
+    //    m.partial  = this->partialDirichlet;
    m.dims     = _grid->_fdimensions;
    m.type     = type;
    m.mpointer = merge_p;
@@ -732,8 +858,8 @@ public:
      int block = dirichlet_block[dimension];
      this->_comms_send[ii] = comm_dim;
      this->_comms_recv[ii] = comm_dim;
-      this->_comms_partial_send[ii] = 0;
-      this->_comms_partial_recv[ii] = 0;
+      //      this->_comms_partial_send[ii] = 0;
+      //      this->_comms_partial_recv[ii] = 0;
      if ( block && comm_dim ) {
 	assert(abs(displacement) < ld );
 	// Quiesce communication across block boundaries
@@ -754,10 +880,10 @@ public:
 	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
 	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 	}
-	if ( partialDirichlet ) {
-	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
-	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-	}
+	//	if ( partialDirichlet ) {
+	//	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
+	//	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+	//	}
      }
    }
  }
@@ -769,6 +895,7 @@ public:
 		   Parameters p=Parameters(),
 		   bool preserve_shm=false)
  {
+    SloppyComms = 0;
    face_table_computed=0;
    _grid    = grid;
    this->parameters=p;
@@ -786,7 +913,7 @@ public:
    this->same_node.resize(npoints);

    if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    partialDirichlet = p.partialDirichlet;
+    //    partialDirichlet = p.partialDirichlet;
    DirichletBlock(p.dirichlet); // comms send/recv set up
    fullDirichlet=0;
    for(int d=0;d<p.dirichlet.size();d++){
@@ -935,7 +1062,8 @@ public:
    GridBase *grid=_grid;
    const int Nsimd = grid->Nsimd();

-    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    //    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    int comms_recv      = this->_comms_recv[point];
    int fd              = _grid->_fdimensions[dimension];
    int ld              = _grid->_ldimensions[dimension];
    int rd              = _grid->_rdimensions[dimension];
@@ -1124,8 +1252,8 @@ public:

    int comms_send   = this->_comms_send[point];
    int comms_recv   = this->_comms_recv[point];
-    int comms_partial_send   = this->_comms_partial_send[point] ;
-    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send   = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;
    
    assert(rhs.Grid()==_grid);
    //	  conformable(_grid,rhs.Grid());
@@ -1160,11 +1288,11 @@ public:
 	int rbytes;

 	if ( comms_send ) xbytes = bytes; // Full send
-	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0; // full dirichlet

 	if ( comms_recv ) rbytes = bytes;
-	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;
 	
 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1191,7 +1319,8 @@ public:
 	}


-	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+	//	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+	if ( compress.DecompressionStep()&&comms_recv) {
 	  recv_buf=u_simd_recv_buf[0];
 	} else {
 	  recv_buf=this->u_recv_buf_p;
@@ -1225,7 +1354,8 @@ public:
 #endif

 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
-	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+	//	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);

        int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
 	if ( !duplicate ) { // Force comms for now
@@ -1234,8 +1364,8 @@ public:
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
-	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
-	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+	  int do_send = (comms_send) && (!shm_send );
+	  int do_recv = (comms_send) && (!shm_recv );
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
@@ -1243,7 +1373,7 @@ public:
 		    xbytes,rbytes);
 	}

-	if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
+	if ( (compress.DecompressionStep() && comms_recv) ) {
 	  AddDecompress(&this->u_recv_buf_p[comm_off],
 			&recv_buf[comm_off],
 			words,Decompressions);
@@ -1265,8 +1395,8 @@ public:

    int comms_send   = this->_comms_send[point];
    int comms_recv   = this->_comms_recv[point];
-    int comms_partial_send   = this->_comms_partial_send[point] ;
-    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send   = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;

    int fd = _grid->_fdimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
@@ -1341,18 +1471,20 @@ public:

 	
 	if ( comms_send ) xbytes = bytes;
-	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0;

 	if ( comms_recv ) rbytes = bytes;
-	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;

 	// Gathers SIMD lanes for send and merge
 	// Different faces can be full comms or partial comms with  multiple ranks per node
-	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+	//	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+	if ( comms_send || comms_recv ) {

-	  int partial = partialDirichlet;
+	  //	  int partial = partialDirichlet;
+	  int partial = 0;
 	  compressor::Gather_plane_exchange(face_table[face_idx],rhs,
 					    spointers,dimension,sx,cbmask,
 					    compress,permute_type,partial );
@@ -1418,7 +1550,8 @@ public:
 	      if ( (bytes != rbytes) && (rbytes!=0) ){
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
-	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	      //	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	      int do_send = (comms_send) && (!shm_send );
 	      AddPacket((void *)sp,(void *)rp,
 			xmit_to_rank,do_send,
 			recv_from_rank,do_send,
@@ -1432,7 +1565,8 @@ public:
 	  }
 	}
 	// rpointer may be doing a remote read in the gather over SHM
-	if ( comms_recv|comms_partial_recv ) {
+	//	if ( comms_recv|comms_partial_recv ) {
+	if ( comms_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}