Partial dirichlet BCs

2025-08-02 04:37:06 +01:00 · 2022-11-15 16:24:26 -05:00
parent 0db4f1803f
commit e2e269e03b
1 changed files with 131 additions and 73 deletions
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -59,6 +59,7 @@ NAMESPACE_BEGIN(Grid);
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);

+/*
 template<class vobj,class cobj,class compressor>
 void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline));

@@ -103,6 +104,7 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
  });
  rhs_v.ViewClose();
 }
+*/

 struct StencilEntry {
 #ifdef GRID_CUDA
@@ -133,8 +135,18 @@ class CartesianStencilAccelerator {
  int           _osites;
  StencilVector _directions;
  StencilVector _distances;
-  StencilVector _comms_send;
-  StencilVector _comms_recv;
+  ///////////////////////////////////////////////////
+  // If true, this is FULLY communicated per face
+  // Otherwise will either be full or partial dirichlet
+  ///////////////////////////////////////////////////
+  StencilVector _comms_send; 
+  StencilVector _comms_recv; // this is FULLY communicated per face
+  ///////////////////////////////////////////////////
+  // If true, this is partially communicated per face
+  ///////////////////////////////////////////////////
+  StencilVector _comms_partial_send; 
+  StencilVector _comms_partial_recv;
+  //
  StencilVector _comm_buf_size;
  StencilVector _permute_type;
  StencilVector same_node;
@@ -229,7 +241,8 @@ public:
    Integer from_rank;
    Integer do_send;
    Integer do_recv;
-    Integer bytes;
+    Integer xbytes;
+    Integer rbytes;
  };
  struct Merge {
    cobj * mpointer;
@@ -237,11 +250,15 @@ public:
    Vector<cobj *> vpointers;
    Integer buffer_size;
    Integer type;
+    Integer partial; // partial dirichlet BCs
+    Coordinate dims;
  };
  struct Decompress {
    cobj * kernel_p;
    cobj * mpi_p;
    Integer buffer_size;
+    Integer partial; // partial dirichlet BCs
+    Coordinate dims;
  };
  struct CopyReceiveBuffer {
    void * from_p;
@@ -252,7 +269,8 @@ public:
    Integer direction;
    Integer OrthogPlane;
    Integer DestProc;
-    Integer bytes;
+    Integer xbytes;
+    Integer rbytes;
    Integer lane;
    Integer cb;
    void *recv_buf;
@@ -277,6 +295,7 @@ public:
  }

  int face_table_computed;
+  int partialDirichlet;
  std::vector<commVector<std::pair<int,int> > > face_table ;
  Vector<int> surface_list;

@@ -365,7 +384,7 @@ public:
 					Packets[i].to_rank,Packets[i].do_send,
 					Packets[i].recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].bytes,i);
+					Packets[i].xbytes,Packets[i].rbytes,i);
    }
  }

@@ -501,7 +520,9 @@ public:
    }
  }
  
-  Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,Integer bytes,Integer cb)
+  Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,
+			    Integer xbytes,Integer rbytes,
+			    Integer cb)
  {
    CachedTransfer obj;
    obj.direction   = direction;
@@ -509,19 +530,22 @@ public:
    obj.DestProc    = DestProc;
    obj.recv_buf    = recv_buf;
    obj.lane        = lane;
-    obj.bytes       = bytes;
+    obj.xbytes      = xbytes;
+    obj.rbytes      = rbytes;
    obj.cb          = cb;

    for(int i=0;i<CachedTransfers.size();i++){
      if (   (CachedTransfers[i].direction  ==direction)
 	   &&(CachedTransfers[i].OrthogPlane==OrthogPlane)
 	   &&(CachedTransfers[i].DestProc   ==DestProc)
-	   &&(CachedTransfers[i].bytes      ==bytes)
+	   &&(CachedTransfers[i].xbytes      ==xbytes)
+	   &&(CachedTransfers[i].rbytes      ==rbytes)
 	   &&(CachedTransfers[i].lane       ==lane)
 	   &&(CachedTransfers[i].cb         ==cb)
 	     ){
-
-	AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
+	// FIXME worry about duplicate with partial compression
+	// Wont happen as DWF has no duplicates, but...
+	AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
 	return 1;
      }
    }
@@ -532,7 +556,7 @@ public:
  void AddPacket(void *xmit,void * rcv,
 		 Integer to, Integer do_send,
 		 Integer from, Integer do_recv,
-		 Integer bytes){
+		 Integer xbytes,Integer rbytes){
    Packet p;
    p.send_buf = xmit;
    p.recv_buf = rcv;
@@ -540,11 +564,16 @@ public:
    p.from_rank= from;
    p.do_send  = do_send;
    p.do_recv  = do_recv;
-    p.bytes    = bytes;
+    p.xbytes    = xbytes;
+    p.rbytes    = rbytes;
+    //    if (do_send) std::cout << GridLogMessage << " MPI packet to   "<<to<< " of size "<<xbytes<<std::endl;
+    //    if (do_recv) std::cout << GridLogMessage << " MPI packet from "<<from<< " of size "<<xbytes<<std::endl;
    Packets.push_back(p);
  }
  void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
    Decompress d;
+    d.partial  = this->partialDirichlet;
+    d.dims     = _grid->_fdimensions;
    d.kernel_p = k_p;
    d.mpi_p    = m_p;
    d.buffer_size = buffer_size;
@@ -552,6 +581,8 @@ public:
  }
  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
    Merge m;
+    m.partial  = this->partialDirichlet;
+    m.dims     = _grid->_fdimensions;
    m.type     = type;
    m.mpointer = merge_p;
    m.vpointers= rpointers;
@@ -571,21 +602,10 @@ public:
  void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd)
  {
    for(int i=0;i<mm.size();i++){
-      auto mp = &mm[i].mpointer[0];
-      auto vp0= &mm[i].vpointers[0][0];
-      auto vp1= &mm[i].vpointers[1][0];
-      auto type= mm[i].type;
-      accelerator_forNB(o,mm[i].buffer_size/2,vobj::Nsimd(),{
-	  decompress.Exchange(mp,vp0,vp1,type,o);
-      });
+      decompressor::MergeFace(decompress,mm[i]);
    }
-
    for(int i=0;i<dd.size();i++){
-      auto kp = dd[i].kernel_p;
-      auto mp = dd[i].mpi_p;
-      accelerator_forNB(o,dd[i].buffer_size,1,{
-	decompress.Decompress(kp,mp,o);
-      });
+      decompressor::DecompressFace(decompress,dd[i]);
    }
  }
  ////////////////////////////////////////
@@ -664,6 +684,10 @@ public:
 	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
 	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 	}
+	if ( partialDirichlet ) {
+	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
+	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+	}
      }
    }
  }
@@ -691,7 +715,7 @@ public:
    this->same_node.resize(npoints);

    if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-
+    partialDirichlet = p.partialDirichlet;
    DirichletBlock(p.dirichlet); // comms send/recv set up

    _unified_buffer_size=0;
@@ -827,7 +851,7 @@ public:
    GridBase *grid=_grid;
    const int Nsimd = grid->Nsimd();

-    int comms_recv      = this->_comms_recv[point];
+    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
    int fd              = _grid->_fdimensions[dimension];
    int ld              = _grid->_ldimensions[dimension];
    int rd              = _grid->_rdimensions[dimension];
@@ -1014,8 +1038,10 @@ public:
  {
    typedef typename cobj::vector_type vector_type;

-    int comms_send   = this->_comms_send[point] ;
-    int comms_recv   = this->_comms_recv[point] ;
+    int comms_send   = this->_comms_send[point];
+    int comms_recv   = this->_comms_recv[point];
+    int comms_partial_send   = this->_comms_partial_send[point] ;
+    int comms_partial_recv   = this->_comms_partial_recv[point] ;

    assert(rhs.Grid()==_grid);
    //	  conformable(_grid,rhs.Grid());
@@ -1046,7 +1072,17 @@ public:
 	if (cbmask != 0x3) words=words>>1;

 	int bytes =  words * compress.CommDatumSize();
+	int xbytes;
+	int rbytes;

+	if ( comms_send ) xbytes = bytes; // Full send
+	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else xbytes = 0; // full dirichlet
+
+	if ( comms_recv ) rbytes = bytes;
+	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else rbytes = 0;
+	
 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
 	int comm_off = u_comm_offset;

@@ -1059,49 +1095,47 @@ public:
 	assert (xmit_to_rank   != _grid->ThisRank());
 	assert (recv_from_rank != _grid->ThisRank());

-	if( comms_send ) {
-
-	  if ( !face_table_computed ) {
-	    face_table.resize(face_idx+1);
-	    std::vector<std::pair<int,int> >  face_table_host ;
-	    Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
+	if ( !face_table_computed ) {
+	  face_table.resize(face_idx+1);
+	  std::vector<std::pair<int,int> >  face_table_host ;
+	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
+	  //	  std::cout << "bytes expect "<< bytes << " " << face_table_host.size()* compress.CommDatumSize()<<std::endl;
 	    face_table[face_idx].resize(face_table_host.size());
 	    acceleratorCopyToDevice(&face_table_host[0],
 				    &face_table[face_idx][0],
 				    face_table[face_idx].size()*sizeof(face_table_host[0]));
-	  }
-
-
-	  if ( compress.DecompressionStep() ) {
-	    recv_buf=u_simd_recv_buf[0];
-	  } else {
-	    recv_buf=this->u_recv_buf_p;
-	  }
-
-	  send_buf = this->u_send_buf_p; // Gather locally, must send
-	
-	  ////////////////////////////////////////////////////////
-	  // Gather locally
-	  ////////////////////////////////////////////////////////
-	  assert(send_buf!=NULL);
-	  Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
 	}

-	int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
-	if ( (!duplicate) ) { // Force comms for now

+	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+	  recv_buf=u_simd_recv_buf[0];
+	} else {
+	  recv_buf=this->u_recv_buf_p;
+	}
+
+	////////////////////////////////////////////////////////
+	// Gather locally
+	////////////////////////////////////////////////////////
+	send_buf = this->u_send_buf_p; // Gather locally, must send
+	assert(send_buf!=NULL);
+
+	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+
+        int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
+	if ( !duplicate ) { // Force comms for now
+	  
 	  ///////////////////////////////////////////////////////////
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
-		    xmit_to_rank, comms_send,
-		    recv_from_rank, comms_recv,
-		    bytes);
+		    xmit_to_rank, comms_send|comms_partial_send,
+		    recv_from_rank, comms_recv|comms_partial_recv,
+		    xbytes,rbytes);
 	}
-	
-	if ( compress.DecompressionStep()  && comms_recv ) {
+
+	if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
 	  AddDecompress(&this->u_recv_buf_p[comm_off],
 			&recv_buf[comm_off],
 			words,Decompressions);
@@ -1109,7 +1143,6 @@ public:
 	
 	u_comm_offset+=words;
 	face_idx++;
-
      }
    }
    return 0;
@@ -1122,8 +1155,10 @@ public:

    const int maxl =2;// max layout in a direction

-    int comms_send   = this->_comms_send[point] ;
-    int comms_recv   = this->_comms_recv[point] ;
+    int comms_send   = this->_comms_send[point];
+    int comms_recv   = this->_comms_recv[point];
+    int comms_partial_send   = this->_comms_partial_send[point] ;
+    int comms_partial_recv   = this->_comms_partial_recv[point] ;

    int fd = _grid->_fdimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
@@ -1153,6 +1188,11 @@ public:

    int datum_bytes = compress.CommDatumSize();
    int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
+
+    // how many bytes on wire : partial dirichlet or dirichlet may set to < bytes
+    int xbytes; 
+    int rbytes; 
+    
    assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);

    Vector<cobj *> rpointers(maxl);
@@ -1182,22 +1222,37 @@ public:
 	if ( !face_table_computed ) {
 	  face_table.resize(face_idx+1);
 	  std::vector<std::pair<int,int> >  face_table_host ;
-				
+
 	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
 	  face_table[face_idx].resize(face_table_host.size());
 	  acceleratorCopyToDevice(&face_table_host[0],
 				  &face_table[face_idx][0],
 				  face_table[face_idx].size()*sizeof(face_table_host[0]));
+
 	}

-	if ( comms_send || comms_recv ) {
-	  Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
+	
+	if ( comms_send ) xbytes = bytes;
+	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else xbytes = 0;
+
+	if ( comms_recv ) rbytes = bytes;
+	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	else rbytes = 0;
+
+	// Gathers SIMD lanes for send and merge
+	// Different faces can be full comms or partial comms with  multiple ranks per node
+	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+
+	  int partial = partialDirichlet;
+	  compressor::Gather_plane_exchange(face_table[face_idx],rhs,
+					    spointers,dimension,sx,cbmask,
+					    compress,permute_type,partial );
 	}
 	face_idx++;

-	//spointers[0] -- low
-	//spointers[1] -- high
-
+	//spointers[0] -- low simd coor
+	//spointers[1] -- high simd coor
 	for(int i=0;i<maxl;i++){

 	  int my_coor  = rd*i + x;            // self explanatory
@@ -1222,13 +1277,16 @@ public:
 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

 	    rpointers[i] = rp;
-
-	    int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask);
+	    
+	    int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
 	    if ( !duplicate  ) { 
+	      if ( (bytes != rbytes) && (rbytes!=0) ){
+		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
+	      }
 	      AddPacket((void *)sp,(void *)rp,
-			xmit_to_rank,comms_send,
-			recv_from_rank,comms_recv,
-			bytes);
+			xmit_to_rank,comms_send|comms_partial_send,
+			recv_from_rank,comms_recv|comms_partial_recv,
+			xbytes,rbytes);
 	    }

 	  } else {
@@ -1238,7 +1296,7 @@ public:
 	  }
 	}

-	if ( comms_recv ) {
+	if ( comms_recv|comms_partial_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}