mirror of https://github.com/paboyle/Grid.git (synced 2025-04-25 21:25:56 +01:00)

commit eeb6e0a6e3 (parent cad5b187dd)
Re-enable cache blocking and efficient UPI-type SHM comms
@@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   _tmp(&FiveDimRedBlackGrid),
   Dirichlet(0)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
+
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);
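Note: the new Stencil.lo / StencilEven.lo / StencilOdd.lo pointers hand each stencil the Lebesgue-ordered site list the operator already builds, so the kernels can visit sites in a cache-blocked rather than lexicographic order; the 4D WilsonFermion constructor in the next hunk gets the same wiring. As a rough illustration of the idea only (a sketch, not Grid's LebesgueOrder), a Morton/Z-curve permutation keeps spatially nearby sites adjacent in traversal order, so stencil neighbours tend to remain in cache:

// Minimal sketch of a Morton (Z-curve) style site reordering, the idea behind
// cache-blocked traversal. Illustrative only; Grid's LebesgueOrder differs in detail.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

static inline uint64_t interleave2(uint32_t x, uint32_t y) {
  // Interleave the bits of (x,y) into a single Z-curve key.
  uint64_t z = 0;
  for (int b = 0; b < 32; b++) {
    z |= ((uint64_t)((x >> b) & 1u)) << (2 * b);
    z |= ((uint64_t)((y >> b) & 1u)) << (2 * b + 1);
  }
  return z;
}

// Build a permutation of site indices on an Lx x Ly slab, ordered by Morton key.
std::vector<int> MortonOrder(int Lx, int Ly) {
  std::vector<std::pair<uint64_t,int>> keyed;
  keyed.reserve((size_t)Lx * Ly);
  for (int y = 0; y < Ly; y++)
    for (int x = 0; x < Lx; x++)
      keyed.push_back({interleave2(x, y), y * Lx + x});
  std::sort(keyed.begin(), keyed.end());           // sort sites by Z-curve key
  std::vector<int> order(keyed.size());
  for (size_t i = 0; i < keyed.size(); i++) order[i] = keyed[i].second;
  return order;                                    // order[sss] = reordered site index
}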
@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
   _tmp(&Hgrid),
   anisotropyCoeff(anis)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
   // Allocate the required comms buffer
   ImportGauge(_Umu);
   if (anisotropyCoeff.isAnisotropic){
@@ -433,7 +433,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   });

 #define ASM_CALL(A)                                             \
-  thread_for( ss, Nsite, {                                      \
+  thread_for( sss, Nsite, {                                     \
+    int ss = st.lo->Reorder(sss);                               \
     int sU = ss;                                                \
     int sF = ss*Ls;                                             \
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
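This macro change is where the cache blocking reaches the assembler kernel path: the outer thread_for now runs over a dummy index sss and maps it through st.lo->Reorder(sss), while the rest of the macro body is untouched. A minimal sketch of the same pattern with a generic permutation table (hypothetical names, not Grid code):

// Sketch: apply a precomputed site permutation inside a parallel site loop.
// "reorder" plays the role of st.lo->Reorder(); only the visit order changes,
// the per-site work (sU, sF, kernel call) is exactly as before.
#include <vector>

void ApplyStencil(int Nsite, int Ls, const std::vector<int>& reorder,
                  void (*kernel)(int sF, int sU)) {
  #pragma omp parallel for
  for (int sss = 0; sss < Nsite; sss++) {
    int ss = reorder[sss];   // cache-blocked visit order
    int sU = ss;             // gauge-field site index
    int sF = ss * Ls;        // fermion-field site index (5th dimension innermost)
    kernel(sF, sU);
  }
}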
@@ -290,9 +290,9 @@ public:

 protected:
   GridBase * _grid;

 public:
   GridBase *Grid(void) const { return _grid; }
+  LebesgueOrder *lo;

   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
@@ -337,6 +337,7 @@ public:
   ////////////////////////////////////////
   // Stencil query
   ////////////////////////////////////////
+#ifdef SHM_FAST_PATH
   inline int SameNode(int point) {

     int dimension = this->_directions[point];
@@ -356,7 +357,40 @@ public:
     if ( displacement == 0 ) return 1;
     return 0;
   }
+#else
+  //
+  inline int SameNode(int point) {
+
+    int dimension    = this->_directions[point];
+    int displacement = this->_distances[point];
+
+    int pd = _grid->_processors[dimension];
+    int fd = _grid->_fdimensions[dimension];
+    int ld = _grid->_ldimensions[dimension];
+    int rd = _grid->_rdimensions[dimension];
+    int simd_layout = _grid->_simd_layout[dimension];
+    int comm_dim    = _grid->_processors[dimension] >1 ;
+
+    int recv_from_rank;
+    int xmit_to_rank;
+
+    if ( ! comm_dim ) return 1;
+
+    int nbr_proc;
+    if (displacement>0) nbr_proc = 1;
+    else                nbr_proc = pd-1;
+
+    // FIXME this logic needs to be sorted for three link term
+    // assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
+
+    if ( (shm==NULL) ) return 0;
+    return 1;
+  }
+#endif
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   // Use OpenMP Tasks for cleaner ???
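The non-fast-path SameNode variant classifies each stencil leg by asking whether the neighbouring rank's receive buffer can be translated into a local shared-memory alias: ShmBufferTranslate returning NULL means the neighbour sits on another node and the leg genuinely needs the network. A self-contained sketch of the same node-locality test written with plain MPI (illustrative only, not Grid's mechanism):

// Sketch: decide whether a peer rank lives on the same node using an MPI
// shared-memory communicator. This is the generic idea behind a NULL /
// non-NULL ShmBufferTranslate result; it is not Grid's implementation.
#include <mpi.h>
#include <vector>

int PeerIsOnNode(MPI_Comm world, int peer_rank) {
  MPI_Comm node_comm;
  MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);

  int node_size, world_rank;
  MPI_Comm_size(node_comm, &node_size);
  MPI_Comm_rank(world, &world_rank);

  // Gather the world ranks present on this node and look for the peer.
  std::vector<int> node_world_ranks(node_size);
  MPI_Allgather(&world_rank, 1, MPI_INT,
                node_world_ranks.data(), 1, MPI_INT, node_comm);

  int on_node = 0;
  for (int r : node_world_ranks)
    if (r == peer_rank) on_node = 1;

  MPI_Comm_free(&node_comm);
  return on_node;
}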
@@ -1127,11 +1161,32 @@ public:
       recv_buf=this->u_recv_buf_p;
     }

+    // potential SHM fast path for intranode
+    int shm_send=0;
+    int shm_recv=0;
+#ifdef SHM_FAST_PATH
+    // Put directly in place if we can
+    send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
+    if ( (send_buf==NULL) ) {
+      shm_send=0;
+      send_buf = this->u_send_buf_p;
+    } else {
+      shm_send=1;
+    }
+    void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
+    if ( test_ptr != NULL ) shm_recv = 1;
+    // static int printed;
+    // if (!printed){
+    //   std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+    //   printed = 1;
+    // }
+#else
     ////////////////////////////////////////////////////////
     // Gather locally
     ////////////////////////////////////////////////////////
     send_buf = this->u_send_buf_p; // Gather locally, must send
     assert(send_buf!=NULL);
+#endif
+
     // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
     compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
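With SHM_FAST_PATH enabled, this gather tries to deposit the face directly into the neighbour's receive buffer through its shared-memory alias and falls back to the rank's own send buffer (and later MPI) when the translation fails; shm_send / shm_recv record which side the fast path covered. A minimal sketch of the underlying mechanism using MPI-3 shared windows (illustrative only; buffer sizes and names are made up):

// Sketch: an MPI-3 shared window gives each rank a direct pointer into a
// node-neighbour's receive buffer, the mechanism a ShmBufferTranslate-style
// fast path relies on. Not Grid code; purely an illustration.
#include <mpi.h>
#include <cstring>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  // Ranks sharing a node get their own communicator and one shared window.
  MPI_Comm node_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &node_comm);
  int rank, size;
  MPI_Comm_rank(node_comm, &rank);
  MPI_Comm_size(node_comm, &size);

  const MPI_Aint bytes = 1024;            // each rank's "receive buffer"
  char *my_recv_buf = nullptr;
  MPI_Win win;
  MPI_Win_allocate_shared(bytes, 1, MPI_INFO_NULL, node_comm,
                          &my_recv_buf, &win);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

  // "Translate": obtain a direct pointer into the next rank's buffer, then
  // deposit the halo with a plain memset/memcpy instead of an MPI_Isend.
  int peer = (rank + 1) % size;
  MPI_Aint peer_bytes; int disp_unit; char *peer_recv_buf = nullptr;
  MPI_Win_shared_query(win, peer, &peer_bytes, &disp_unit, &peer_recv_buf);
  std::memset(peer_recv_buf, rank, 64);   // direct intranode "send"

  MPI_Win_sync(win);                      // flush local stores to the window
  MPI_Barrier(node_comm);                 // everyone has deposited
  MPI_Win_sync(win);                      // make peers' deposits visible
  MPI_Win_unlock_all(win);

  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}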
@@ -1143,10 +1198,13 @@ public:
     // Build a list of things to do after we synchronise GPUs
     // Start comms now???
     ///////////////////////////////////////////////////////////
+    int do_send = (comms_send|comms_partial_send) && (!shm_send );
+    int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+
     AddPacket((void *)&send_buf[comm_off],
               (void *)&recv_buf[comm_off],
-              xmit_to_rank, comms_send|comms_partial_send,
-              recv_from_rank, comms_recv|comms_partial_recv,
+              xmit_to_rank, do_send,
+              recv_from_rank, do_recv,
               xbytes,rbytes);
   }

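Design note: with the do_send / do_recv flags, a stencil leg only queues MPI traffic when a transfer is wanted and the shared-memory fast path has not already covered it (data deposited straight into the neighbour's receive buffer, or read in place from the neighbour's send buffer). With SHM_FAST_PATH disabled, shm_send and shm_recv stay zero and every requested transfer is still queued over MPI as before.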
@@ -1288,19 +1346,47 @@ public:

       int recv_from_rank;
       int xmit_to_rank;
+      int shm_send=0;
+      int shm_recv=0;
       _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+#ifdef SHM_FAST_PATH
+#warning STENCIL SHM FAST PATH SELECTED
+      // shm == receive pointer         if offnode
+      // shm == Translate[send pointer] if on node -- my view of his send pointer
+      cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+      if (shm==NULL) {
+        shm = rp;
+        // we found a packet that comes from MPI and contributes to this shift.
+        // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
+        // Kernel will add the exterior_terms except if is_same_node.
+        // leg of stencil
+        shm_recv=0;
+      } else {
+        shm_recv=1;
+      }
+      rpointers[i] = shm;
+      // Test send side
+      void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
+      if ( test_ptr != NULL ) shm_send = 1;
+      // static int printed;
+      // if (!printed){
+      //   std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+      //   printed = 1;
+      // }
+#else
      rpointers[i] = rp;
+#endif

      int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
      if ( !duplicate ) {
        if ( (bytes != rbytes) && (rbytes!=0) ){
          acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
        }
+        int do_send = (comms_send|comms_partial_send) && (!shm_send );
+        int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
        AddPacket((void *)sp,(void *)rp,
-                 xmit_to_rank,comms_send|comms_partial_send,
-                 recv_from_rank,comms_recv|comms_partial_recv,
+                 xmit_to_rank,do_send,
+                 recv_from_rank,do_send,
                  xbytes,rbytes);
      }

@@ -1310,7 +1396,7 @@ public:

        }
      }
+     // rpointer may be doing a remote read in the gather over SHM
      if ( comms_recv|comms_partial_recv ) {
        AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
      }
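Note that in the SIMD gather the fast path redirects the merge source rather than the packet destination: rpointers[i] ends up pointing either at the translated view of an on-node neighbour's send buffer or at the local MPI receive buffer, and the later AddMerge assembles the halo from whichever mix results; that is what the new comment about the rpointer doing a remote read over SHM refers to.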
@@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in
      ;;
    *) ;;
 esac
+############### force MPI in SMP
+AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no])
+case ${ac_SHM_FAST_PATH} in
+   yes)
+     AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] )
+     ;;
+   *) ;;
+esac
+
 ############### communication type selection
 AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
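Usage note: the new switch defaults to off, so behaviour is unchanged unless the library is configured with --enable-shm-fast-path, in which case SHM_FAST_PATH is defined and the #ifdef SHM_FAST_PATH branches above are compiled in, letting kernels copy halo data directly between ranks on the same node instead of staging it through MPI.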