Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet

2025-12-23 14:14:30 +00:00 · 2022-05-29 11:08:09 -04:00
parent f729b9b889 34faa39f4f
commit 6a1a198144
4 changed files with 20 additions and 12 deletions
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -372,7 +372,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  double off_node_bytes=0.0;
  int tag;

-  if ( dox ) {
+  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
      ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
@@ -382,7 +382,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    }
  }
  
-  if (dor) {
+  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
      ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -125,6 +125,12 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
+template<class vobj> // Convenience overload: returns the peeked scalar object by value
+typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
+  typename vobj::scalar_object s;
+  peekSite(s,l,site); // delegate to the out-parameter overload declared just below
+  return s;
+}        
 template<class vobj,class sobj>
 void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
        
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -665,7 +665,7 @@ public:
      this->_comms_recv[ii] = comm_dim;
      if ( block && comm_dim ) {
 	assert(abs(displacement) < ld );
-      
+	// Quiesce communication across block boundaries
 	if( displacement > 0 ) {
 	  // High side, low side
 	  // | <--B--->|
@@ -730,7 +730,7 @@ public:
      int gd = _grid->_gdimensions[dimension];
      int fd = _grid->_fdimensions[dimension];
      int pd = _grid->_processors [dimension];
-      int ld = gd/pd;
+      //      int ld = gd/pd;
      int rd = _grid->_rdimensions[dimension];
      int pc = _grid->_processor_coor[dimension];
      this->_permute_type[point]=_grid->PermuteType(dimension);
@@ -871,12 +871,14 @@ public:
    for(int x=0;x<rd;x++){

      int permute_type=grid->PermuteType(dimension);
+      int permute_slice;

      int sx        =  (x+sshift)%rd;

      int offnode = 0;
      if ( simd_layout > 1 ) {

+	permute_slice=1;
 	for(int i=0;i<Nsimd;i++){

 	  int inner_bit = (Nsimd>>(permute_type+1));
@@ -893,6 +895,7 @@ public:
      } else {
 	int comm_proc = ((x+sshift)/rd)%pd;
 	offnode = (comm_proc!= 0);
+	permute_slice=0;
      }

      int wraparound=0;
@@ -906,19 +909,18 @@ public:
      // Wrap locally dirichlet support case OR node local
      if ( offnode==0 ) {

-	int permute_slice=0;
+	permute_slice=0;
 	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
 	
      } else {

-	if ( comms_recv==0 ) {
+	if ( comms_recv ) {

-	  int permute_slice=1;
-	  CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
+	  ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase

 	} else { 

-	  ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase
+	  CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);

 	}

@@ -1208,7 +1210,7 @@ public:
 				  face_table[face_idx].size()*sizeof(face_table_host[0]));
 	}

-	if ( comms_send )
+	if ( comms_send || comms_recv )
 	  Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
 	face_idx++;

--- a/systems/Perlmutter/dwf4.slurm
+++ b/systems/Perlmutter/dwf4.slurm
@@ -19,8 +19,8 @@ export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
 export MPICH_GPU_NO_ASYNC_MEMCPY=0
 #export MPICH_SMP_SINGLE_COPY_MODE=CMA

-OPT="--comms-overlap --shm-mpi 1"
-VOL=64.64.32.32
+OPT="--comms-sequential --shm-mpi 1"
+VOL=64.64.64.64
 srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.1.1 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
 #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.1.1.4 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT
 #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.8 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT