From 7eb29cf5292b9889427bd9afde7e9a95737b1ae5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 28 May 2022 15:51:34 -0700 Subject: [PATCH 1/3] MPI fix --- Grid/communicator/Communicator_mpi3.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 1e9e7840..0d0a3443 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -372,7 +372,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Sat, 28 May 2022 15:52:39 -0700 Subject: [PATCH 2/3] Extra easier signature for peek --- Grid/lattice/Lattice_peekpoke.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index 5caab214..f3b485a4 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -125,6 +125,12 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ ////////////////////////////////////////////////////////// // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// +template +typename vobj::scalar_object peekSite(const Lattice &l,const Coordinate &site){ + typename vobj::scalar_object s; + peekSite(s,l,site); + return s; +} template void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ From 34faa39f4f070a1ea230e63cc8e97976125bbdab Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 28 May 2022 17:18:08 -0700 Subject: [PATCH 3/3] Clean up Dirichlet. Big oops fix --- Grid/stencil/Stencil.h | 18 ++++++++++-------- systems/Perlmutter/dwf4.slurm | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 07598265..19eb19fb 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -665,7 +665,7 @@ public: this->_comms_recv[ii] = comm_dim; if ( block && comm_dim ) { assert(abs(displacement) < ld ); - + // Quiesce communication across block boundaries if( displacement > 0 ) { // High side, low side // | <--B--->| @@ -730,7 +730,7 @@ public: int gd = _grid->_gdimensions[dimension]; int fd = _grid->_fdimensions[dimension]; int pd = _grid->_processors [dimension]; - int ld = gd/pd; + // int ld = gd/pd; int rd = _grid->_rdimensions[dimension]; int pc = _grid->_processor_coor[dimension]; this->_permute_type[point]=_grid->PermuteType(dimension); @@ -871,12 +871,14 @@ public: for(int x=0;xPermuteType(dimension); + int permute_slice; int sx = (x+sshift)%rd; int offnode = 0; if ( simd_layout > 1 ) { + permute_slice=1; for(int i=0;i>(permute_type+1)); @@ -893,6 +895,7 @@ public: } else { int comm_proc = ((x+sshift)/rd)%pd; offnode = (comm_proc!= 0); + permute_slice=0; } int wraparound=0; @@ -906,19 +909,18 @@ public: // Wrap locally dirichlet support case OR node local if ( offnode==0 ) { - int permute_slice=0; + permute_slice=0; CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); } else { - if ( comms_recv==0 ) { + if ( comms_recv ) { - int permute_slice=1; - CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase } else { - ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); } @@ -1208,7 +1210,7 @@ public: face_table[face_idx].size()*sizeof(face_table_host[0])); } - if ( comms_send ) + if ( comms_send || comms_recv ) Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++; diff --git a/systems/Perlmutter/dwf4.slurm b/systems/Perlmutter/dwf4.slurm index 8a37a266..426573d9 100644 --- a/systems/Perlmutter/dwf4.slurm +++ b/systems/Perlmutter/dwf4.slurm @@ -19,8 +19,8 @@ export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0 export MPICH_GPU_NO_ASYNC_MEMCPY=0 #export MPICH_SMP_SINGLE_COPY_MODE=CMA -OPT="--comms-overlap --shm-mpi 1" -VOL=64.64.32.32 +OPT="--comms-sequential --shm-mpi 1" +VOL=64.64.64.64 srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.1.1 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.1.1.4 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT #srun ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.8 --grid $VOL --accelerator-threads 8 --shm 2048 $OPT