From 6f5a5cd9b3269932a720804aebe8b7046d4b68fe Mon Sep 17 00:00:00 2001
From: paboyle
Date: Wed, 28 Jun 2017 23:27:02 +0100
Subject: [PATCH] Improved threaded comms benchmark

---
 TODO                          | 11 ++--
 benchmarks/Benchmark_comms.cc | 94 +++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/TODO b/TODO
index 001c6c0c..3d29215e 100644
--- a/TODO
+++ b/TODO
@@ -2,10 +2,13 @@ TODO:
 ---------------
 Large item work list:
 
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+1)- I/O; There appear to be issues with MPI IO and NERSC with large files.
+    Possible 2GB limit reappeared. GPFS driver in Intel MPI.
+
+2)- BG/Q port and check
+
+3)- Christoph's local basis expansion Lanczos; port to use Lattice_transfer features
 
-2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
 4)- Precision conversion and sort out localConvert <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
 5)- Physical propagator interface
@@ -14,6 +17,8 @@ Large item work list:
 8)- HDCR resume
 
 Recent DONE
+
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location <-- DONE
 -- Scidac and Ildg metadata handling <-- DONE
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 753b8a58..698f9d25 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -435,6 +435,100 @@ int main (int argc, char ** argv)
     }
   }
 
+
+
+
+  std::cout<
+
+  for(int lat=4;lat<=maxlat;lat+=4){
+  for(int Ls=8;Ls<=8;Ls*=2){
+
+     std::vector<int> latt_size  ({lat*mpi_layout[0],
+                                   lat*mpi_layout[1],
+                                   lat*mpi_layout[2],
+                                   lat*mpi_layout[3]});
+
+     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+     RealD Nrank = Grid._Nprocessors;
+     RealD Nnode = Grid.NodeCount();
+     RealD ppn = Nrank/Nnode;
+
+     std::vector<HalfSpinColourVectorD *> xbuf(8);
+     std::vector<HalfSpinColourVectorD *> rbuf(8);
+     Grid.ShmBufferFreeAll();
+     for(int d=0;d<8;d++){
+       xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+       rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+       bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+       bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+     }
+
+     int ncomm;
+     int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+     double dbytes;
+     for(int i=0;i<Nloop;i++){
+       double start=usecond();
+
+       std::vector<CartesianCommunicator::CommsRequest_t> requests;
+       dbytes=0;
+       ncomm=0;
+
+       parallel_for(int dir=0;dir<8;dir++){
+
+         double tbytes;
+         int mu =dir % 4;
+
+         if (mpi_layout[mu]>1 ) {
+
+           ncomm++;
+           int xmit_to_rank;
+           int recv_from_rank;
+           if ( dir == mu ) {
+             int comm_proc=1;
+             Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+           } else {
+             int comm_proc = mpi_layout[mu]-1;
+             Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+           }
+           tbytes= Grid.StencilSendToRecvFromBegin(requests,
+                                                   (void *)&xbuf[dir][0],
+                                                   xmit_to_rank,
+                                                   (void *)&rbuf[dir][0],
+                                                   recv_from_rank,
+                                                   bytes,dir);
+           Grid.StencilSendToRecvFromComplete(requests,dir);
+           requests.resize(0);
+
+#pragma omp atomic
+           dbytes+=tbytes;
+         }
+       }
+       Grid.Barrier();
+       double stop=usecond();
+       t_time[i] = stop-start; // microseconds
+     }
+
+     timestat.statistics(t_time);
+
+     dbytes=dbytes*ppn;
+     double xbytes    = dbytes*0.5;
+     double rbytes    = dbytes*0.5;
+     double bidibytes = dbytes;
+
+
+     std::cout<
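
Note on the neighbour mapping in the added benchmark loop: each direction dir in [0,8) addresses dimension mu = dir % 4, with dir == mu taking the forward neighbour (shift +1) and dir == mu+4 taking the backward neighbour (shift mpi_layout[mu]-1, i.e. -1 with periodic wrap); dimensions with mpi_layout[mu] == 1 are skipped. The sketch below illustrates only that mapping, outside Grid. It assumes a lexicographic rank ordering on the process grid, which is an assumption; Grid's CartesianCommunicator::ShiftedRanks may order ranks differently.

  #include <cstdio>
  #include <vector>

  // Hypothetical stand-in for CartesianCommunicator::ShiftedRanks: maps a
  // process-grid coordinate to a rank, assuming lexicographic ordering.
  static int rank_of(const std::vector<int> &coor, const std::vector<int> &layout) {
    int rank = 0;
    for (int d = (int)layout.size() - 1; d >= 0; d--) rank = rank * layout[d] + coor[d];
    return rank;
  }

  int main(void) {
    std::vector<int> mpi_layout = {2, 2, 2, 2};  // example 16-rank decomposition
    std::vector<int> me         = {0, 0, 0, 0};  // this rank's coordinate

    for (int dir = 0; dir < 8; dir++) {
      int mu = dir % 4;
      if (mpi_layout[mu] == 1) continue;         // no comms in unpartitioned dims
      // dir < 4: shift by +1; dir >= 4: shift by layout-1, i.e. -1 modulo layout
      int comm_proc = (dir == mu) ? 1 : mpi_layout[mu] - 1;

      std::vector<int> to = me, from = me;
      to[mu]   = (me[mu] + comm_proc) % mpi_layout[mu];                   // xmit_to_rank
      from[mu] = (me[mu] - comm_proc + mpi_layout[mu]) % mpi_layout[mu];  // recv_from_rank

      printf("dir %d: mu %d  send -> rank %d  recv <- rank %d\n",
             dir, mu, rank_of(to, mpi_layout), rank_of(from, mpi_layout));
    }
    return 0;
  }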
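The parallel_for over directions is an OpenMP worksharing loop in Grid, so each direction's send/receive pair is driven by its own thread and the per-thread byte counts are folded into a shared total under #pragma omp atomic. Below is a stripped-down sketch of that timing-and-accumulation pattern; do_halo_exchange is a hypothetical placeholder standing in for StencilSendToRecvFromBegin/Complete, and the byte count is a made-up example.

  #include <cstdio>
  #include <omp.h>

  // Hypothetical stand-in for the per-direction stencil exchange; returns the
  // number of bytes it moved. In the benchmark this role is played by
  // StencilSendToRecvFromBegin/Complete on shared-memory-window buffers.
  static double do_halo_exchange(int dir, double bytes) {
    (void)dir;
    return bytes;
  }

  int main(void) {
    const int    ndir  = 8;       // +/- neighbours in 4 dimensions
    const double bytes = 3.1e6;   // example per-direction message size
    double dbytes = 0;
    int    ncomm  = 0;

    double start = omp_get_wtime();
  #pragma omp parallel for
    for (int dir = 0; dir < ndir; dir++) {
      double tbytes = do_halo_exchange(dir, bytes);  // one thread per direction
  #pragma omp atomic
      dbytes += tbytes;                              // race-free shared accumulation
  #pragma omp atomic
      ncomm++;
    }
    double stop = omp_get_wtime();

    // the benchmark keeps its times in microseconds (usecond()); same here
    printf("%d exchanges, %.0f bytes in %.3f us\n",
           ncomm, dbytes, (stop - start) * 1.0e6);
    return 0;
  }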
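On the reporting: the per-direction message is bytes = lat^3 * Ls * sizeof(HalfSpinColourVectorD); taking a half spinor as 2 spin x 3 colour complex doubles (96 bytes per site) gives roughly 3.1 MB per direction at lat=16, Ls=8. The accumulated dbytes is scaled by ranks-per-node (ppn), half of it is counted as send traffic (xbytes, with rbytes the matching receive half) and the full amount as bidirectional, and the rates come out directly in MB/s because the times are in microseconds. The sketch below reproduces that arithmetic with a stand-in for Grid's time_statistics; the mean/err/min/max members, the standard-error formula, and the error propagation onto the rate are assumptions about its interface, and the timings are made up.

  #include <algorithm>
  #include <cmath>
  #include <cstdio>
  #include <vector>

  // Hypothetical stand-in for Grid's time_statistics: mean, standard error,
  // min and max of the per-iteration timings (microseconds).
  struct time_statistics_sketch {
    double mean, err, min, max;
    void statistics(const std::vector<double> &t) {
      mean = 0; for (double x : t) mean += x; mean /= t.size();
      double var = 0; for (double x : t) var += (x - mean) * (x - mean);
      var /= t.size();
      err = std::sqrt(var / t.size());     // standard error of the mean (assumed)
      min = *std::min_element(t.begin(), t.end());
      max = *std::max_element(t.begin(), t.end());
    }
  };

  int main(void) {
    int    lat = 16, Ls = 8, ncomm = 8;                 // all four dims partitioned
    double ppn = 2.0;                                   // ranks per node (example)
    double bytes = double(lat)*lat*lat*Ls*96;           // 96 B per half-spinor site (assumed)
    std::vector<double> t_time = {1500.0, 1450.0, 1520.0, 1480.0};  // made-up timings, usec

    time_statistics_sketch timestat;
    timestat.statistics(t_time);

    double dbytes    = bytes * ncomm * ppn;  // assumes the stencil call reports `bytes` per direction
    double xbytes    = dbytes * 0.5;         // send half (receive half is the same)
    double bidibytes = dbytes;               // send + receive

    // time is in microseconds, so bytes/usec is numerically MB/s
    printf("MB/s uni  : %.1f (err %.1f, min %.1f, max %.1f)\n",
           xbytes/timestat.mean, xbytes*timestat.err/(timestat.mean*timestat.mean),
           xbytes/timestat.max,  xbytes/timestat.min);
    printf("MB/s bidi : %.1f\n", bidibytes/timestat.mean);
    return 0;
  }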