diff --git a/TODO b/TODO index cccc5f45..c37cbf8b 100644 --- a/TODO +++ b/TODO @@ -6,6 +6,7 @@ Large item work list: 1)- BG/Q port and check 2)- Christoph's local basis expansion Lanczos 3)- Precision conversion and sort out localConvert <-- partial + - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet 4)- Physical propagator interface 5)- Conserved currents @@ -13,7 +14,8 @@ Large item work list: 7)- HDCR resume Recent DONE --- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O + +-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE -- GaugeFix into central location <-- DONE -- Scidac and Ildg metadata handling <-- DONE diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 91524149..c0ce451f 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -32,6 +32,19 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; +typedef WilsonFermion5D WilsonFermion5DR; +typedef WilsonFermion5D WilsonFermion5DF; +typedef WilsonFermion5D WilsonFermion5DD; + + +std::vector L_list; +std::vector Ls_list; +std::vector mflop_list; + +double mflop_ref; +double mflop_ref_err; + +int NN_global; struct time_statistics{ double mean; @@ -95,13 +108,15 @@ public: static void Comms(void) { - int Nloop=100; + int Nloop=200; int nmu=0; int maxlat=32; std::vector simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); + for(int mu=0;mu1) nmu++; + std::vector t_time(Nloop); time_statistics timestat; @@ -133,13 +148,14 @@ public: bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } - int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + int ncomm; double dbytes; + std::vector times(Nloop); for(int i=0;i requests; dbytes=0; ncomm=0; @@ -150,7 +166,6 @@ public: if (mpi_layout[mu]>1 ) { - ncomm++; int xmit_to_rank; int recv_from_rank; if ( dir == mu ) { @@ -160,18 +175,18 @@ public: int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } -#if 0 - tbytes= Grid.StencilSendToRecvFromBegin(requests, - (void *)&xbuf[dir][0], - xmit_to_rank, - (void *)&rbuf[dir][0], - recv_from_rank, - bytes,dir); - Grid.StencilSendToRecvFromComplete(requests,dir); -#endif - requests.resize(0); - + tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, + bytes,dir); + +#ifdef GRID_OMP #pragma omp atomic +#endif + ncomm++; + +#ifdef GRID_OMP +#pragma omp atomic +#endif dbytes+=tbytes; } } @@ -181,13 +196,15 @@ public: } timestat.statistics(t_time); + // for(int i=0;i({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=4){ @@ -253,8 +271,7 @@ public: } }; - - static void DWF(int Ls,int L) + static double DWF5(int Ls,int L) { RealD mass=0.1; RealD M5 =1.8; @@ -262,6 +279,7 @@ public: double mflops; double mflops_best = 0; double mflops_worst= 0; + std::vector mflops_all; /////////////////////////////////////////////////////// // Set/Get the layout & grid size @@ -274,6 +292,189 @@ public: GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + std::vector internal; + if ( SHM == 1 ) internal = std::vector({1,1,1,1}); + else if ( SHM == 2 ) internal = std::vector({2,1,1,1}); + else if ( SHM == 4 ) internal = std::vector({2,2,1,1}); + else if ( SHM == 8 ) internal = std::vector({2,2,2,1}); + else assert(0); + + std::vector nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); + std::vector latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + ///////// Source preparation //////////// + LatticeFermion src (sFGrid); random(RNG5,src); + LatticeFermion tmp (sFGrid); + + RealD N2 = 1.0/::sqrt(norm2(src)); + src = src*N2; + + LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + + WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); + LatticeFermion src_e (sFrbGrid); + LatticeFermion src_o (sFrbGrid); + LatticeFermion r_e (sFrbGrid); + LatticeFermion r_o (sFrbGrid); + LatticeFermion r_eo (sFGrid); + LatticeFermion err (sFGrid); + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + +#if defined(AVX512) + const int num_cases = 6; + std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); +#else + const int num_cases = 4; + std::string fmt("U/S ; U/O ; G/S ; G/O "); +#endif + controls Cases [] = { +#ifdef AVX512 + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, +#endif + { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // if (ncall < 500) ncall = 500; + uint64_t ncall = 500; + + sFGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + std::vector mpi = GridDefaultMpi(); assert(mpi.size()==4); + std::vector local({L,L,L,L}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector({64,64,64,64}), + GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; uint64_t SHM=NP/NN; std::vector internal; @@ -364,13 +565,15 @@ public: #if defined(AVX512) const int num_cases = 6; + std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); #else const int num_cases = 4; + std::string fmt("U/S ; U/O ; G/S ; G/O "); #endif controls Cases [] = { -#if defined(AVX512) - { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, +#ifdef AVX512 { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, #endif { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, @@ -380,6 +583,10 @@ public: for(int c=0;cBarrier(); for(int i=0;iBarrier(); double t1=usecond(); - uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); + // if (ncall < 500) ncall = 500; + uint64_t ncall = 1000; + FGrid->Broadcast(0,&ncall,sizeof(ncall)); // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<mflops_best ) mflops_best = mflops; @@ -450,12 +656,20 @@ public: } std::cout<({8,2,2,2}); +#else LebesgueOrder::Block = std::vector({2,2,2,2}); - +#endif Benchmark::Decomposition(); int do_memory=1; @@ -493,26 +710,66 @@ int main (int argc, char ** argv) // empty for now } + int sel=2; + std::vector L_list({8,12,16,24}); + std::vector wilson; + std::vector dwf4; + std::vector dwf5; + if ( do_wilson ) { int Ls=1; std::cout<1) nmu++; std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; @@ -80,7 +80,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -92,11 +92,16 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); - std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); + std::vector > xbuf(8); + std::vector > rbuf(8); int ncomm; int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat,lat,lat,lat}); @@ -172,9 +176,14 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - std::vector > xbuf(8,std::vector(lat*lat*lat*Ls)); - std::vector > rbuf(8,std::vector(lat*lat*lat*Ls)); + std::vector > xbuf(8); + std::vector > rbuf(8); + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -299,7 +308,7 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, - bytes); + bytes,mu); comm_proc = mpi_layout[mu]-1; @@ -310,11 +319,11 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu+4][0], recv_from_rank, - bytes); + bytes,mu+4); } } - Grid.StencilSendToRecvFromComplete(requests); + Grid.StencilSendToRecvFromComplete(requests,0); Grid.Barrier(); double stop=usecond(); t_time[i] = stop-start; // microseconds @@ -346,7 +355,7 @@ int main (int argc, char ** argv) header(); for(int lat=4;lat<=maxlat;lat+=4){ - for(int Ls=8;Ls<=32;Ls*=2){ + for(int Ls=8;Ls<=8;Ls*=2){ std::vector latt_size ({lat*mpi_layout[0], lat*mpi_layout[1], @@ -393,8 +402,8 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu][0], recv_from_rank, - bytes); - Grid.StencilSendToRecvFromComplete(requests); + bytes,mu); + Grid.StencilSendToRecvFromComplete(requests,mu); requests.resize(0); comm_proc = mpi_layout[mu]-1; @@ -406,8 +415,8 @@ int main (int argc, char ** argv) xmit_to_rank, (void *)&rbuf[mu+4][0], recv_from_rank, - bytes); - Grid.StencilSendToRecvFromComplete(requests); + bytes,mu+4); + Grid.StencilSendToRecvFromComplete(requests,mu+4); requests.resize(0); } @@ -436,5 +445,97 @@ int main (int argc, char ** argv) } } + + + std::cout< latt_size ({lat*mpi_layout[0], + lat*mpi_layout[1], + lat*mpi_layout[2], + lat*mpi_layout[3]}); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + RealD Nrank = Grid._Nprocessors; + RealD Nnode = Grid.NodeCount(); + RealD ppn = Nrank/Nnode; + + std::vector xbuf(8); + std::vector rbuf(8); + Grid.ShmBufferFreeAll(); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + int ncomm; + int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + double dbytes; + for(int i=0;i requests; + dbytes=0; + ncomm=0; + + parallel_for(int dir=0;dir<8;dir++){ + + double tbytes; + int mu =dir % 4; + + if (mpi_layout[mu]>1 ) { + + ncomm++; + int xmit_to_rank; + int recv_from_rank; + if ( dir == mu ) { + int comm_proc=1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } else { + int comm_proc = mpi_layout[mu]-1; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + } + + tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, bytes,dir); + +#pragma omp atomic + dbytes+=tbytes; + } + } + Grid.Barrier(); + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } + + timestat.statistics(t_time); + + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double rbytes = dbytes*0.5; + double bidibytes = dbytes; + + + std::cout< +#include namespace Grid { @@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) { return NULL; } + +void check_huge_pages(void *Buf,uint64_t BYTES) +{ +#ifdef __linux__ + int fd = open("/proc/self/pagemap", O_RDONLY); + assert(fd >= 0); + const int page_size = 4096; + uint64_t virt_pfn = (uint64_t)Buf / page_size; + off_t offset = sizeof(uint64_t) * virt_pfn; + uint64_t npages = (BYTES + page_size-1) / page_size; + uint64_t pagedata[npages]; + uint64_t ret = lseek(fd, offset, SEEK_SET); + assert(ret == offset); + ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + assert(ret == sizeof(uint64_t) * npages); + int nhugepages = npages / 512; + int n4ktotal, nnothuge; + n4ktotal = 0; + nnothuge = 0; + for (int i = 0; i < nhugepages; ++i) { + uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; + for (int j = 0; j < 512; ++j) { + uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; + ++n4ktotal; + if (pageaddr != baseaddr + j * page_size) + ++nnothuge; + } + } + int rank = CartesianCommunicator::RankWorld(); + printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); +#endif +} + } diff --git a/lib/allocator/AlignedAllocator.h b/lib/allocator/AlignedAllocator.h index db86c435..e64a5949 100644 --- a/lib/allocator/AlignedAllocator.h +++ b/lib/allocator/AlignedAllocator.h @@ -64,6 +64,8 @@ namespace Grid { }; + void check_huge_pages(void *Buf,uint64_t BYTES); + //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. //////////////////////////////////////////////////////////////////// @@ -92,12 +94,20 @@ public: size_type bytes = __n*sizeof(_Tp); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); - + // if ( ptr != NULL ) + // std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr < *************************************************************************************/ /* END LEGAL */ #include +#include +#include +#include +#include namespace Grid { @@ -34,7 +38,10 @@ namespace Grid { /////////////////////////////////////////////////////////////// void * CartesianCommunicator::ShmCommBuf; uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024; -CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; +CartesianCommunicator::CommunicatorPolicy_t +CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; +int CartesianCommunicator::nCommThreads = -1; +int CartesianCommunicator::Hugepages = 0; ///////////////////////////////// // Alloc, free shmem region @@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) GlobalSumVector((double *)c,2*N); } -#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) +#if !defined( GRID_COMMS_MPI3) int CartesianCommunicator::NodeCount(void) { return ProcessorCount();}; int CartesianCommunicator::RankCount(void) { return ProcessorCount();}; - -double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes) +#endif +#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) +double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes, int dir) { + std::vector list; + // Discard the "dir" + SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); + SendToRecvFromComplete(list); + return 2.0*bytes; +} +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes, int dir) +{ + // Discard the "dir" SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); return 2.0*bytes; } -void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { SendToRecvFromComplete(waitall); } +#endif + +#if !defined( GRID_COMMS_MPI3) + void CartesianCommunicator::StencilBarrier(void){}; commVector CartesianCommunicator::ShmBufStorageVector; @@ -121,8 +146,22 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { return NULL; } void CartesianCommunicator::ShmInitGeneric(void){ +#if 1 + + int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; +#ifdef MAP_HUGETLB + if ( Hugepages ) mmap_flag |= MAP_HUGETLB; +#endif + ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); + if (ShmCommBuf == (void *)MAP_FAILED) { + perror("mmap failed "); + exit(EXIT_FAILURE); + } +#else ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); ShmCommBuf=(void *)&ShmBufStorageVector[0]; +#endif + bzero(ShmCommBuf,MAX_MPI_SHM_BYTES); } #endif diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h index 12a8429f..ac866ced 100644 --- a/lib/communicator/Communicator_base.h +++ b/lib/communicator/Communicator_base.h @@ -38,7 +38,7 @@ Author: Peter Boyle #ifdef GRID_COMMS_MPI3 #include #endif -#ifdef GRID_COMMS_MPI3L +#ifdef GRID_COMMS_MPIT #include #endif #ifdef GRID_COMMS_SHMEM @@ -50,12 +50,24 @@ namespace Grid { class CartesianCommunicator { public: - // 65536 ranks per node adequate for now + + //////////////////////////////////////////// + // Isend/Irecv/Wait, or Sendrecv blocking + //////////////////////////////////////////// + enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; + static CommunicatorPolicy_t CommunicatorPolicy; + static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } + + /////////////////////////////////////////// + // Up to 65536 ranks per node adequate for now // 128MB shared memory for comms enought for 48^4 local vol comms // Give external control (command line override?) of this - - static const int MAXLOG2RANKSPERNODE = 16; - static uint64_t MAX_MPI_SHM_BYTES; + /////////////////////////////////////////// + static const int MAXLOG2RANKSPERNODE = 16; + static uint64_t MAX_MPI_SHM_BYTES; + static int nCommThreads; + // use explicit huge pages + static int Hugepages; // Communicator should know nothing of the physics grid, only processor grid. int _Nprocessors; // How many in all @@ -64,14 +76,18 @@ class CartesianCommunicator { std::vector _processor_coor; // linear processor coordinate unsigned long _ndimension; -#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L) +#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) static MPI_Comm communicator_world; - MPI_Comm communicator; + + MPI_Comm communicator; + std::vector communicator_halo; + typedef MPI_Request CommsRequest_t; #else typedef int CommsRequest_t; #endif + //////////////////////////////////////////////////////////////////// // Helper functionality for SHM Windows common to all other impls //////////////////////////////////////////////////////////////////// @@ -117,11 +133,7 @@ class CartesianCommunicator { ///////////////////////////////// static void * ShmCommBuf; - // Isend/Irecv/Wait, or Sendrecv blocking - enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; - static CommunicatorPolicy_t CommunicatorPolicy; - static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } - + size_t heap_top; size_t heap_bytes; @@ -211,14 +223,21 @@ class CartesianCommunicator { void SendToRecvFromComplete(std::vector &waitall); + double StencilSendToRecvFrom(void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir); + double StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir); - void StencilSendToRecvFromComplete(std::vector &waitall); + + void StencilSendToRecvFromComplete(std::vector &waitall,int i); void StencilBarrier(void); //////////////////////////////////////////////////////////// diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index 4192300b..cb7fa390 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -41,9 +41,7 @@ Author: Peter Boyle #ifdef HAVE_NUMAIF_H #include #endif -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif + namespace Grid { @@ -213,13 +211,19 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); if ( fd < 0 ) { perror("failed shm_open"); assert(0); } ftruncate(fd, size); + + int mmap_flag = MAP_SHARED; +#ifdef MAP_HUGETLB + if (Hugepages) mmap_flag |= MAP_HUGETLB; +#endif + void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); - void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } assert(((uint64_t)ptr&0x3F)==0); - // Try to force numa domain on the shm segment if we have numaif.h -#ifdef HAVE_NUMAIF_H +// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h +#if 0 +//#ifdef HAVE_NUMAIF_H int status; int flags=MPOL_MF_MOVE; #ifdef KNL @@ -266,7 +270,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { for(int r=0;r &processors) { int ierr; communicator=communicator_world; + _ndimension = processors.size(); + communicator_halo.resize (2*_ndimension); + for(int i=0;i<_ndimension*2;i++){ + MPI_Comm_dup(communicator,&communicator_halo[i]); + } + //////////////////////////////////////////////////////////////// // Assert power of two shm_size. //////////////////////////////////////////////////////////////// @@ -621,13 +635,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis } } -double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) +double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) { + std::vector list; + double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); + StencilSendToRecvFromComplete(list,dir); + return offbytes; +} + +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes,int dir) +{ + assert(dir < communicator_halo.size()); + MPI_Request xrq; MPI_Request rrq; @@ -646,26 +674,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorStencilSendToRecvFromComplete(list); + this->StencilSendToRecvFromComplete(list,dir); } return off_node_bytes; } -void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &waitall,int dir) { SendToRecvFromComplete(waitall); } diff --git a/lib/communicator/Communicator_mpit.cc b/lib/communicator/Communicator_mpit.cc new file mode 100644 index 00000000..eb6ef87d --- /dev/null +++ b/lib/communicator/Communicator_mpit.cc @@ -0,0 +1,286 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/communicator/Communicator_mpi.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +namespace Grid { + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Info that is setup once and indept of cartesian layout +/////////////////////////////////////////////////////////////////////////////////////////////////// +MPI_Comm CartesianCommunicator::communicator_world; + +// Should error check all MPI calls. +void CartesianCommunicator::Init(int *argc, char ***argv) { + int flag; + int provided; + MPI_Initialized(&flag); // needed to coexist with other libs apparently + if ( !flag ) { + MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); + if ( provided != MPI_THREAD_MULTIPLE ) { + QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; + } + } + MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); + ShmInitGeneric(); +} + +CartesianCommunicator::CartesianCommunicator(const std::vector &processors) +{ + _ndimension = processors.size(); + std::vector periodic(_ndimension,1); + + _Nprocessors=1; + _processors = processors; + _processor_coor.resize(_ndimension); + + MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); + MPI_Comm_rank(communicator,&_processor); + MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); + + for(int i=0;i<_ndimension;i++){ + _Nprocessors*=_processors[i]; + } + + communicator_halo.resize (2*_ndimension); + for(int i=0;i<_ndimension*2;i++){ + MPI_Comm_dup(communicator,&communicator_halo[i]); + } + + int Size; + MPI_Comm_size(communicator,&Size); + + assert(Size==_Nprocessors); +} +void CartesianCommunicator::GlobalSum(uint32_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(uint64_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalXOR(uint32_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalXOR(uint64_t &u){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(float &f){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSumVector(float *f,int N) +{ + int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSum(double &d) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalSumVector(double *d,int N) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); + assert(ierr==0); +} +void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) +{ + int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); + assert(ierr==0); +} +int CartesianCommunicator::RankFromProcessorCoor(std::vector &coor) +{ + int rank; + int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); + assert(ierr==0); + return rank; +} +void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector &coor) +{ + coor.resize(_ndimension); + int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); + assert(ierr==0); +} + +// Basic Halo comms primitive +void CartesianCommunicator::SendToRecvFrom(void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ + std::vector reqs(0); + SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); + SendToRecvFromComplete(reqs); +} + +void CartesianCommunicator::SendRecvPacket(void *xmit, + void *recv, + int sender, + int receiver, + int bytes) +{ + MPI_Status stat; + assert(sender != receiver); + int tag = sender; + if ( _processor == sender ) { + MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); + } + if ( _processor == receiver ) { + MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); + } +} + +// Basic Halo comms primitive +void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, + void *xmit, + int dest, + void *recv, + int from, + int bytes) +{ + int myrank = _processor; + int ierr; + if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { + MPI_Request xrq; + MPI_Request rrq; + + ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); + ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + + assert(ierr==0); + list.push_back(xrq); + list.push_back(rrq); + } else { + // Give the CPU to MPI immediately; can use threads to overlap optionally + ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, + recv,bytes,MPI_CHAR,from, from, + communicator,MPI_STATUS_IGNORE); + assert(ierr==0); + } +} +void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +{ + if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { + int nreq=list.size(); + std::vector status(nreq); + int ierr = MPI_Waitall(nreq,&list[0],&status[0]); + assert(ierr==0); + } +} + +void CartesianCommunicator::Barrier(void) +{ + int ierr = MPI_Barrier(communicator); + assert(ierr==0); +} + +void CartesianCommunicator::Broadcast(int root,void* data, int bytes) +{ + int ierr=MPI_Bcast(data, + bytes, + MPI_BYTE, + root, + communicator); + assert(ierr==0); +} + /////////////////////////////////////////////////////// + // Should only be used prior to Grid Init finished. + // Check for this? + /////////////////////////////////////////////////////// +int CartesianCommunicator::RankWorld(void){ + int r; + MPI_Comm_rank(communicator_world,&r); + return r; +} +void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) +{ + int ierr= MPI_Bcast(data, + bytes, + MPI_BYTE, + root, + communicator_world); + assert(ierr==0); +} + +double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &list, + void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir) +{ + int myrank = _processor; + int ierr; + assert(dir < communicator_halo.size()); + + // std::cout << " sending on communicator "< &waitall,int dir) +{ + int nreq=waitall.size(); + MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE); +}; +double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, + int xmit_to_rank, + void *recv, + int recv_from_rank, + int bytes,int dir) +{ + int myrank = _processor; + int ierr; + assert(dir < communicator_halo.size()); + + // std::cout << " sending on communicator "< #include #endif -#ifdef GRID_COMMS_MPI3L +#ifdef GRID_COMMS_MPIT #include #endif diff --git a/lib/log/Log.cc b/lib/log/Log.cc index 69a9a0a8..65dc2812 100644 --- a/lib/log/Log.cc +++ b/lib/log/Log.cc @@ -95,7 +95,7 @@ void GridLogConfigure(std::vector &logstreams) { //////////////////////////////////////////////////////////// void Grid_quiesce_nodes(void) { int me = 0; -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) MPI_Comm_rank(MPI_COMM_WORLD, &me); #endif #ifdef GRID_COMMS_SHMEM diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index f56f6514..d14f3fe2 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -29,7 +29,7 @@ #ifndef GRID_BINARY_IO_H #define GRID_BINARY_IO_H -#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) +#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) #define USE_MPI_IO #else #undef USE_MPI_IO diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index 46ba3793..838b1c3d 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -414,7 +414,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(omega[i]!=Coeff_t(0.0)); bs[i] = 0.5*(bpc/omega[i] + bmc); cs[i] = 0.5*(bpc/omega[i] - bmc); } @@ -429,7 +429,7 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorM5) +1.0); - // assert(fabs(bee[i])>0.0); + assert(bee[i]!=Coeff_t(0.0)); cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); beo[i]=as[i]*bs[i]; ceo[i]=-as[i]*cs[i]; @@ -455,11 +455,17 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector::SetCoefficientsInternal(RealD zolo_hi,std::vector0.0); + assert(bee[j] != Coeff_t(0.0)); delta_d *= cee[j]/bee[j]; } dee[Ls-1] += delta_d; diff --git a/lib/qcd/action/fermion/WilsonCompressor.h b/lib/qcd/action/fermion/WilsonCompressor.h index 96cbe1ec..cc5c3c63 100644 --- a/lib/qcd/action/fermion/WilsonCompressor.h +++ b/lib/qcd/action/fermion/WilsonCompressor.h @@ -238,7 +238,33 @@ template using WilsonCompressor = WilsonCom template class WilsonStencil : public CartesianStencil { public: - + double timer0; + double timer1; + double timer2; + double timer3; + double timer4; + double timer5; + double timer6; + uint64_t callsi; + void ZeroCountersi(void) + { + timer0=0; + timer1=0; + timer2=0; + timer3=0; + timer4=0; + timer5=0; + timer6=0; + callsi=0; + } + void Reporti(int calls) + { + if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " < same_node; @@ -252,6 +278,7 @@ public: : CartesianStencil (grid,npoints,checkerboard,directions,distances) , same_node(npoints) { + ZeroCountersi(); surface_list.resize(0); }; @@ -261,7 +288,6 @@ public: // Here we know the distance is 1 for WilsonStencil for(int point=0;point_npoints;point++){ same_node[point] = this->SameNode(point); - // std::cout << " dir " < > reqs; this->HaloExchangeOptGather(source,compress); - this->CommunicateBegin(reqs); - this->CommunicateComplete(reqs); + double t1=usecond(); + // Asynchronous MPI calls multidirectional, Isend etc... + // this->CommunicateBegin(reqs); + // this->CommunicateComplete(reqs); + // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways. + this->Communicate(); + double t2=usecond(); timer1 += t2-t1; this->CommsMerge(compress); + double t3=usecond(); timer2 += t3-t2; this->CommsMergeSHM(compress); + double t4=usecond(); timer3 += t4-t3; } template void HaloExchangeOptGather(const Lattice &source,compressor &compress) { this->Prepare(); + double t0=usecond(); this->HaloGatherOpt(source,compress); + double t1=usecond(); + timer0 += t1-t0; + callsi++; } template @@ -304,7 +341,9 @@ public: typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; + this->mpi3synctime_g-=usecond(); this->_grid->StencilBarrier(); + this->mpi3synctime_g+=usecond(); assert(source._grid==this->_grid); this->halogtime-=usecond(); @@ -323,7 +362,6 @@ public: int dag = compress.dag; int face_idx=0; if ( dag ) { - // std::cout << " Optimised Dagger compress " <HaloGatherDir(source,XpCompress,Xp,face_idx)); assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 5ddfde9a..19a37c34 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -124,22 +124,24 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, int vol4; vol4=FourDimGrid.oSites(); Stencil.BuildSurfaceList(LLs,vol4); + vol4=FourDimRedBlackGrid.oSites(); StencilEven.BuildSurfaceList(LLs,vol4); StencilOdd.BuildSurfaceList(LLs,vol4); - std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() - <<" " << StencilEven.surface_list.size()< void WilsonFermion5D::Report(void) { - std::vector latt = GridDefaultLatt(); - RealD volume = Ls; for(int mu=0;mu_Nprocessors; - RealD NN = _FourDimGrid->NodeCount(); + RealD NP = _FourDimGrid->_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + RealD volume = Ls; + std::vector latt = _FourDimGrid->GlobalDimensions(); + for(int mu=0;mu 0 ) { std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; @@ -185,6 +187,11 @@ void WilsonFermion5D::Report(void) std::cout << GridLogMessage << "WilsonFermion5D StencilEven"< 0){ + std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" < @@ -204,6 +211,9 @@ void WilsonFermion5D::ZeroCounters(void) { Stencil.ZeroCounters(); StencilEven.ZeroCounters(); StencilOdd.ZeroCounters(); + Stencil.ZeroCountersi(); + StencilEven.ZeroCountersi(); + StencilOdd.ZeroCountersi(); } @@ -380,7 +390,6 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg { #ifdef GRID_OMP // assert((dag==DaggerNo) ||(dag==DaggerYes)); - typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; Compressor compressor(dag); @@ -389,46 +398,70 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg DhopFaceTime-=usecond(); st.HaloExchangeOptGather(in,compressor); - DhopFaceTime+=usecond(); - std::vector > reqs; - - // Rely on async comms; start comms before merge of local data - DhopCommTime-=usecond(); - st.CommunicateBegin(reqs); - - DhopFaceTime-=usecond(); - st.CommsMergeSHM(compressor); + st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms DhopFaceTime+=usecond(); - // Perhaps use omp task and region -#pragma omp parallel + double ctime=0; + double ptime=0; + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ugly explicit thread mapping introduced for OPA reasons. + ////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma omp parallel reduction(max:ctime) reduction(max:ptime) { + int tid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); - int myoff, mywork; - - GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1); - int sF = LLs * myoff; - - if ( me == 0 ) { - st.CommunicateComplete(reqs); - DhopCommTime+=usecond(); - } else { - // Interior links in stencil - if ( me==1 ) DhopComputeTime-=usecond(); - if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); - else Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0); - if ( me==1 ) DhopComputeTime+=usecond(); + int ncomms = CartesianCommunicator::nCommThreads; + if (ncomms == -1) ncomms = 1; + assert(nthreads > ncomms); + if (tid >= ncomms) { + double start = usecond(); + nthreads -= ncomms; + int ttid = tid - ncomms; + int n = U._grid->oSites(); + int chunk = n / nthreads; + int rem = n % nthreads; + int myblock, myn; + if (ttid < rem) { + myblock = ttid * chunk + ttid; + myn = chunk+1; + } else { + myblock = ttid*chunk + rem; + myn = chunk; + } + + // do the compute + if (dag == DaggerYes) { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } else { + for (int ss = myblock; ss < myblock+myn; ++ss) { + int sU = ss; + int sF = LLs * sU; + Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0); + } + } + ptime = usecond() - start; + } + { + double start = usecond(); + st.CommunicateThreaded(); + ctime = usecond() - start; } } + DhopCommTime += ctime; + DhopComputeTime+=ptime; + + // First to enter, last to leave timing + st.CollateThreads(); DhopFaceTime-=usecond(); st.CommsMerge(compressor); DhopFaceTime+=usecond(); - // Load imbalance alert. Should use dynamic schedule OMP for loop - // Perhaps create a list of only those sites with face work, and - // load balance process the list. DhopComputeTime2-=usecond(); if (dag == DaggerYes) { int sz=st.surface_list.size(); @@ -449,11 +482,9 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg #else assert(0); #endif - } - template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, diff --git a/lib/stencil/Stencil.h b/lib/stencil/Stencil.h index 2894778a..cd0792d5 100644 --- a/lib/stencil/Stencil.h +++ b/lib/stencil/Stencil.h @@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal // Timing info; ugly; possibly temporary ///////////////////////////////////////// double commtime; + double mpi3synctime; + double mpi3synctime_g; + double shmmergetime; double gathertime; double gathermtime; double halogtime; @@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal double splicetime; double nosplicetime; double calls; + std::vector comm_bytes_thr; + std::vector comm_time_thr; + std::vector comm_enter_thr; + std::vector comm_leave_thr; //////////////////////////////////////// // Stencil query @@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal ////////////////////////////////////////// // Comms packet queue for asynch thread ////////////////////////////////////////// + void CommunicateThreaded() + { +#ifdef GRID_OMP + // must be called in parallel region + int mythread = omp_get_thread_num(); + int nthreads = CartesianCommunicator::nCommThreads; +#else + int mythread = 0; + int nthreads = 1; +#endif + if (nthreads == -1) nthreads = 1; + if (mythread < nthreads) { + comm_enter_thr[mythread] = usecond(); + for (int i = mythread; i < Packets.size(); i += nthreads) { + uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + comm_bytes_thr[mythread] += bytes; + } + comm_leave_thr[mythread]= usecond(); + comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; + } + } + + void CollateThreads(void) + { + int nthreads = CartesianCommunicator::nCommThreads; + double first=0.0; + double last =0.0; + + for(int t=0;t 0.0) && ( t0 < first ) ) first = t0; // min time seen + + if ( t1 > last ) last = t1; // max time seen + + } + commtime+= last-first; + } void CommunicateBegin(std::vector > &reqs) { reqs.resize(Packets.size()); commtime-=usecond(); for(int i=0;iStencilSendToRecvFromBegin(reqs[i], - Packets[i].send_buf, - Packets[i].to_rank, - Packets[i].recv_buf, - Packets[i].from_rank, - Packets[i].bytes); + Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); } } void CommunicateComplete(std::vector > &reqs) { for(int i=0;iStencilSendToRecvFromComplete(reqs[i]); + _grid->StencilSendToRecvFromComplete(reqs[i],i); } commtime+=usecond(); } + void Communicate(void) + { +#ifdef GRID_OMP +#pragma omp parallel + { + // must be called in parallel region + int mythread = omp_get_thread_num(); + int maxthreads= omp_get_max_threads(); + int nthreads = CartesianCommunicator::nCommThreads; + assert(nthreads <= maxthreads); + + if (nthreads == -1) nthreads = 1; +#else + int mythread = 0; + int nthreads = 1; +#endif + if (mythread < nthreads) { + for (int i = mythread; i < Packets.size(); i += nthreads) { + double start = usecond(); + comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes,i); + comm_time_thr[mythread] += usecond() - start; + } + } +#ifdef GRID_OMP + } +#endif + } template void HaloExchange(const Lattice &source,compressor &compress) { std::vector > reqs; Prepare(); HaloGather(source,compress); - CommunicateBegin(reqs); - CommunicateComplete(reqs); + // Concurrent + //CommunicateBegin(reqs); + //CommunicateComplete(reqs); + // Sequential, possibly threaded + Communicate(); CommsMergeSHM(compress); CommsMerge(compress); } @@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal template void HaloGather(const Lattice &source,compressor &compress) { + mpi3synctime_g-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes + mpi3synctime_g+=usecond(); // conformable(source._grid,_grid); assert(source._grid==_grid); @@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal CommsMerge(decompress,Mergers,Decompressions); } template void CommsMergeSHM(decompressor decompress) { + mpi3synctime-=usecond(); _grid->StencilBarrier();// Synch shared memory on a single nodes + mpi3synctime+=usecond(); + shmmergetime-=usecond(); CommsMerge(decompress,MergersSHM,DecompressionsSHM); + shmmergetime+=usecond(); } template @@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal int checkerboard, const std::vector &directions, const std::vector &distances) - : _permute_type(npoints), _comm_buf_size(npoints) + : _permute_type(npoints), + _comm_buf_size(npoints), + comm_bytes_thr(npoints), + comm_enter_thr(npoints), + comm_leave_thr(npoints), + comm_time_thr(npoints) { face_table_computed=0; _npoints = npoints; @@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal void ZeroCounters(void) { gathertime = 0.; commtime = 0.; + mpi3synctime=0.; + mpi3synctime_g=0.; + shmmergetime=0.; + for(int i=0;i<_npoints;i++){ + comm_time_thr[i]=0; + comm_bytes_thr[i]=0; + comm_enter_thr[i]=0; + comm_leave_thr[i]=0; + } halogtime = 0.; mergetime = 0.; decompresstime = 0.; @@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; RealD NN = _grid->NodeCount(); + double t = 0; + // if comm_time_thr is set they were all done in parallel so take the max + // but add up the bytes + int threaded = 0 ; + for (int i = 0; i < 8; ++i) { + if ( comm_time_thr[i]>0.0 ) { + threaded = 1; + comms_bytes += comm_bytes_thr[i]; + if (t < comm_time_thr[i]) t = comm_time_thr[i]; + } + } + if (threaded) commtime += t; _grid->GlobalSum(commtime); commtime/=NP; if ( calls > 0. ) { @@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<