diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 90052051..710792ee 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -68,7 +68,7 @@ template Lattice Cshift(const Lattice &rhs,int dimension if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"< void Cshift_comms(Lattice& ret,const Lattice &rhs,int dimension,int shift) { int sshift[2]; @@ -125,7 +125,11 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; static deviceVector send_buf; send_buf.resize(buffer_size); static deviceVector recv_buf; recv_buf.resize(buffer_size); - +#ifndef ACCELERATOR_AWARE_MPI + static hostVector hsend_buf; hsend_buf.resize(buffer_size); + static hostVector hrecv_buf; hrecv_buf.resize(buffer_size); +#endif + int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); RealD tcopy=0.0; @@ -156,16 +160,29 @@ template void Cshift_comms(Lattice &ret,const Lattice &r // int rank = grid->_processor; int recv_from_rank; int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); grid->Barrier(); +#ifdef ACCELERATOR_AWARE_MPI grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); +#else + // bouncy bouncy + acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes); + grid->SendToRecvFrom((void *)&hsend_buf[0], + xmit_to_rank, + (void *)&hrecv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); +#endif + xbytes+=bytes; grid->Barrier(); tcomms+=usecond(); @@ -226,12 +243,17 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; - + + for(int s=0;s hsend_buf; hsend_buf.resize(buffer_size); + hostVector hrecv_buf; hrecv_buf.resize(buffer_size); +#endif + int bytes = buffer_size*sizeof(scalar_object); ExtractPointerArray pointers(Nsimd); // @@ -283,11 +305,22 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeSendToRecvFrom((void *)send_buf_extract_mpi, xmit_to_rank, (void *)recv_buf_extract_mpi, recv_from_rank, bytes); +#else + // bouncy bouncy + acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes); + grid->SendToRecvFrom((void *)&hsend_buf[0], + xmit_to_rank, + (void *)&hrecv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes); +#endif xbytes+=bytes; grid->Barrier(); @@ -311,234 +344,6 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) -{ - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_type scalar_type; - - GridBase *grid=rhs.Grid(); - Lattice temp(rhs.Grid()); - - int fd = rhs.Grid()->_fdimensions[dimension]; - int rd = rhs.Grid()->_rdimensions[dimension]; - int pd = rhs.Grid()->_processors[dimension]; - int simd_layout = rhs.Grid()->_simd_layout[dimension]; - int comm_dim = rhs.Grid()->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); - static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); - vobj *send_buf; - vobj *recv_buf; - { - grid->ShmBufferFreeAll(); - size_t bytes = buffer_size*sizeof(vobj); - send_buf=(vobj *)grid->ShmBufferMalloc(bytes); - recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); - } - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(vobj); - - tgather-=usecond(); - Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); - tgather+=usecond(); - - // int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); - grid->SendToRecvFrom((void *)&send_buf[0], - xmit_to_rank, - (void *)&recv_buf[0], - recv_from_rank, - bytes); - xbytes+=bytes; - acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); - - // grid->Barrier(); - tcomms+=usecond(); - - tscatter-=usecond(); - Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); - tscatter+=usecond(); - } - } - if(Cshift_verbose){ - std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) -{ - GridBase *grid=rhs.Grid(); - const int Nsimd = grid->Nsimd(); - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; - - int fd = grid->_fdimensions[dimension]; - int rd = grid->_rdimensions[dimension]; - int ld = grid->_ldimensions[dimension]; - int pd = grid->_processors[dimension]; - int simd_layout = grid->_simd_layout[dimension]; - int comm_dim = grid->_processors[dimension] >1 ; - - //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; - // int words = sizeof(vobj)/sizeof(vector_type); - - static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); - static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); - scalar_object * recv_buf_extract_mpi; - scalar_object * send_buf_extract_mpi; - { - size_t bytes = sizeof(scalar_object)*buffer_size; - grid->ShmBufferFreeAll(); - send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - } - for(int s=0;s pointers(Nsimd); // - ExtractPointerArray rpointers(Nsimd); // received pointers - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - int cb = (cbmask==0x2)? Odd : Even; - int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer - int nbr_ox = (nbr_coor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - int recv_from_rank; - int xmit_to_rank; - - if (nbr_ic) nbr_lane|=inner_bit; - - assert (sx == nbr_ox); - - if(nbr_proc){ - grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); - grid->SendToRecvFrom((void *)send_buf_extract_mpi, - xmit_to_rank, - (void *)recv_buf_extract_mpi, - recv_from_rank, - bytes); - acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); - xbytes+=bytes; - - // grid->Barrier(); - tcomms+=usecond(); - rpointers[i] = &recv_buf_extract[i][0]; - } else { - rpointers[i] = &send_buf_extract[nbr_lane][0]; - } - - } - tscatter-=usecond(); - Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); - tscatter+=usecond(); - - } - if(Cshift_verbose){ - std::cout << GridLogPerformance << " Cshift (s) copy "<