diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 742c99da..309517b2 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -29,8 +29,27 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -extern Vector > Cshift_table; +extern std::vector > Cshift_table; +extern commVector > Cshift_table_device; +inline std::pair *MapCshiftTable(void) +{ + // GPU version +#ifdef ACCELERATOR_CSHIFT + uint64_t sz=Cshift_table.size(); + if (Cshift_table_device.size()!=sz ) { + Cshift_table_device.resize(sz); + } + acceleratorCopyToDevice((void *)&Cshift_table[0], + (void *)&Cshift_table_device[0], + sizeof(Cshift_table[0])*sz); + + return &Cshift_table_device[0]; +#else + return &Cshift_table[0]; +#endif + // CPU version use identify map +} /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// @@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dim } { auto buffer_p = & buffer[0]; - auto table = &Cshift_table[0]; -#ifdef ACCELERATOR_CSHIFT + auto table = MapCshiftTable(); +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); @@ -225,7 +244,7 @@ template void Scatter_plane_simple (Lattice &rhs,cshiftVector< { auto buffer_p = & buffer[0]; - auto table = &Cshift_table[0]; + auto table = MapCshiftTable(); #ifdef ACCELERATOR_CSHIFT autoView( rhs_v, rhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ @@ -297,30 +316,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA } } -#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) - -template -T iDivUp(T a, T b) // Round a / b to nearest higher integer value -{ return (a % b != 0) ? (a / b + 1) : (a / b); } - -template -__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) -{ - int idx = blockIdx.x*blockDim.x + threadIdx.x; - if (idx >= e1*e2) return; - - int n, b, o; - - n = idx / e2; - b = idx % e2; - o = n*stride + b; - - vector[2*idx + 0] = lo + o; - vector[2*idx + 1] = ro + o; -} - -#endif - ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -345,20 +340,12 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ -#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) - ent = e1*e2; - dim3 blockSize(acceleratorThreads()); - dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); - populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); - accelerator_barrier(); -#else for(int n=0;n(lo+o,ro+o); } } -#endif } else { for(int n=0;n void Copy_plane(Lattice& lhs,const Lattice &rhs } { - auto table = &Cshift_table[0]; + auto table = MapCshiftTable(); #ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); @@ -409,19 +396,11 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); - accelerator_barrier(); -#else for(int n=0;n(lo+o+b,ro+o+b); }} -#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice Lattice Cshift(const Lattice &rhs,int dimension int comm_dim = rhs.Grid()->_processors[dimension] >1 ; int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); - + RealD t1,t0; + t0=usecond(); if ( !comm_dim ) { //std::cout << "CSHIFT: Cshift_local" < Lattice Cshift(const Lattice &rhs,int dimension //std::cout << "CSHIFT: Cshift_comms" < void Cshift_comms(Lattice &ret,const Lattice &r int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + RealD tcopy=0.0; + RealD tgather=0.0; + RealD tscatter=0.0; + RealD tcomms=0.0; + uint64_t xbytes=0; for(int x=0;x void Cshift_comms(Lattice &ret,const Lattice &r int bytes = words * sizeof(vobj); + tgather-=usecond(); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); + tgather+=usecond(); // int rank = grid->_processor; int recv_from_rank; int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - - grid->Barrier(); + + tcomms-=usecond(); + // grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + xbytes+=bytes; + // grid->Barrier(); + tcomms+=usecond(); - grid->Barrier(); - + tscatter-=usecond(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); + tscatter+=usecond(); } } + /* + std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -190,6 +210,12 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice=0); assert(shiftPermuteType(dimension); /////////////////////////////////////////////// @@ -227,7 +253,9 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0]; @@ -262,7 +291,9 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + xbytes+=bytes; + // grid->Barrier(); + tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -270,9 +301,17 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -292,6 +331,11 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(comm_dim==1); assert(shift>=0); assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); @@ -315,7 +359,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r if (comm_proc==0) { + tcopy-=usecond(); Copy_plane(ret,rhs,dimension,x,sx,cbmask); + tcopy+=usecond(); } else { @@ -324,7 +370,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int bytes = words * sizeof(vobj); + tgather-=usecond(); Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + tgather+=usecond(); // int rank = grid->_processor; int recv_from_rank; @@ -332,7 +380,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); grid->SendToRecvFrom((void *)&send_buf[0], @@ -340,13 +389,24 @@ template void Cshift_comms(Lattice &ret,const Lattice &r (void *)&recv_buf[0], recv_from_rank, bytes); + xbytes+=bytes; acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); - grid->Barrier(); + // grid->Barrier(); + tcomms+=usecond(); + tscatter-=usecond(); Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + tscatter+=usecond(); } } + /* + std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -372,6 +432,11 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice=0); assert(shiftPermuteType(dimension); @@ -414,8 +479,10 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); grid->SendToRecvFrom((void *)send_buf_extract_mpi, @@ -449,17 +517,28 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + // grid->Barrier(); + tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; } else { rpointers[i] = &send_buf_extract[nbr_lane][0]; } } + tscatter-=usecond(); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); - } + tscatter+=usecond(); + } + /* + std::cout << GridLogPerformance << " Cshift (s) copy "< NAMESPACE_BEGIN(Grid); -Vector > Cshift_table; +std::vector > Cshift_table; +commVector > Cshift_table_device; NAMESPACE_END(Grid);