From bd600702cf2a0e5bd8335c1192cbe480dc76a7af Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 15 Feb 2017 11:11:04 +0000 Subject: [PATCH] Vectorise the XYZT face gathering better. Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent with efficiency. --- lib/Init.cc | 4 +- lib/Stencil.h | 161 +++++++++++++++++++++++++-- lib/communicator/Communicator_mpi.cc | 6 +- lib/cshift/Cshift_common.h | 58 ++++++++++ lib/simd/Grid_avx.h | 40 ++++++- lib/simd/Grid_avx512.h | 40 +++++++ lib/simd/Grid_sse4.h | 36 ++++++ lib/simd/Grid_vector_types.h | 36 +++--- lib/tensors/Tensor_class.h | 20 ++++ tests/Test_simd.cc | 143 +++++++++++++++++++++++- 10 files changed, 510 insertions(+), 34 deletions(-) diff --git a/lib/Init.cc b/lib/Init.cc index 34dc1720..aeab5835 100644 --- a/lib/Init.cc +++ b/lib/Init.cc @@ -338,9 +338,9 @@ void Grid_init(int *argc,char ***argv) QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric; } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ - WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute; + QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute; } else { - WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute; + QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; } if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){ CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend); diff --git a/lib/Stencil.h b/lib/Stencil.h index 71f086af..8b5eac2d 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -184,14 +184,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal struct Merge { cobj * mpointer; std::vector rpointers; + std::vector vpointers; Integer buffer_size; Integer packet_id; + Integer exchange; + Integer type; }; std::vector Mergers; void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { Merge m; + m.exchange = 0; m.mpointer = merge_p; m.rpointers= rpointers; m.buffer_size = buffer_size; @@ -199,6 +203,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal Mergers.push_back(m); } + void AddMergeNew(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id,Integer type) { + Merge m; + m.exchange = 1; + m.type = type; + m.mpointer = merge_p; + m.vpointers= rpointers; + m.buffer_size = buffer_size; + m.packet_id = packet_id; + Mergers.push_back(m); + } + void CommsMerge(void ) { for(int i=0;i u_simd_send_buf; std::vector u_simd_recv_buf; + std::vector new_simd_send_buf; + std::vector new_simd_recv_buf; int u_comm_offset; int _unified_buffer_size; @@ -432,12 +457,15 @@ PARALLEL_FOR_LOOP u_simd_send_buf.resize(Nsimd); u_simd_recv_buf.resize(Nsimd); - + new_simd_send_buf.resize(Nsimd); + new_simd_recv_buf.resize(Nsimd); u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); for(int l=0;lShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); + new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); + new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); } PrecomputeByteOffsets(); @@ -675,7 +703,7 @@ PARALLEL_FOR_LOOP HaloGather(source,compress); this->CommunicateBegin(reqs); this->CommunicateComplete(reqs); - CommsMerge(); // spins + CommsMerge(); } template void HaloGatherDir(const Lattice &source,compressor &compress,int point,int & face_idx) @@ -706,7 +734,9 @@ PARALLEL_FOR_LOOP if ( sshift[0] == sshift[1] ) { if (splice_dim) { splicetime-=usecond(); - GatherSimd(source,dimension,shift,0x3,compress,face_idx); + // GatherSimd(source,dimension,shift,0x3,compress,face_idx); + // std::cout << "GatherSimdNew"< + void GatherSimdNew(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) + { + const int Nsimd = _grid->Nsimd(); + + const int maxl =2;// max layout in a direction + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int pd = _grid->_processors[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + assert(comm_dim==1); + // This will not work with a rotate dim + assert(simd_layout==maxl); + assert(shift>=0); + assert(shiftPermuteType(dimension); + // std::cout << "SimdNew permute type "<_slice_nblock[dimension]*_grid->_slice_block[dimension]; + int words = sizeof(cobj)/sizeof(vector_type); + + assert(cbmask==0x3); // Fixme think there is a latent bug if not true + + int bytes = (buffer_size*sizeof(cobj))/simd_layout; + assert(bytes*simd_layout == buffer_size*sizeof(cobj)); + + std::vector rpointers(maxl); + std::vector spointers(maxl); + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? Odd : Even; + int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x= rd ); + + if ( any_offnode ) { + + for(int i=0;iShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + // shm == receive pointer if offnode + // shm == Translate[send pointer] if on node -- my view of his send pointer + cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); + if (shm==NULL) { + shm = rp; + } + + // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node + // assuming above pointer flip + AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); + + rpointers[i] = shm; + + } else { + + rpointers[i] = sp; + + } + } + + AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type); + + u_comm_offset +=buffer_size; + } + } + } + }; } diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 61126a17..2033a446 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -42,10 +42,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { int provided; MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { - // MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided); - // assert (provided == MPI_THREAD_SERIALIZED); MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); - assert (provided == MPI_THREAD_MULTIPLE); + if ( provided != MPI_THREAD_MULTIPLE ) { + QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; + } } MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); ShmInitGeneric(); diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 813929d8..c01187de 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -103,6 +103,7 @@ Gather_plane_extract(const Lattice &rhs,std::vector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; int n1=rhs._grid->_slice_stride[dimension]; + if ( cbmask ==0x3){ PARALLEL_NESTED_LOOP2 for(int n=0;n(temp,pointers,offset); @@ -137,6 +139,62 @@ PARALLEL_NESTED_LOOP2 } } +/////////////////////////////////////////////////////////////////// +// Gather for when there *is* need to SIMD split with compression +/////////////////////////////////////////////////////////////////// +template void +Gather_plane_exchange(const Lattice &rhs, + std::vector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) +{ + int rd = rhs._grid->_rdimensions[dimension]; + + if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask = 0x3; + } + + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + + int e1=rhs._grid->_slice_nblock[dimension]; + int e2=rhs._grid->_slice_block[dimension]; + int n1=rhs._grid->_slice_stride[dimension]; + + // Need to switch to a table loop + std::vector > table; + + if ( cbmask ==0x3){ + for(int n=0;n (offset,o+b)); + } + } + } else { + // Case of SIMD split AND checker dim cannot currently be hit, except in + // Test_cshift_red_black code. + for(int n=0;nCheckerBoardFromOindex(o+b); + int offset = b+n*e2; + + if ( ocb & cbmask ) { + table.push_back(std::pair (offset,o+b)); + } + } + } + } + + assert( (table.size()&0x1)==0); +PARALLEL_FOR_LOOP + for(int j=0;j Ah Bh, Al Bl + // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl + // The operation is its own inverse + struct Exchange{ + // 3210 ordering + static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + }; + static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){ + out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + }; + + static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + }; + static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + out1 = _mm512_shuffle_pd(in1,in2,0x00); + out2 = _mm512_shuffle_pd(in1,in2,0xFF); + }; + static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){ + assert(0); + return; + }; + }; + struct Rotate{ diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 943756b2..fcad4c28 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -326,7 +326,43 @@ namespace Optimization { static inline __m128d Permute3(__m128d in){ return in; }; + }; + struct Exchange{ + // 3210 ordering + static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); + }; + static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + }; + static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + assert(0); + return; + }; + static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){ + assert(0); + return; + }; + + static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + out1= _mm_shuffle_pd(in1,in2,0x0); + out2= _mm_shuffle_pd(in1,in2,0x3); + }; + static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + assert(0); + return; + }; + static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + assert(0); + return; + }; + static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){ + assert(0); + return; + }; }; struct Rotate{ diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 8a6ab2e7..cd499d88 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -350,6 +350,18 @@ class Grid_simd { return ret; } + /////////////////////// + // Exchange + // Al Ah , Bl Bh -> Al Bl Ah,Bh + /////////////////////// + friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n) + { + if (n==3) Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v); + else if(n==2) Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v); + else if(n==1) Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v); + else if(n==0) Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v); + } + //////////////////////////////////////////////////////////////////// // General permute; assumes vector length is same across // all subtypes; may not be a good assumption, but could @@ -372,23 +384,11 @@ class Grid_simd { int dist = perm & 0xF; y = rotate(b, dist); return; - } - switch (perm) { - case 3: - permute3(y, b); - break; - case 2: - permute2(y, b); - break; - case 1: - permute1(y, b); - break; - case 0: - permute0(y, b); - break; - default: - assert(0); - } + } + else if(perm==3) permute3(y, b); + else if(perm==2) permute2(y, b); + else if(perm==1) permute1(y, b); + else if(perm==0) permute0(y, b); } }; // end of Grid_simd class definition @@ -444,6 +444,8 @@ inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ ret.v = unary(real(typepun[lane]), VsplatSIMD()); } + + /////////////////////// // Splat /////////////////////// diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h index 473dd6b1..e0b69eb0 100644 --- a/lib/tensors/Tensor_class.h +++ b/lib/tensors/Tensor_class.h @@ -105,6 +105,11 @@ class iScalar { friend strong_inline void rotate(iScalar &out,const iScalar &in,int rot){ rotate(out._internal,in._internal,rot); } + friend strong_inline void exchange(iScalar &out1,iScalar &out2, + const iScalar &in1,const iScalar &in2,int type){ + exchange(out1._internal,out2._internal, + in1._internal, in2._internal,type); + } // Unary negation friend strong_inline iScalar operator-(const iScalar &r) { @@ -248,6 +253,13 @@ class iVector { rotate(out._internal[i],in._internal[i],rot); } } + friend strong_inline void exchange(iVector &out1,iVector &out2, + const iVector &in1,const iVector &in2,int type){ + for(int i=0;i operator-(const iVector &r) { @@ -374,6 +386,14 @@ class iMatrix { rotate(out._internal[i][j],in._internal[i][j],rot); }} } + friend strong_inline void exchange(iMatrix &out1,iMatrix &out2, + const iMatrix &in1,const iMatrix &in2,int type){ + for(int i=0;i operator-(const iMatrix &r) { diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 92f9bcd8..d840140e 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -113,8 +113,6 @@ public: // outerproduct, // zeroit // permute - - class funcReduce { public: funcReduce() {}; @@ -168,7 +166,7 @@ void Tester(const functor &func) int ok=0; for(int i=0;i1.0e-7){ + if ( abs(reference[i]-result[i])>1.0e-6){ std::cout< void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);} + template void apply(std::vector &r1,std::vector &r2,std::vector &in1,std::vector &in2) const { + int sz=in1.size(); + + + int msk = sz>>(n+1); + + int j1=0; + int j2=0; + for(int i=0;i +void ExchangeTester(const functor &func) +{ + GridSerialRNG sRNG; + sRNG.SeedRandomDevice(); + + int Nsimd = vec::Nsimd(); + + std::vector input1(Nsimd); + std::vector input2(Nsimd); + std::vector result1(Nsimd); + std::vector result2(Nsimd); + std::vector reference1(Nsimd); + std::vector reference2(Nsimd); + std::vector test1(Nsimd); + std::vector test2(Nsimd); + + std::vector > buf(6); + vec & v_input1 = buf[0]; + vec & v_input2 = buf[1]; + vec & v_result1 = buf[2]; + vec & v_result2 = buf[3]; + vec & v_test1 = buf[4]; + vec & v_test2 = buf[5]; + + for(int i=0;i(v_input1,input1); + merge(v_input2,input2); + merge(v_result1,result1); + merge(v_result2,result1); + + func(v_result1,v_result2,v_input1,v_input2); + func.apply(reference1,reference2,input1,input2); + + func(v_test1,v_test2,v_result1,v_result2); + + extract(v_result1,result1); + extract(v_result2,result2); + extract(v_test1,test1); + extract(v_test2,test2); + + std::cout<(funcPermute(i)); } + std::cout<(funcExchange(i)); + } + std::cout<(funcPermute(i)); } + std::cout<(funcExchange(i)); + } + std::cout<(funcPermute(i)); } + + std::cout<(funcExchange(i)); + } + + std::cout<(funcExchange(i)); + } + + std::cout<