From 0b7d389258ef83847a1bad9f9a5d2fabab063336 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 27 Apr 2015 13:45:07 +0100 Subject: [PATCH] Reworking CSHIFT and Stencil. Implementing Wilson and discovered rework is required --- TODO | 4 + lib/Grid_config.h | 4 +- lib/Grid_stencil.h | 156 ++++++++++++---------------- lib/cartesian/Grid_cartesian_base.h | 6 +- lib/cshift/Grid_cshift_common.h | 4 +- lib/cshift/Grid_cshift_mpi.h | 132 ++++++++--------------- lib/qcd/Grid_qcd_wilson_dop.cc | 20 +++- tests/Grid_cshift.cc | 9 +- tests/Grid_simd.cc | 2 - tests/Grid_stencil.cc | 2 +- tests/Grid_wilson.cc | 41 ++++++-- 11 files changed, 176 insertions(+), 204 deletions(-) diff --git a/TODO b/TODO index 98dfc3b8..ef3940ea 100644 --- a/TODO +++ b/TODO @@ -2,6 +2,10 @@ - use protocol buffers? replace xmlReader/Writer ec.. - Binary use htonll, htonl +* Stencil -- do the permute for locally permuted in halo exchange. + - BUG cshift mpi; the "s" indexing is weird in the Cshift_comms_simd + as simd_layout or not is confusing + * Reduce implemention is poor * Bug in SeedFixedIntegers gives same output on each site. * Bug in RNG with complex numbers ; only filling real values; need helper function -- DONE diff --git a/lib/Grid_config.h b/lib/Grid_config.h index f3d7016d..4152540e 100644 --- a/lib/Grid_config.h +++ b/lib/Grid_config.h @@ -2,7 +2,7 @@ /* lib/Grid_config.h.in. Generated from configure.ac by autoheader. */ /* AVX */ -/* #undef AVX1 */ +#define AVX1 1 /* AVX2 */ /* #undef AVX2 */ @@ -77,7 +77,7 @@ #define PACKAGE_VERSION "1.0" /* SSE4 */ -#define SSE4 1 +/* #undef SSE4 */ /* Define to 1 if you have the ANSI C header files. */ #define STDC_HEADERS 1 diff --git a/lib/Grid_stencil.h b/lib/Grid_stencil.h index 83db0a80..afb11db6 100644 --- a/lib/Grid_stencil.h +++ b/lib/Grid_stencil.h @@ -52,7 +52,7 @@ namespace Grid { // Gather for when there is no need to SIMD split with compression /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (Lattice &rhs,std::vector > &buffer,int dimension,int plane,int cbmask,compressor &compress) +Gather_plane_simple (const Lattice &rhs,std::vector > &buffer,int dimension,int plane,int cbmask,compressor &compress) { int rd = rhs._grid->_rdimensions[dimension]; @@ -97,7 +97,7 @@ Gather_plane_simple (Lattice &rhs,std::vector // Gather for when there *is* need to SIMD split with compression /////////////////////////////////////////////////////////////////// template void - Gather_plane_extract(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask,compressor &compress) +Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask,compressor &compress) { int rd = rhs._grid->_rdimensions[dimension]; @@ -182,7 +182,7 @@ template void // Could allow a functional munging of the halo to another type during the comms. // this could implement the 16bit/32bit/64bit compression. template void - HaloExchange(Lattice &source,std::vector > &u_comm_buf,compressor &compress) + HaloExchange(const Lattice &source,std::vector > &u_comm_buf,compressor &compress) { // conformable(source._grid,_grid); assert(source._grid==_grid); @@ -237,7 +237,7 @@ template void } template - void GatherStartComms(Lattice &rhs,int dimension,int shift,int cbmask, + void GatherStartComms(const Lattice &rhs,int dimension,int shift,int cbmask, std::vector > &u_comm_buf, int &u_comm_offset,compressor & compress) { @@ -250,6 +250,7 @@ template void int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; assert(simd_layout==1); @@ -267,11 +268,10 @@ template void for(int x=0;x= rd ); int sx = (x+sshift)%rd; - int comm_proc = (x+sshift)/rd; - - if (offnode) { + int comm_proc = ((x+sshift)/rd)%pd; + + if (comm_proc) { int words = send_buf.size(); if (cbmask != 0x3) words=words>>1; @@ -284,7 +284,8 @@ template void int recv_from_rank; int xmit_to_rank; _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - + assert (xmit_to_rank != _grid->ThisRank()); + assert (recv_from_rank != _grid->ThisRank()); // FIXME Implement asynchronous send & also avoid buffer copy _grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, @@ -293,7 +294,11 @@ template void bytes); printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout); - printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout); + printf("GatherStartComms inserting %le to u_comm_offset %d buf size %d for dim %d shift %d\n", + *( (RealF *) &recv_buf[0]), + u_comm_offset,buffer_size, + dimension,shift + ); fflush(stdout); for(int i=0;i void template - void GatherStartCommsSimd(Lattice &rhs,int dimension,int shift,int cbmask, + void GatherStartCommsSimd(const Lattice &rhs,int dimension,int shift,int cbmask, std::vector > &u_comm_buf, int &u_comm_offset,compressor &compress) { const int Nsimd = _grid->Nsimd(); + typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; + int pd = _grid->_processors[dimension]; int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; @@ -346,89 +353,62 @@ template void int cb = (cbmask==0x2)? 1 : 0; int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb); - - std::vector comm_offnode(simd_layout); - std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension - std::vector icoor(_grid->Nd()); - - for(int x=0;x= ld; - comm_any = comm_any | comm_offnode[s]; - comm_proc[s] = shifted_x/ld; + // FIXME call local permute copy if none are offnode. + for(int i=0;i_ostride[dimension]; int sx = (x+sshift)%rd; - - if ( comm_any ) { - for(int i=0;i(rhs,pointers,dimension,sx,cbmask,compress); + std::cout<< "Gathered "<>(permute_type+1)); + int ic= (i&inner_bit)? 1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer + int nbr_ox = (nbr_coor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + assert (sx == nbr_ox); + + if(nbr_proc){ + + std::cout<< "MPI sending "<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + std::cout<< "MPI complete "<(rhs,pointers,dimension,sx,cbmask,compress); - - for(int i=0;iiCoorFromIindex(icoor,i); - s = icoor[dimension]; - - if(comm_offnode[s]){ - - int rank = _grid->_processor; - int recv_from_rank; - int xmit_to_rank; - - _grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); - - - _grid->SendToRecvFrom((void *)&send_buf_extract[i][0], - xmit_to_rank, - (void *)&recv_buf_extract[i][0], - recv_from_rank, - bytes); - - rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; - - } else { - - rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; - - } - - } - - // Permute by swizzling pointers in merge - int permute_slice=0; - int lshift=sshift%ld; - int wrap =lshift/rd; - int num =lshift%rd; - - if ( x< rd-num ) permute_slice=wrap; - else permute_slice = 1-wrap; - - int toggle_bit = (Nsimd>>(permute_type+1)); - int PermuteMap; - for(int i=0;i& coor,int Oindex){ + CoorFromIndex(coor,Oindex,_rdimensions); + } static inline void IndexFromCoor (std::vector& coor,int &index,std::vector &dims){ int nd=dims.size(); int stride=1; @@ -107,9 +110,6 @@ public: stride=stride*dims[d]; } } - inline void oCoorFromOindex (std::vector& coor,int Oindex){ - CoorFromIndex(coor,Oindex,_rdimensions); - } ////////////////////////////////////////////////////////// // SIMD lane addressing diff --git a/lib/cshift/Grid_cshift_common.h b/lib/cshift/Grid_cshift_common.h index 8c4f1e1d..eebc895e 100644 --- a/lib/cshift/Grid_cshift_common.h +++ b/lib/cshift/Grid_cshift_common.h @@ -5,7 +5,7 @@ namespace Grid { ////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Gather_plane_simple (Lattice &rhs,std::vector > &buffer, int dimension,int plane,int cbmask) +template void Gather_plane_simple (const Lattice &rhs,std::vector > &buffer, int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; @@ -49,7 +49,7 @@ template void Gather_plane_simple (Lattice &rhs,std::vector void Gather_plane_extract(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) + template void Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; diff --git a/lib/cshift/Grid_cshift_mpi.h b/lib/cshift/Grid_cshift_mpi.h index 60894ef3..ab9237ad 100644 --- a/lib/cshift/Grid_cshift_mpi.h +++ b/lib/cshift/Grid_cshift_mpi.h @@ -79,6 +79,7 @@ template void Cshift_comms(Lattice &ret,Lattice &rhs,int int fd = rhs._grid->_fdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension]; + int pd = rhs._grid->_processors[dimension]; int simd_layout = rhs._grid->_simd_layout[dimension]; int comm_dim = rhs._grid->_processors[dimension] >1 ; assert(simd_layout==1); @@ -95,11 +96,10 @@ template void Cshift_comms(Lattice &ret,Lattice &rhs,int for(int x=0;x= rd ); - int sx = (x+sshift)%rd; - int comm_proc = (x+sshift)/rd; + int sx = (x+sshift)%rd; + int comm_proc = ((x+sshift)/rd)%pd; - if (!offnode) { + if (comm_proc==0) { Copy_plane(ret,rhs,dimension,x,sx,cbmask); @@ -138,6 +138,7 @@ template void Cshift_comms_simd(Lattice &ret,Lattice &r int fd = grid->_fdimensions[dimension]; int rd = grid->_rdimensions[dimension]; int ld = grid->_ldimensions[dimension]; + int pd = grid->_processors[dimension]; int simd_layout = grid->_simd_layout[dimension]; int comm_dim = grid->_processors[dimension] >1 ; @@ -164,101 +165,58 @@ template void Cshift_comms_simd(Lattice &ret,Lattice &r /////////////////////////////////////////// // Work out what to send where /////////////////////////////////////////// - int cb = (cbmask==0x2)? 1 : 0; int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb); - - std::vector comm_offnode(simd_layout); - std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension - std::vector icoor(grid->Nd()); + // loop over outer coord planes orthog to dim for(int x=0;x= ld; - comm_any = comm_any | comm_offnode[s]; - comm_proc[s] = shifted_x/ld; + // FIXME call local permute copy if none are offnode. + + for(int i=0;i_ostride[dimension]; int sx = (x+sshift)%rd; + Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); - if ( comm_any ) { + for(int i=0;i>(permute_type+1)); + int ic= (i&inner_bit)? 1:0; - for(int i=0;iShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + + rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; + } else { + rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0]; } - Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); - - for(int i=0;iiCoorFromIindex(icoor,i); - s = icoor[dimension]; - - if(comm_offnode[s]){ - - int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); - - - grid->SendToRecvFrom((void *)&send_buf_extract[i][0], - xmit_to_rank, - (void *)&recv_buf_extract[i][0], - recv_from_rank, - bytes); - - rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; - - } else { - - rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; - - } - - } - - // Permute by swizzling pointers in merge - int permute_slice=0; - int lshift=sshift%ld; - int wrap =lshift/rd; - int num =lshift%rd; - - if ( x< rd-num ) permute_slice=wrap; - else permute_slice = 1-wrap; - - int toggle_bit = (Nsimd>>(permute_type+1)); - int PermuteMap; - for(int i=0;i(in,comm_buf,compressor); for(int ss=0;ssoSites();ss++){ @@ -100,10 +105,13 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) vHalfSpinColourVector chi; vHalfSpinColourVector Uchi; vHalfSpinColourVector *chi_p; + + result=zero; + +#if 0 // Xp offset = Stencil._offsets [Xp][ss]; local = Stencil._is_local[Xp][ss]; - chi_p = &comm_buf[offset]; if ( local ) { spProjXp(chi,in._odata[offset]); @@ -112,7 +120,6 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)()); spReconXp(result,Uchi); -#if 0 // Yp offset = Stencil._offsets [Yp][ss]; local = Stencil._is_local[Yp][ss]; @@ -145,6 +152,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) } mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)()); accumReconTp(result,Uchi); +#endif // Xm offset = Stencil._offsets [Xm][ss]; @@ -156,6 +164,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) } mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)()); accumReconXm(result,Uchi); +#if 0 // Ym offset = Stencil._offsets [Ym][ss]; @@ -190,6 +199,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) mult(&(Uchi()),&(Umu._odata[ss](Tm)),&(*chi_p)()); accumReconTm(result,Uchi); #endif + out._odata[ss] = result; } diff --git a/tests/Grid_cshift.cc b/tests/Grid_cshift.cc index 995756d6..c0863965 100644 --- a/tests/Grid_cshift.cc +++ b/tests/Grid_cshift.cc @@ -1,7 +1,6 @@ #include #include -using namespace std; using namespace Grid; using namespace Grid::QCD; @@ -10,7 +9,7 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); std::vector simd_layout({1,1,2,2}); - std::vector mpi_layout ({2,2,2,2}); + std::vector mpi_layout ({2,2,1,4}); std::vector latt_size ({8,8,8,16}); GridCartesian Fine(latt_size,simd_layout,mpi_layout); @@ -71,11 +70,11 @@ int main (int argc, char ** argv) Fine.CoorFromIndex(peer,index,latt_size); if (nrm > 0){ - cout<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<(funcPlus()); - std::cout << "==================================="<< std::endl; std::cout << "Testing vComplexF "< -#include using namespace std; using namespace Grid; @@ -10,13 +9,19 @@ struct scal { d internal; }; + Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT + }; int main (int argc, char ** argv) { Grid_init(&argc,&argv); std::vector simd_layout({1,1,2,2}); - std::vector mpi_layout ({1,1,1,1}); + std::vector mpi_layout ({2,1,1,2}); std::vector latt_size ({8,8,8,8}); GridCartesian Grid(latt_size,simd_layout,mpi_layout); @@ -29,19 +34,37 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=zero; LatticeFermion ref(&Grid); ref=zero; + LatticeFermion tmp(&Grid); tmp=zero; LatticeGaugeField Umu(&Grid); random(pRNG,Umu); std::vector U(4,&Grid); for(int mu=0;mu(Umu,U[mu],mu); + // U[mu] = 1.0; + // pokeIndex<3>(Umu,U[mu],mu); + U[mu] = peekIndex<3>(Umu,mu); } - { - int mu=0; - // ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x - ref = src; - ref = U[0]*Cshift(ref,0,1); + std::vector mask({0,0,0,0,1,0,0,0}); + { // Naive wilson implementation + ref = zero; + for(int mu=0;mu