diff --git a/lib/Grid_comparison.h b/lib/Grid_comparison.h index 530817c6..53dec1a0 100644 --- a/lib/Grid_comparison.h +++ b/lib/Grid_comparison.h @@ -101,12 +101,12 @@ namespace Grid { std::vector vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation std::vector vrhs(vInteger::Nsimd()); vInteger ret; - extract(lhs,vlhs); - extract(rhs,vrhs); + extract(lhs,vlhs); + extract(rhs,vrhs); for(int s=0;s(ret,vlhs); return ret; } inline vInteger operator < (const vInteger & lhs, const vInteger & rhs) diff --git a/lib/Grid_extract.h b/lib/Grid_extract.h new file mode 100644 index 00000000..e29fbee7 --- /dev/null +++ b/lib/Grid_extract.h @@ -0,0 +1,193 @@ +#ifndef GRID_EXTRACT_H +#define GRID_EXTRACT_H +///////////////////////////////////////////////////////////////// +// Generic extract/merge/permute +///////////////////////////////////////////////////////////////// + +namespace Grid{ + +//////////////////////////////////////////////////////////////////////////////////////////////// +// Extract/merge a fundamental vector type, to pointer array with offset +//////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline void extract(typename std::enable_if::notvalue, const vsimd >::type * y, + std::vector &extracted,int offset){ + // FIXME: bounce off memory is painful + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + scalar*buf = (scalar *)y; + for(int i=0;i +inline void merge(typename std::enable_if::notvalue, vsimd >::type * y, + std::vector &extracted,int offset){ + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + scalar *buf =(scalar *) y; + for(int i=0;i +inline void extract(typename std::enable_if::notvalue, const vsimd >::type &y,std::vector &extracted){ + + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + scalar *buf = (scalar *)&y; + for(int i=0;i +inline void merge(typename std::enable_if::notvalue, vsimd >::type &y,std::vector &extracted){ + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + scalar *buf = (scalar *)&y; + + for(int i=0;i +inline void AmergeA(typename std::enable_if::notvalue, vsimd >::type &y,std::vector &extracted){ + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + scalar *buf = (scalar *)&y; + for(int i=0;i inline void extract(const vobj &vec,std::vector &extracted) +{ + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int Nsimd=vobj::vector_type::Nsimd(); + const int words=sizeof(vobj)/sizeof(vector_type); + + extracted.resize(Nsimd); + + std::vector pointers(Nsimd); + for(int i=0;i(&vp[w],pointers,w); + } +} +//////////////////////////////////////////////////////////////////////// +// Extract to a bunch of scalar object pointers, with offset +//////////////////////////////////////////////////////////////////////// +template inline +void extract(const vobj &vec,std::vector &extracted, int offset) +{ + + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int words=sizeof(vobj)/sizeof(vector_type); + const int Nsimd=vobj::vector_type::Nsimd(); + + assert(extracted.size()==Nsimd); + + std::vector pointers(Nsimd); + for(int i=0;i(&vp[w],pointers,w); + } +} + +//////////////////////////////////////////////////////////////////////// +// Merge a contiguous array of scalar objects +//////////////////////////////////////////////////////////////////////// +template inline +void merge(vobj &vec,std::vector &extracted) +{ + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int Nsimd=vobj::vector_type::Nsimd(); + const int words=sizeof(vobj)/sizeof(vector_type); + + assert(extracted.size()==Nsimd); + + std::vector pointers(Nsimd); + for(int i=0;i(&vp[w],pointers,w); + } +} + +//////////////////////////////////////////////////////////////////////// +// Merge a bunch of different scalar object pointers, with offset +//////////////////////////////////////////////////////////////////////// +template inline +void merge(vobj &vec,std::vector &extracted,int offset) +{ + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int Nsimd=vobj::vector_type::Nsimd(); + const int words=sizeof(vobj)/sizeof(vector_type); + + assert(extracted.size()==Nsimd); + + std::vector pointers(Nsimd); + for(int i=0;i(&vp[w],pointers,w); + } +} +} +#endif diff --git a/lib/Grid_lattice.h b/lib/Grid_lattice.h index 7256459c..c76fa0a9 100644 --- a/lib/Grid_lattice.h +++ b/lib/Grid_lattice.h @@ -95,6 +95,7 @@ public: #include #include #include +#include #include #include #include diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h index a059cc23..4e33604d 100644 --- a/lib/Grid_simd.h +++ b/lib/Grid_simd.h @@ -133,68 +133,6 @@ namespace Grid { #endif -///////////////////////////////////////////////////////////////// -// Generic extract/merge/permute -///////////////////////////////////////////////////////////////// - -template -inline void Gextract(const vsimd &y,std::vector &extracted){ - // FIXME: bounce off memory is painful - int Nextr=extracted.size(); - int Nsimd=vsimd::Nsimd(); - int s=Nsimd/Nextr; - - std::vector > buf(Nsimd); - - vstore(y,&buf[0]); - for(int i=0;i -inline void Gextract(const vsimd &y,std::vector &extracted){ - int Nextr=extracted.size(); - int Nsimd=vsimd::Nsimd(); - int s=Nsimd/Nextr; - - std::vector > buf(Nsimd); - - vstore(y,&buf[0]); - for(int i=0;i -inline void Gmerge(vsimd &y,std::vector &extracted){ - int Nextr=extracted.size(); - int Nsimd=vsimd::Nsimd(); - int s=Nsimd/Nextr; - - std::vector buf(Nsimd); - for(int i=0;i -inline void Gmerge(vsimd &y,std::vector &extracted){ - int Nextr=extracted.size(); - int Nsimd=vsimd::Nsimd(); - int s=Nsimd/Nextr; - - std::vector buf(Nsimd); - for(int i=0;i BA DC FE HG diff --git a/lib/Grid_stencil.h b/lib/Grid_stencil.h index afb11db6..fdc8ece9 100644 --- a/lib/Grid_stencil.h +++ b/lib/Grid_stencil.h @@ -48,100 +48,6 @@ namespace Grid { } ; -/////////////////////////////////////////////////////////////////// -// Gather for when there is no need to SIMD split with compression -/////////////////////////////////////////////////////////////////// -template void -Gather_plane_simple (const Lattice &rhs,std::vector > &buffer,int dimension,int plane,int cbmask,compressor &compress) -{ - int rd = rhs._grid->_rdimensions[dimension]; - - if ( !rhs._grid->CheckerBoarded(dimension) ) { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - buffer[bo++]=compress(rhs._odata[so+o+b]); - } - o +=rhs._grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb &cbmask ) { - buffer[bo]=compress(rhs._odata[so+o+b]); - bo++; - } - - } - o +=rhs._grid->_slice_stride[dimension]; - } - } -} - -/////////////////////////////////////////////////////////////////// -// Gather for when there *is* need to SIMD split with compression -/////////////////////////////////////////////////////////////////// -template void -Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask,compressor &compress) -{ - int rd = rhs._grid->_rdimensions[dimension]; - - if ( !rhs._grid->CheckerBoarded(dimension) ) { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - cobj temp; - temp=compress(rhs._odata[so+o+b]); - extract(temp,pointers); - } - o +=rhs._grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b); - if ( ocb & cbmask ) { - cobj temp; - temp =compress(rhs._odata[so+o+b]); - extract(temp,pointers); - } - - } - o +=rhs._grid->_slice_stride[dimension]; - } - } -} - - class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. public: @@ -184,6 +90,7 @@ Gather_plane_extract(const Lattice &rhs,std::vector void HaloExchange(const Lattice &source,std::vector > &u_comm_buf,compressor &compress) { + std::cout<< "HaloExchange comm_buf.size()="<< u_comm_buf.size()<<" unified_buffer_size"<< _unified_buffer_size<< std::endl; // conformable(source._grid,_grid); assert(source._grid==_grid); if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size); @@ -234,6 +141,7 @@ Gather_plane_extract(const Lattice &rhs,std::vector @@ -318,6 +226,7 @@ Gather_plane_extract(const Lattice &rhs,std::vector_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -340,12 +249,12 @@ Gather_plane_extract(const Lattice &rhs,std::vector > send_buf_extract(Nsimd,std::vector(buffer_size*words) ); - std::vector > recv_buf_extract(Nsimd,std::vector(buffer_size*words) ); - int bytes = buffer_size*words*sizeof(scalar_type); + std::vector > send_buf_extract(Nsimd,std::vector(buffer_size) ); + std::vector > recv_buf_extract(Nsimd,std::vector(buffer_size) ); + int bytes = buffer_size*sizeof(scalar_object); - std::vector pointers(Nsimd); // - std::vector rpointers(Nsimd); // received pointers + std::vector pointers(Nsimd); // + std::vector rpointers(Nsimd); // received pointers /////////////////////////////////////////// // Work out what to send where @@ -353,62 +262,77 @@ Gather_plane_extract(const Lattice &rhs,std::vectorCheckerBoardShift(rhs.checkerboard,dimension,shift,cb); - + // loop over outer coord planes orthog to dim for(int x=0;x(rhs,pointers,dimension,sx,cbmask,compress); - std::cout<< "Gathered "<>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer - int nbr_ox = (nbr_coor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - int recv_from_rank; - int xmit_to_rank; - - if (nbr_ic) nbr_lane|=inner_bit; - assert (sx == nbr_ox); - - if(nbr_proc){ - - std::cout<< "MPI sending "<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], - xmit_to_rank, - (void *)&recv_buf_extract[i][0], - recv_from_rank, - bytes); - std::cout<< "MPI complete "<= rd ); + std::cout<<"any_offnode ="<(rhs,pointers,dimension,sx,cbmask,compress); + std::cout<< "Gathered "< icoor; + _grid->iCoorFromIindex(icoor,i); - // Here we don't want to scatter, just place into a buffer. - std::cout<< "merging "<>(permute_type+1)); + int ic= (i&inner_bit)? 1:0; + assert(ic==icoor[dimension]); + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + int nbr_lcoor= (nbr_coor%ld); + int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer + int nbr_ox = (nbr_lcoor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + assert (sx == nbr_ox); + + std::cout<<"nbr_proc "<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + std::cout<< "MPI complete "< &ret,const Lattice &predicate,Lattice &ret,const Lattice &predicate,Lattice mask(Nsimd); - std::vector > truevals (Nsimd,std::vector(words) ); - std::vector > falsevals(Nsimd,std::vector(words) ); - std::vector pointers(Nsimd); + std::vector truevals (Nsimd); + std::vector falsevals(Nsimd); #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ - for(int s=0;s(TensorRemove(predicate._odata[ss]),mask); for(int s=0;s +class SimpleCompressor { +public: + void Point(int) {}; + + vobj operator() (const vobj &arg) { + return arg; + } +}; + +/////////////////////////////////////////////////////////////////// +// Gather for when there is no need to SIMD split with compression +/////////////////////////////////////////////////////////////////// +template void +Gather_plane_simple (const Lattice &rhs,std::vector > &buffer,int dimension,int plane,int cbmask,compressor &compress) +{ + int rd = rhs._grid->_rdimensions[dimension]; + + if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask = 0x3; + } + + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + +#pragma omp parallel for collapse(2) + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb &cbmask ) { + buffer[bo]=compress(rhs._odata[so+o+b]); + bo++; + } + } + o +=rhs._grid->_slice_stride[dimension]; + } +} + + +/////////////////////////////////////////////////////////////////// +// Gather for when there *is* need to SIMD split with compression +/////////////////////////////////////////////////////////////////// +template void +Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask,compressor &compress) +{ + int rd = rhs._grid->_rdimensions[dimension]; + + if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask = 0x3; + } + + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer + +#pragma omp parallel for collapse(2) + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int offset = b+n*rhs._grid->_slice_block[dimension]; + + int ocb=1<CheckerBoardFromOindex(o+b); + if ( ocb & cbmask ) { + cobj temp; + temp =compress(rhs._odata[so+o+b]); + extract(temp,pointers,offset); + } + + } + o +=rhs._grid->_slice_stride[dimension]; + } +} + ////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split ////////////////////////////////////////////////////// template void Gather_plane_simple (const Lattice &rhs,std::vector > &buffer, int dimension,int plane,int cbmask) { - int rd = rhs._grid->_rdimensions[dimension]; - - if ( !rhs._grid->CheckerBoarded(dimension) ) { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - buffer[bo++]=rhs._odata[so+o+b]; - } - o +=rhs._grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb &cbmask ) { - buffer[bo]=rhs._odata[so+o+b]; - bo++; - } - - } - o +=rhs._grid->_slice_stride[dimension]; - } - } + SimpleCompressor dontcompress; + Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); } ////////////////////////////////////////////////////// // Gather for when there *is* need to SIMD split ////////////////////////////////////////////////////// - template void Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) +template void Gather_plane_extract(const Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { - int rd = rhs._grid->_rdimensions[dimension]; - - if ( !rhs._grid->CheckerBoarded(dimension) ) { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - extract(rhs._odata[so+o+b],pointers); - } - o +=rhs._grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b); - if ( ocb & cbmask ) { - extract(rhs._odata[so+o+b],pointers); - } - - } - o +=rhs._grid->_slice_stride[dimension]; - } - } + SimpleCompressor dontcompress; + Gather_plane_extract(rhs,pointers,dimension,plane,cbmask,dontcompress); } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,std::vector > &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,std::vector > &buffer, int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask=0x3; + } - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - rhs._odata[so+o+b]=buffer[bo++]; - } - o +=rhs._grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer #pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup - if ( ocb & cbmask ) { - rhs._odata[so+o+b]=buffer[bo++]; - } + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup + if ( ocb & cbmask ) { + rhs._odata[so+o+b]=buffer[bo++]; } - o +=rhs._grid->_slice_stride[dimension]; + } + o +=rhs._grid->_slice_stride[dimension]; } } ////////////////////////////////////////////////////// // Scatter for when there *is* need to SIMD split ////////////////////////////////////////////////////// - template void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) + template void Scatter_plane_merge(Lattice &rhs,std::vector pointers,int dimension,int plane,int cbmask) { int rd = rhs._grid->_rdimensions[dimension]; if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask=0x3; + } - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer - - // Simple block stride gather of SIMD objects -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - merge(rhs._odata[so+o+b],pointers); - } - o +=rhs._grid->_slice_stride[dimension]; - } - - } else { - - int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - int bo = 0; // offset in buffer + int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + int bo = 0; // offset in buffer #pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b); - if ( ocb&cbmask ) { - merge(rhs._odata[so+o+b],pointers); - } + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + int offset = b+n*rhs._grid->_slice_block[dimension]; + int ocb=1<CheckerBoardFromOindex(o+b); + if ( ocb&cbmask ) { + merge(rhs._odata[so+o+b],pointers,offset); } - o +=rhs._grid->_slice_stride[dimension]; + } + o +=rhs._grid->_slice_stride[dimension]; } } @@ -183,40 +162,26 @@ template void Copy_plane(Lattice& lhs,Lattice &rhs, int int rd = rhs._grid->_rdimensions[dimension]; if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask=0x3; + } - int o = 0; // relative offset to base within plane - int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*lhs._grid->_ostride[dimension]; // offset in buffer - // Simple block stride gather of SIMD objects + int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + #pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int ocb=1<CheckerBoardFromOindex(o+b); + + if ( ocb&cbmask ) { lhs._odata[lo+o+b]=rhs._odata[ro+o+b]; } - o +=rhs._grid->_slice_stride[dimension]; + } - - } else { - - int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b); - - if ( ocb&cbmask ) { - lhs._odata[lo+o+b]=rhs._odata[ro+o+b]; - } - - } - o +=rhs._grid->_slice_stride[dimension]; - } - + o +=rhs._grid->_slice_stride[dimension]; } } @@ -224,42 +189,25 @@ template void Copy_plane_permute(Lattice& lhs,Lattice &r { int rd = rhs._grid->_rdimensions[dimension]; - if ( !rhs._grid->CheckerBoarded(dimension) ) { + cbmask=0x3; + } - int o = 0; // relative offset to base within plane - int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*rhs._grid->_ostride[dimension]; // offset in buffer - - // Simple block stride gather of SIMD objects + int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane + int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane + int o = 0; // relative offset to base within plane + #pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ + for(int n=0;n_slice_nblock[dimension];n++){ + for(int b=0;b_slice_block[dimension];b++){ + + int ocb=1<CheckerBoardFromOindex(o+b); + if ( ocb&cbmask ) { permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type); } - o +=rhs._grid->_slice_stride[dimension]; + } - - } else { - - int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane - int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane - int o = 0; // relative offset to base within plane - -#pragma omp parallel for collapse(2) - for(int n=0;n_slice_nblock[dimension];n++){ - for(int b=0;b_slice_block[dimension];b++){ - - int ocb=1<CheckerBoardFromOindex(o+b); - - if ( ocb&cbmask ) { - permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type); - } - - } - o +=rhs._grid->_slice_stride[dimension]; - } - + o +=rhs._grid->_slice_stride[dimension]; } } diff --git a/lib/cshift/Grid_cshift_mpi.h b/lib/cshift/Grid_cshift_mpi.h index ab9237ad..7c77ebc9 100644 --- a/lib/cshift/Grid_cshift_mpi.h +++ b/lib/cshift/Grid_cshift_mpi.h @@ -133,6 +133,7 @@ template void Cshift_comms_simd(Lattice &ret,Lattice &r GridBase *grid=rhs._grid; const int Nsimd = grid->Nsimd(); typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_type scalar_type; int fd = grid->_fdimensions[dimension]; @@ -155,12 +156,12 @@ template void Cshift_comms_simd(Lattice &ret,Lattice &r int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd,std::vector(buffer_size*words) ); - std::vector > recv_buf_extract(Nsimd,std::vector(buffer_size*words) ); - int bytes = buffer_size*words*sizeof(scalar_type); + std::vector > send_buf_extract(Nsimd,std::vector(buffer_size) ); + std::vector > recv_buf_extract(Nsimd,std::vector(buffer_size) ); + int bytes = buffer_size*sizeof(scalar_object); - std::vector pointers(Nsimd); // - std::vector rpointers(Nsimd); // received pointers + std::vector pointers(Nsimd); // + std::vector rpointers(Nsimd); // received pointers /////////////////////////////////////////// // Work out what to send where @@ -171,10 +172,9 @@ template void Cshift_comms_simd(Lattice &ret,Lattice &r // loop over outer coord planes orthog to dim for(int x=0;x void Cshift_comms_simd(Lattice &ret,Lattice &r recv_from_rank, bytes); - rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; + rpointers[i] = &recv_buf_extract[i][0]; } else { - rpointers[i] = (scalar_type *)&send_buf_extract[nbr_lane][0]; + rpointers[i] = &send_buf_extract[nbr_lane][0]; } } diff --git a/lib/lattice/Grid_lattice_coordinate.h b/lib/lattice/Grid_lattice_coordinate.h index 8fba8b1b..a9b381fa 100644 --- a/lib/lattice/Grid_lattice_coordinate.h +++ b/lib/lattice/Grid_lattice_coordinate.h @@ -5,21 +5,23 @@ namespace Grid { template inline void LatticeCoordinate(Lattice &l,int mu) { + typedef typename iobj::scalar_object scalar_object; typedef typename iobj::scalar_type scalar_type; typedef typename iobj::vector_type vector_type; + GridBase *grid = l._grid; int Nsimd = grid->iSites(); + std::vector gcoor; std::vector mergebuf(Nsimd); - std::vector mergeptr(Nsimd); + vector_type vI; for(int o=0;ooSites();o++){ for(int i=0;iiSites();i++){ grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); - mergebuf[i]=gcoor[mu]; - mergeptr[i]=&mergebuf[i]; + mergebuf[i]=(Integer)gcoor[mu]; } - merge(vI,mergeptr); + AmergeA(vI,mergebuf); l._odata[o]=vI; } }; diff --git a/lib/lattice/Grid_lattice_peekpoke.h b/lib/lattice/Grid_lattice_peekpoke.h index 93245016..ae87c5d8 100644 --- a/lib/lattice/Grid_lattice_peekpoke.h +++ b/lib/lattice/Grid_lattice_peekpoke.h @@ -94,15 +94,12 @@ namespace Grid { grid->Broadcast(grid->BossRank(),s); std::vector buf(Nsimd); - std::vector pointers(Nsimd); // extract-modify-merge cycle is easiest way and this is not perf critical if ( rank == grid->ThisRank() ) { - for(int i=0;iGlobalCoorToRankIndex(rank,odx,idx,site); - std::vector buf(Nsimd); - std::vector pointers(Nsimd); - for(int i=0;i buf(Nsimd); + extract(l._odata[odx],buf); + s = buf[idx]; + grid->Broadcast(rank,s); return; @@ -160,10 +156,8 @@ namespace Grid { odx= grid->oIndex(site); std::vector buf(Nsimd); - std::vector pointers(Nsimd); - for(int i=0;ioIndex(site); std::vector buf(Nsimd); - std::vector pointers(Nsimd); - for(int i=0;i buf(Nsimd); - std::vector pointers(Nsimd); - for(int i=0;i void fillScalar(scalar &s,distribution &dist,generator & gen) + { + s=dist(gen); + } + template void fillScalar(ComplexF &s,distribution &dist, generator &gen) + { + s=ComplexF(dist(gen),dist(gen)); + } + template void fillScalar(ComplexD &s,distribution &dist,generator &gen) + { + s=ComplexD(dist(gen),dist(gen)); + } + class GridRNGbase { public: @@ -64,20 +77,6 @@ namespace Grid { } - // real scalars are one component - template void fillScalar(scalar &s,distribution &dist) - { - s=dist(_generators[0]); - } - template void fillScalar(ComplexF &s,distribution &dist) - { - s=ComplexF(dist(_generators[0]),dist(_generators[0])); - } - template void fillScalar(ComplexD &s,distribution &dist) - { - s=ComplexD(dist(_generators[0]),dist(_generators[0])); - } - template inline void fill(sobj &l,distribution &dist){ @@ -88,7 +87,7 @@ namespace Grid { scalar_type *buf = (scalar_type *) & l; for(int idx=0;idx inline void fill(ComplexF &l,distribution &dist){ - fillScalar(l,dist); + fillScalar(l,dist,_generators[0]); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); } template inline void fill(ComplexD &l,distribution &dist){ - fillScalar(l,dist); + fillScalar(l,dist,_generators[0]); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); } template inline void fill(RealF &l,distribution &dist){ - fillScalar(l,dist); + fillScalar(l,dist,_generators[0]); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); } template inline void fill(RealD &l,distribution &dist){ - fillScalar(l,dist); + fillScalar(l,dist,_generators[0]); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); } // vector fill template inline void fill(vComplexF &l,distribution &dist){ RealF *pointer=(RealF *)&l; for(int i=0;i<2*vComplexF::Nsimd();i++){ - fillScalar(pointer[i],dist); + fillScalar(pointer[i],dist,_generators[0]); } CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); } template inline void fill(vComplexD &l,distribution &dist){ RealD *pointer=(RealD *)&l; for(int i=0;i<2*vComplexD::Nsimd();i++){ - fillScalar(pointer[i],dist); + fillScalar(pointer[i],dist,_generators[0]); } CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); } template inline void fill(vRealF &l,distribution &dist){ RealF *pointer=(RealF *)&l; for(int i=0;i inline void fill(vRealD &l,distribution &dist){ RealD *pointer=(RealD *)&l; for(int i=0;i gcoor; - for(int gidx=0;gidx<_grid->_gsites;gidx++){ + int gsites = _grid->_gsites; + + typename source::result_type init = src(); + std::ranlux48 pseeder(init); + std::uniform_int_distribution ui; + + for(int gidx=0;gidxGlobalIndexToGlobalCoor(gidx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); int l_idx=generator_idx(o_idx,i_idx); + + std::vector site_seeds(4); + for(int i=0;i<4;i++){ + site_seeds[i]= ui(pseeder); + } - typename source::result_type init = src(); + _grid->Broadcast(0,(void *)&site_seeds[0],sizeof(int)*site_seeds.size()); - _grid->Broadcast(0,(void *)&init,sizeof(init)); if( rank == _grid->ThisRank() ){ - _generators[l_idx] = std::ranlux48(init); + fixedSeed ssrc(site_seeds); + typename source::result_type sinit = ssrc(); + _generators[l_idx] = std::ranlux48(sinit); } } _seeded=1; @@ -210,6 +222,7 @@ namespace Grid { template inline void fill(Lattice &l,distribution &dist){ + typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -217,25 +230,22 @@ namespace Grid { int Nsimd =_grid->Nsimd(); int osites=_grid->oSites(); + int words=sizeof(scalar_object)/sizeof(scalar_type); - int words = sizeof(vobj)/sizeof(vector_type); - std::vector > buf(Nsimd,std::vector(words)); - std::vector pointers(Nsimd); + std::vector buf(Nsimd); for(int ss=0;ss &out,const iScalar &in,int permutetype){ permute(out._internal,in._internal,permutetype); } - friend void extract(const iScalar &in,std::vector &out){ - extract(in._internal,out); // extract advances the pointers in out - } - friend void merge(iScalar &in,std::vector &out){ - merge(in._internal,out); // extract advances the pointers in out - } // Unary negation friend inline iScalar operator -(const iScalar &r) { @@ -149,16 +143,6 @@ public: permute(out._internal[i],in._internal[i],permutetype); } } - friend void extract(const iVector &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ - for(int i=0;i operator -(const iVector &r) { iVector ret; @@ -232,18 +216,6 @@ public: permute(out._internal[i][j],in._internal[i][j],permutetype); }} } - friend void extract(const iMatrix &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ - for(int i=0;i operator -(const iMatrix &r) { iMatrix ret; @@ -285,37 +257,6 @@ public: }; -template inline -void extract(const vobj &vec,std::vector &extracted) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; - - int Nsimd=vobj::vector_type::Nsimd(); - - extracted.resize(Nsimd); - - std::vector pointers(Nsimd); - for(int i=0;i inline -void merge(vobj &vec,std::vector &extracted) -{ - typedef typename vobj::scalar_type scalar_type ; - typedef typename vobj::vector_type vector_type ; - - int Nsimd=vobj::vector_type::Nsimd(); - assert(extracted.size()==Nsimd); - - std::vector pointers(Nsimd); - for(int i=0;i class GridTypeMapper { + public: + typedef Integer scalar_type; + typedef Integer vector_type; + typedef Integer tensor_reduced; + typedef Integer scalar_object; + enum { TensorLevel = 0 }; + }; template<> class GridTypeMapper { public: @@ -99,10 +107,10 @@ namespace Grid { }; template<> class GridTypeMapper { public: - typedef Integer scalar_type; + typedef Integer scalar_type; typedef vInteger vector_type; typedef vInteger tensor_reduced; - typedef Integer scalar_object; + typedef Integer scalar_object; enum { TensorLevel = 0 }; }; diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc index b1d6d525..1e2741b4 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.cc +++ b/lib/qcd/Grid_qcd_wilson_dop.cc @@ -99,106 +99,159 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out) for(int ss=0;ssoSites();ss++){ - int offset,local; + int offset,local,perm, ptype; vSpinColourVector result; vHalfSpinColourVector chi; + vHalfSpinColourVector tmp; vHalfSpinColourVector Uchi; vHalfSpinColourVector *chi_p; result=zero; -#if 0 // Xp offset = Stencil._offsets [Xp][ss]; local = Stencil._is_local[Xp][ss]; + perm = Stencil._permute[Xp][ss]; + ptype = Stencil._permute_type[Xp]; chi_p = &comm_buf[offset]; if ( local ) { - spProjXp(chi,in._odata[offset]); chi_p = χ - } + spProjXp(chi,in._odata[offset]); + if ( perm ) { + permute(tmp,chi,ptype); + chi_p = &tmp; + } + } mult(&(Uchi()),&(Umu._odata[ss](Xp)),&(*chi_p)()); spReconXp(result,Uchi); // Yp offset = Stencil._offsets [Yp][ss]; local = Stencil._is_local[Yp][ss]; + perm = Stencil._permute[Yp][ss]; + ptype = Stencil._permute_type[Yp]; chi_p = &comm_buf[offset]; if ( local ) { - spProjYp(chi,in._odata[offset]); chi_p = χ - } + spProjYp(chi,in._odata[offset]); + if ( perm ) { + permute(tmp,chi,ptype); + chi_p = &tmp; + } + } mult(&(Uchi()),&(Umu._odata[ss](Yp)),&(*chi_p)()); accumReconYp(result,Uchi); // Zp offset = Stencil._offsets [Zp][ss]; local = Stencil._is_local[Zp][ss]; + perm = Stencil._permute[Zp][ss]; + ptype = Stencil._permute_type[Zp]; chi_p = &comm_buf[offset]; + if ( local ) { - spProjZp(chi,in._odata[offset]); chi_p = χ - } - mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)() ); + spProjZp(chi,in._odata[offset]); + if ( perm ) { + permute(tmp,chi,ptype); + chi_p = &tmp; + } + } + mult(&(Uchi()),&(Umu._odata[ss](Zp)),&(*chi_p)()); accumReconZp(result,Uchi); // Tp offset = Stencil._offsets [Tp][ss]; local = Stencil._is_local[Tp][ss]; + perm = Stencil._permute[Tp][ss]; + ptype = Stencil._permute_type[Tp]; chi_p = &comm_buf[offset]; + if ( local ) { - spProjTp(chi,in._odata[offset]); chi_p = χ - } + spProjTp(chi,in._odata[offset]); + if ( perm ) { + permute(tmp,chi,ptype); + chi_p = &tmp; + } + } mult(&(Uchi()),&(Umu._odata[ss](Tp)),&(*chi_p)()); accumReconTp(result,Uchi); -#endif // Xm offset = Stencil._offsets [Xm][ss]; local = Stencil._is_local[Xm][ss]; + perm = Stencil._permute[Xm][ss]; + ptype = Stencil._permute_type[Xm]; chi_p = &comm_buf[offset]; if ( local ) { - spProjXm(chi,in._odata[offset]); chi_p = χ - } + spProjXm(chi,in._odata[offset]); + if ( perm ) { + permute(tmp,chi,ptype); + chi_p = &tmp; + } + } + std::cout<<"Xm for site "<(y,b,perm); @@ -183,6 +184,7 @@ namespace Grid { { Gextract(y,extracted); } + */ /////////////////////// // Splat diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h index fb9c5a67..c42e4029 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Grid_vComplexF.h @@ -412,6 +412,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ { Gpermute(y,b,perm); } + /* friend inline void merge(vComplexF &y,std::vector &extracted) { Gmerge(y,extracted); @@ -428,7 +429,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){ { Gextract(y,extracted); } - + */ }; diff --git a/lib/simd/Grid_vInteger.h b/lib/simd/Grid_vInteger.h index 5dc77444..6035aea1 100644 --- a/lib/simd/Grid_vInteger.h +++ b/lib/simd/Grid_vInteger.h @@ -221,6 +221,7 @@ namespace Grid { { Gpermute(y,b,perm); } + /* friend inline void merge(vInteger &y,std::vector &extracted) { Gmerge(y,extracted); @@ -237,7 +238,7 @@ namespace Grid { { Gextract(y,extracted); } - + */ public: static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);} diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h index f6d68963..36e189e1 100644 --- a/lib/simd/Grid_vRealD.h +++ b/lib/simd/Grid_vRealD.h @@ -105,6 +105,7 @@ namespace Grid { // all subtypes; may not be a good assumption, but could // add the vector width as a template param for BG/Q for example //////////////////////////////////////////////////////////////////// + /* friend inline void permute(vRealD &y,vRealD b,int perm) { Gpermute(y,b,perm); @@ -125,7 +126,7 @@ namespace Grid { { Gextract(y,extracted); } - + */ friend inline void vsplat(vRealD &ret,double a){ #if defined (AVX1)|| defined (AVX2) diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Grid_vRealF.h index 86895492..70f76bc0 100644 --- a/lib/simd/Grid_vRealF.h +++ b/lib/simd/Grid_vRealF.h @@ -127,6 +127,7 @@ namespace Grid { // all subtypes; may not be a good assumption, but could // add the vector width as a template param for BG/Q for example //////////////////////////////////////////////////////////////////// + /* friend inline void permute(vRealF &y,vRealF b,int perm) { Gpermute(y,b,perm); @@ -147,7 +148,7 @@ namespace Grid { { Gextract(y,extracted); } - + */ ///////////////////////////////////////////////////// diff --git a/lib/stencil/Grid_stencil_common.cc b/lib/stencil/Grid_stencil_common.cc index c245aafb..fda4ef2e 100644 --- a/lib/stencil/Grid_stencil_common.cc +++ b/lib/stencil/Grid_stencil_common.cc @@ -118,6 +118,7 @@ namespace Grid { int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; + int pd = _grid->_processors[dimension]; int simd_layout = _grid->_simd_layout[dimension]; int comm_dim = _grid->_processors[dimension] >1 ; @@ -136,10 +137,10 @@ namespace Grid { for(int x=0;x= rd ); + int comm_proc = ((x+sshift)/rd)%pd; + int offnode = (comm_proc!=0); int sx = (x+sshift)%rd; - int comm_proc = (x+sshift)/rd; - + if (!offnode) { int permute_slice=0; diff --git a/tests/Grid_simd.cc b/tests/Grid_simd.cc index 364b13e8..1de2ad1f 100644 --- a/tests/Grid_simd.cc +++ b/tests/Grid_simd.cc @@ -75,9 +75,9 @@ void Tester(const functor &func) random(sRNG,result[i]); } - Gmerge(v_input1,input1); - Gmerge(v_input2,input2); - Gmerge(v_result,result); + merge(v_input1,input1); + merge(v_input2,input2); + merge(v_result,result); func(v_result,v_input1,v_input2); @@ -85,7 +85,7 @@ void Tester(const functor &func) func(reference[i],input1[i],input2[i]); } - Gextract(v_result,result); + extract(v_result,result); std::cout << " " << func.name()< -class SimpleCompressor { -public: - void Point(int) {}; - - vobj operator() (const vobj &arg) { - return arg; - } -}; int main (int argc, char ** argv) { Grid_init(&argc,&argv); std::vector simd_layout({1,1,2,2}); - std::vector mpi_layout ({2,2,2,2}); + std::vector mpi_layout ({2,2,1,2}); std::vector latt_size ({8,8,8,8}); double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; @@ -26,7 +17,9 @@ int main (int argc, char ** argv) GridCartesian Fine(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout); GridParallelRNG fRNG(&Fine); - fRNG.SeedRandomDevice(); + // fRNG.SeedRandomDevice(); + std::vector seeds({1,2,3,4}); + fRNG.SeedFixedIntegers(seeds); LatticeColourMatrix Foo(&Fine); LatticeColourMatrix Bar(&Fine); @@ -38,8 +31,9 @@ int main (int argc, char ** argv) for(int dir=0;dir<4;dir++){ - for(int disp=0;disp directions(npoint,dir); @@ -47,22 +41,13 @@ int main (int argc, char ** argv) CartesianStencil myStencil(&Fine,npoint,0,directions,displacements); - printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp); std::vector ocoor(4); for(int o=0;o > comm_buf(myStencil._unified_buffer_size); - printf("calling halo exchange\n");fflush(stdout); SimpleCompressor compress; myStencil.HaloExchange(Foo,comm_buf,compress); @@ -81,14 +66,12 @@ int main (int argc, char ** argv) Check._odata[i] = Foo._odata[offset]; else Check._odata[i] = comm_buf[offset]; - - } Real nrmC = norm2(Check); Real nrmB = norm2(Bar); Real nrm = norm2(Check-Bar); - printf("N2diff = %le (%le, %le) \n",nrm,nrmC,nrmB);fflush(stdout); + std::cout<<"N2diff ="< 0){ - printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n", + printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n", coor[0],coor[1],coor[2],coor[3],r,c, nn, real(check()()(r,c)), + imag(check()()(r,c)), real(bar()()(r,c)) ); } @@ -124,7 +108,7 @@ int main (int argc, char ** argv) }}}} - printf("scalar N2diff = %le (%le, %le) \n",snrm,snrmC,snrmB);fflush(stdout); + std::cout<<"scalar N2diff = "< seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); + // std::vector seeds({1,2,3,4}); // pRNG.SeedFixedIntegers(seeds); pRNG.SeedRandomDevice(); @@ -44,7 +45,7 @@ int main (int argc, char ** argv) U[mu] = peekIndex<3>(Umu,mu); } - std::vector mask({0,0,0,0,1,0,0,0}); + std::vector mask({1,1,1,1,1,1,1,1}); { // Naive wilson implementation ref = zero; for(int mu=0;mu