diff --git a/lib/Stencil.h b/lib/Stencil.h index f7c90f6b..4818650f 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -163,18 +163,19 @@ namespace Grid { // So tables are the same whether comm_dim or splice_dim sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); + if ( sshift[0] == sshift[1] ) { - Comms(point,dimension,shift,0x3); // std::cout<<"Comms 0x3"< o"<<_entries[i][ss]._offset<<"; l"<< - // _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute< o"<<_entries[i][ss]._offset<<"; l"<< + // _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<Nsimd(); int fd = _grid->_fdimensions[dimension]; int ld = _grid->_ldimensions[dimension]; @@ -241,8 +242,9 @@ namespace Grid { int shift = (shiftpm + fd) %fd; assert(shift>=0); assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; + + int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored + // std::cout << " dim " <PermuteType(dimension); + + int sx = (x+sshift)%rd; + + int offnode = 0; + if ( simd_layout > 1 ) { + + for(int i=0;i>(permute_type+1)); + int ic= (i&inner_bit)? 1:0; + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + if ( nbr_proc ) { + offnode =1; + } + } + + } else { + int comm_proc = ((x+sshift)/rd)%pd; + offnode = (comm_proc!= 0); + // std::cout << "Stencil x "<x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -282,6 +313,7 @@ namespace Grid { int unified_buffer_offset = _unified_buffer_size; _unified_buffer_size += words; + // std::cout<< "Comms dim "<>(permute_type+1)); int ic= (i&inner_bit)? 1:0; @@ -633,10 +664,11 @@ namespace Grid { } } - // Here we don't want to scatter, just place into a buffer. + // std::cout << " CommsSimd ["< void Cshift_comms_simd(Lattice &ret,const Lattice > recv_buf_extract(Nsimd,std::vector(buffer_size) ); int bytes = buffer_size*sizeof(scalar_object); - std::vector pointers(Nsimd); // + std::vector pointers(Nsimd); // std::vector rpointers(Nsimd); // received pointers /////////////////////////////////////////// diff --git a/lib/tensors/Tensor_extract_merge.h b/lib/tensors/Tensor_extract_merge.h index 530d56e4..34562189 100644 --- a/lib/tensors/Tensor_extract_merge.h +++ b/lib/tensors/Tensor_extract_merge.h @@ -176,6 +176,8 @@ void merge(vobj &vec,std::vector &extracted,int scalar_type *pointer; scalar_type *vp = (scalar_type *)&vec; + // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); + for(int w=0;woSites();i++){ int permute_type; StencilEntry *SE; SE = EStencil.GetEntry(permute_type,0,i); - std::cout << "Even source "<< i<<" -> " <_offset << " "<< SE->_is_local< " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type); @@ -187,7 +188,7 @@ int main (int argc, char ** argv) int permute_type; StencilEntry *SE; SE = OStencil.GetEntry(permute_type,0,i); - std::cout << "ODD source "<< i<<" -> " <_offset << " "<< SE->_is_local< " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);