diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 791d79eb..e2f17d15 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -502,7 +502,6 @@ public: } void AddCopy(void *from,void * to, Integer bytes) { - // std::cout << "Adding CopyReceiveBuffer "<_simd_layout[dimension]>1 && (comm_dim); int rotate_dim = _grid->_simd_layout[dimension]>2; - this->_comms_send[ii] = comm_dim; - this->_comms_recv[ii] = comm_dim; - assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported int sshift[2]; @@ -909,25 +904,30 @@ public: } // Wrap locally dirichlet support case OR node local - if ( (offnode==0) || (comms_recv==0) ) { + if ( offnode==0 ) { int permute_slice=0; CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); - + } else { + if ( comms_recv==0 ) { + + int permute_slice=1; + CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); + + } else { + + ScatterPlane(point,dimension,x,cbmask,_unified_buffer_size,wraparound); // permute/extract/merge is done in comms phase + + } + + } + + if ( offnode ) { int words = buffer_size; if (cbmask != 0x3) words=words>>1; - - // int rank = grid->_processor; - // int recv_from_rank; - // int xmit_to_rank; - - int unified_buffer_offset = _unified_buffer_size; _unified_buffer_size += words; - - ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase - } } } @@ -1058,8 +1058,6 @@ public: int comm_proc = ((x+sshift)/rd)%pd; if (comm_proc) { - - int words = buffer_size; if (cbmask != 0x3) words=words>>1; @@ -1067,64 +1065,69 @@ public: int bytes = words * compress.CommDatumSize(); int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - if ( !face_table_computed ) { - face_table.resize(face_idx+1); - std::vector > face_table_host ; - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host); - face_table[face_idx].resize(face_table_host.size()); - acceleratorCopyToDevice(&face_table_host[0], - &face_table[face_idx][0], - face_table[face_idx].size()*sizeof(face_table_host[0])); - } + int comm_off = u_comm_offset; - // int rank = _grid->_processor; int recv_from_rank; int xmit_to_rank; + cobj *recv_buf; + cobj *send_buf; _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - cobj *recv_buf; - if ( compress.DecompressionStep() ) { - recv_buf=u_simd_recv_buf[0]; - } else { - recv_buf=this->u_recv_buf_p; + if( comms_send ) { + + if ( !face_table_computed ) { + face_table.resize(face_idx+1); + std::vector > face_table_host ; + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); + face_table[face_idx].resize(face_table_host.size()); + acceleratorCopyToDevice(&face_table_host[0], + &face_table[face_idx][0], + face_table[face_idx].size()*sizeof(face_table_host[0])); + } + + + if ( compress.DecompressionStep() ) { + recv_buf=u_simd_recv_buf[0]; + } else { + recv_buf=this->u_recv_buf_p; + } + + send_buf = this->u_send_buf_p; // Gather locally, must send + + //////////////////////////////////////////////////////// + // Gather locally + //////////////////////////////////////////////////////// + assert(send_buf!=NULL); + + Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so); } - - cobj *send_buf; - send_buf = this->u_send_buf_p; // Gather locally, must send - - //////////////////////////////////////////////////////// - // Gather locally - //////////////////////////////////////////////////////// - assert(send_buf!=NULL); - if ( comms_send ) - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); - face_idx++; - - - int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[u_comm_offset],0,bytes,cbmask); + int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask); if ( (!duplicate) ) { // Force comms for now /////////////////////////////////////////////////////////// // Build a list of things to do after we synchronise GPUs // Start comms now??? /////////////////////////////////////////////////////////// - AddPacket((void *)&send_buf[u_comm_offset], - (void *)&recv_buf[u_comm_offset], + AddPacket((void *)&send_buf[comm_off], + (void *)&recv_buf[comm_off], xmit_to_rank, comms_send, recv_from_rank, comms_recv, bytes); } - if ( compress.DecompressionStep() ) { - AddDecompress(&this->u_recv_buf_p[u_comm_offset], - &recv_buf[u_comm_offset], + if ( compress.DecompressionStep() && comms_recv ) { + AddDecompress(&this->u_recv_buf_p[comm_off], + &recv_buf[comm_off], words,Decompressions); } + u_comm_offset+=words; + face_idx++; + } } return 0; @@ -1154,7 +1157,6 @@ public: int permute_type=_grid->PermuteType(dimension); - // std::cout << "SimdNew permute type "< > face_table_host ; - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host); + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host); face_table[face_idx].resize(face_table_host.size()); acceleratorCopyToDevice(&face_table_host[0], &face_table[face_idx][0], @@ -1225,8 +1228,8 @@ public: int nbr_plane = nbr_ic; assert (sx == nbr_ox); - auto rp = &u_simd_recv_buf[i ][u_comm_offset]; - auto sp = &u_simd_send_buf[nbr_plane][u_comm_offset]; + auto rp = &u_simd_recv_buf[i ][comm_off]; + auto sp = &u_simd_send_buf[nbr_plane][comm_off]; if(nbr_proc){ @@ -1252,9 +1255,10 @@ public: } } - AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); + AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); u_comm_offset +=buffer_size; + } } return 0;