1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Bug fix for stencils with large shifts (3+). This would be important for the Naik term, for example, but did not

impact Wilson-based nearest-neighbour stencils.
This commit is contained in:
paboyle 2015-12-30 19:29:48 +00:00
parent 841a37f941
commit 145a295231
4 changed files with 52 additions and 17 deletions

View File

@ -163,13 +163,14 @@ namespace Grid {
// So tables are the same whether comm_dim or splice_dim // So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl; // std::cout<<"Comms 0x3"<<std::endl;
Comms(point,dimension,shift,0x3);
} else { } else {
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
} }
} }
// for(int ss=0;ss<osites;ss++){ // for(int ss=0;ss<osites;ss++){
@ -211,7 +212,6 @@ namespace Grid {
wraparound = 1; wraparound = 1;
} }
int permute_slice=0; int permute_slice=0;
if(permute_dim){ if(permute_dim){
int wrap = sshift/rd; int wrap = sshift/rd;
@ -228,6 +228,7 @@ namespace Grid {
void Comms (int point,int dimension,int shiftpm,int cbmask) void Comms (int point,int dimension,int shiftpm,int cbmask)
{ {
GridBase *grid=_grid; GridBase *grid=_grid;
const int Nsimd = grid->Nsimd();
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension]; int ld = _grid->_ldimensions[dimension];
@ -242,7 +243,8 @@ namespace Grid {
assert(shift>=0); assert(shift>=0);
assert(shift<fd); assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
// std::cout << " dim " <<dimension<<" buffersize "<<buffer_size<<std::endl;
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes. // send to one or more remote nodes.
@ -252,11 +254,40 @@ namespace Grid {
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl; int permute_type=grid->PermuteType(dimension);
int sx = (x+sshift)%rd;
int offnode = 0;
if ( simd_layout > 1 ) {
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
if ( nbr_proc ) {
offnode =1;
}
}
} else {
int comm_proc = ((x+sshift)/rd)%pd;
offnode = (comm_proc!= 0);
// std::cout << "Stencil x "<<x<<" shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<< " comm_proc "<<comm_proc<<" pd "<< pd <<std::endl;
}
// Stencil x 1 shift 3 sshift 3 fd 8 rd 2 offnode 0 sx 0 comm_proc 0 pd 2
// x+sshift = 4
// x+sshift/2 = 2
// 2%2 == 0
// Problem: sshift is wrong in "rd" for SIMD directions. The complex logic in Cshift_mpi is needed.
int wraparound=0; int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) { if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1; wraparound = 1;
@ -282,6 +313,7 @@ namespace Grid {
int unified_buffer_offset = _unified_buffer_size; int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words; _unified_buffer_size += words;
// std::cout<< "Comms dim "<<dimension<<" offset "<<unified_buffer_offset<<" size "<<" " << _unified_buffer_size<<std::endl;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
} }
@ -440,7 +472,7 @@ namespace Grid {
nosplicetime+=usecond(); nosplicetime+=usecond();
} }
} else { } else {
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if(splice_dim){ if(splice_dim){
splicetime-=usecond(); splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
@ -595,7 +627,6 @@ namespace Grid {
for(int i=0;i<Nsimd;i++){ for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1)); int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0; int ic= (i&inner_bit)? 1:0;
@ -633,10 +664,11 @@ namespace Grid {
} }
} }
// Here we don't want to scatter, just place into a buffer. // std::cout << " CommsSimd ["<<dimension<<"] offset "<<u_comm_offset<<" buffsize "<<buffer_size <<" unified buffer size "<<_unified_buffer_size<<std::endl;
mergetime-=usecond(); mergetime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int i=0;i<buffer_size;i++){ for(int i=0;i<buffer_size;i++){
// std::cout<<"buffer loop " << i<<" "<<u_comm_offset+i<<" / "<<_unified_buffer_size<<std::endl;
// assert(u_comm_offset+i<_unified_buffer_size); // assert(u_comm_offset+i<_unified_buffer_size);
merge(u_comm_buf[u_comm_offset+i],rpointers,i); merge(u_comm_buf[u_comm_offset+i],rpointers,i);
} }

View File

@ -176,6 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
scalar_type *pointer; scalar_type *pointer;
scalar_type *vp = (scalar_type *)&vec; scalar_type *vp = (scalar_type *)&vec;
// assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
for(int i=0;i<Nextr;i++){ for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){ for(int ii=0;ii<s;ii++){

View File

@ -169,12 +169,13 @@ int main (int argc, char ** argv)
ECheck.checkerboard = Odd; ECheck.checkerboard = Odd;
OCheck.checkerboard = Even; OCheck.checkerboard = Even;
} }
// Implement a stencil code that should agree with that darn cshift! // Implement a stencil code that should agree with that darn cshift!
for(int i=0;i<OCheck._grid->oSites();i++){ for(int i=0;i<OCheck._grid->oSites();i++){
int permute_type; int permute_type;
StencilEntry *SE; StencilEntry *SE;
SE = EStencil.GetEntry(permute_type,0,i); SE = EStencil.GetEntry(permute_type,0,i);
std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl; // std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
if ( SE->_is_local && SE->_permute ) if ( SE->_is_local && SE->_permute )
permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type); permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type);
@ -187,7 +188,7 @@ int main (int argc, char ** argv)
int permute_type; int permute_type;
StencilEntry *SE; StencilEntry *SE;
SE = OStencil.GetEntry(permute_type,0,i); SE = OStencil.GetEntry(permute_type,0,i);
std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl; // std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
if ( SE->_is_local && SE->_permute ) if ( SE->_is_local && SE->_permute )
permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type); permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);