mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Bug fix for stencils with large shifts (3+). This would be important for the Naik term, for example, but did not
impact Wilson-based nearest-neighbour stencils.
This commit is contained in:
parent
841a37f941
commit
145a295231
@ -163,18 +163,19 @@ namespace Grid {
|
||||
// So tables are the same whether comm_dim or splice_dim
|
||||
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
|
||||
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
|
||||
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
Comms(point,dimension,shift,0x3);
|
||||
// std::cout<<"Comms 0x3"<<std::endl;
|
||||
Comms(point,dimension,shift,0x3);
|
||||
} else {
|
||||
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
|
||||
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
|
||||
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
|
||||
}
|
||||
}
|
||||
// for(int ss=0;ss<osites;ss++){
|
||||
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
|
||||
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
|
||||
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
|
||||
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
|
||||
// }
|
||||
}
|
||||
}
|
||||
@ -211,7 +212,6 @@ namespace Grid {
|
||||
wraparound = 1;
|
||||
}
|
||||
|
||||
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int wrap = sshift/rd;
|
||||
@ -228,6 +228,7 @@ namespace Grid {
|
||||
void Comms (int point,int dimension,int shiftpm,int cbmask)
|
||||
{
|
||||
GridBase *grid=_grid;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
@ -242,7 +243,8 @@ namespace Grid {
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
|
||||
// std::cout << " dim " <<dimension<<" buffersize "<<buffer_size<<std::endl;
|
||||
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
|
||||
// sent to one or more remote nodes.
|
||||
|
||||
@ -252,11 +254,40 @@ namespace Grid {
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
int offnode = (comm_proc!= 0);
|
||||
|
||||
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
int offnode = 0;
|
||||
if ( simd_layout > 1 ) {
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int inner_bit = (Nsimd>>(permute_type+1));
|
||||
int ic= (i&inner_bit)? 1:0;
|
||||
int my_coor = rd*ic + x;
|
||||
int nbr_coor = my_coor+sshift;
|
||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||
|
||||
if ( nbr_proc ) {
|
||||
offnode =1;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
offnode = (comm_proc!= 0);
|
||||
// std::cout << "Stencil x "<<x<<" shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<< " comm_proc "<<comm_proc<<" pd "<< pd <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
// Stencil x 1 shift 3 sshift 3 fd 8 rd 2 offnode 0 sx 0 comm_proc 0 pd 2
|
||||
// x+sshift = 4
|
||||
// x+sshift/2 = 2
|
||||
// 2%2 == 0
|
||||
// Problem: sshift is wrong in "rd" for SIMD directions. The complex logic in Cshift_mpi is needed.
|
||||
|
||||
int wraparound=0;
|
||||
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
|
||||
wraparound = 1;
|
||||
@ -282,6 +313,7 @@ namespace Grid {
|
||||
|
||||
int unified_buffer_offset = _unified_buffer_size;
|
||||
_unified_buffer_size += words;
|
||||
// std::cout<< "Comms dim "<<dimension<<" offset "<<unified_buffer_offset<<" size "<<" " << _unified_buffer_size<<std::endl;
|
||||
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
|
||||
|
||||
}
|
||||
@ -440,7 +472,7 @@ namespace Grid {
|
||||
nosplicetime+=usecond();
|
||||
}
|
||||
} else {
|
||||
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
// std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
if(splice_dim){
|
||||
splicetime-=usecond();
|
||||
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
|
||||
@ -595,7 +627,6 @@ namespace Grid {
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
|
||||
int inner_bit = (Nsimd>>(permute_type+1));
|
||||
int ic= (i&inner_bit)? 1:0;
|
||||
|
||||
@ -633,10 +664,11 @@ namespace Grid {
|
||||
}
|
||||
}
|
||||
|
||||
// Here we don't want to scatter, just place into a buffer.
|
||||
// std::cout << " CommsSimd ["<<dimension<<"] offset "<<u_comm_offset<<" buffsize "<<buffer_size <<" unified buffer size "<<_unified_buffer_size<<std::endl;
|
||||
mergetime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int i=0;i<buffer_size;i++){
|
||||
// std::cout<<"buffer loop " << i<<" "<<u_comm_offset+i<<" / "<<_unified_buffer_size<<std::endl;
|
||||
// assert(u_comm_offset+i<_unified_buffer_size);
|
||||
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
|
||||
}
|
||||
|
@ -167,7 +167,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
std::vector<scalar_object *> pointers(Nsimd); //
|
||||
std::vector<scalar_object *> pointers(Nsimd); //
|
||||
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
|
@ -176,6 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
|
||||
scalar_type *pointer;
|
||||
scalar_type *vp = (scalar_type *)&vec;
|
||||
|
||||
// assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
|
||||
|
||||
for(int w=0;w<words;w++){
|
||||
for(int i=0;i<Nextr;i++){
|
||||
for(int ii=0;ii<s;ii++){
|
||||
|
@ -169,12 +169,13 @@ int main (int argc, char ** argv)
|
||||
ECheck.checkerboard = Odd;
|
||||
OCheck.checkerboard = Even;
|
||||
}
|
||||
|
||||
// Implement a stencil code that should agree with that darn cshift!
|
||||
for(int i=0;i<OCheck._grid->oSites();i++){
|
||||
int permute_type;
|
||||
StencilEntry *SE;
|
||||
SE = EStencil.GetEntry(permute_type,0,i);
|
||||
std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
||||
// std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
||||
|
||||
if ( SE->_is_local && SE->_permute )
|
||||
permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type);
|
||||
@ -187,7 +188,7 @@ int main (int argc, char ** argv)
|
||||
int permute_type;
|
||||
StencilEntry *SE;
|
||||
SE = OStencil.GetEntry(permute_type,0,i);
|
||||
std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
||||
// std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
||||
|
||||
if ( SE->_is_local && SE->_permute )
|
||||
permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);
|
||||
|
Loading…
Reference in New Issue
Block a user