Bug fix for stencils with large shifts (3+). This would be important for the Naik term, for example, but did not impact Wilson-based nearest-neighbour stencils.
2025-08-01 20:27:07 +01:00 · 2015-12-30 19:29:48 +00:00
parent 841a37f941
commit 145a295231
4 changed files with 52 additions and 17 deletions
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -163,13 +163,14 @@ namespace Grid {
 	  //        So tables are the same whether comm_dim or splice_dim
 	  sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
 	  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
+
 	  if ( sshift[0] == sshift[1] ) {
-	    Comms(point,dimension,shift,0x3);
 	    //	    std::cout<<"Comms 0x3"<<std::endl;
+	    Comms(point,dimension,shift,0x3);
 	  } else {
+	    //	    std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
 	    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 	    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
-	    //	    std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
 	  }
 	}
 	//	for(int ss=0;ss<osites;ss++){
@@ -211,7 +212,6 @@ namespace Grid {
 	  wraparound = 1;
 	}
 	  
-	  
 	int permute_slice=0;
 	if(permute_dim){
 	  int wrap = sshift/rd;
@@ -228,6 +228,7 @@ namespace Grid {
    void Comms     (int point,int dimension,int shiftpm,int cbmask)
    {
      GridBase *grid=_grid;
+      const int Nsimd = grid->Nsimd();
      
      int fd              = _grid->_fdimensions[dimension];
      int ld              = _grid->_ldimensions[dimension];
@@ -242,7 +243,8 @@ namespace Grid {
      assert(shift>=0);
      assert(shift<fd);

-      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
+      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
+      //      std::cout << " dim " <<dimension<<" buffersize "<<buffer_size<<std::endl;
      _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
                                           // send to one or more remote nodes.

@@ -252,11 +254,40 @@ namespace Grid {

      for(int x=0;x<rd;x++){       

-	int sx        =  (x+sshift)%rd;
-	int comm_proc = ((x+sshift)/rd)%pd;
-    	int offnode = (comm_proc!= 0);

-	//	std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
+	int permute_type=grid->PermuteType(dimension);
+
+	int sx        =  (x+sshift)%rd;
+	
+    	int offnode = 0;
+	if ( simd_layout > 1 ) {
+
+	  for(int i=0;i<Nsimd;i++){
+
+	    int inner_bit = (Nsimd>>(permute_type+1));
+	    int ic= (i&inner_bit)? 1:0;
+	    int my_coor          = rd*ic + x;
+	    int nbr_coor         = my_coor+sshift;
+	    int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
+
+	    if ( nbr_proc ) { 
+	      offnode =1;
+	    }
+	  }
+	  
+	} else { 
+	  int comm_proc = ((x+sshift)/rd)%pd;
+	  offnode = (comm_proc!= 0);
+	  //	  std::cout << "Stencil x "<<x<<" shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<< " comm_proc "<<comm_proc<<" pd "<< pd <<std::endl;
+	}
+
+
+	// Stencil x 1 shift 3 sshift 3 fd 8 rd 2 offnode 0 sx 0 comm_proc 0 pd 2
+	// x+sshift = 4
+	// x+sshift/2 = 2
+	// 2%2 == 0
+	// Problem: sshift is wrong in "rd" for SIMD directions. The complex logic in Cshift_mpi is needed.
+
 	int wraparound=0;
 	if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 	  wraparound = 1;
@@ -282,6 +313,7 @@ namespace Grid {

 	  int unified_buffer_offset = _unified_buffer_size;
 	  _unified_buffer_size    += words;
+	  //	  std::cout<< "Comms dim "<<dimension<<" offset "<<unified_buffer_offset<<" size "<<" " << _unified_buffer_size<<std::endl;
 	  ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
 	  
 	}
@@ -440,7 +472,7 @@ namespace Grid {
 		nosplicetime+=usecond();
 	      }
 	    } else {
-	      std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+	      //	      std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 	      if(splice_dim){
 		splicetime-=usecond();
 		GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
@@ -595,7 +627,6 @@ namespace Grid {

 	      for(int i=0;i<Nsimd;i++){

-
 		int inner_bit = (Nsimd>>(permute_type+1));
 		int ic= (i&inner_bit)? 1:0;

@@ -633,10 +664,11 @@ namespace Grid {
 		}
 	      }

-	      // Here we don't want to scatter, just place into a buffer.
+	      //	      std::cout << " CommsSimd ["<<dimension<<"] offset "<<u_comm_offset<<" buffsize "<<buffer_size  <<" unified  buffer size "<<_unified_buffer_size<<std::endl;
 	      mergetime-=usecond();
 PARALLEL_FOR_LOOP
 	      for(int i=0;i<buffer_size;i++){
+		//		std::cout<<"buffer loop " << i<<" "<<u_comm_offset+i<<" / "<<_unified_buffer_size<<std::endl;
 		//		assert(u_comm_offset+i<_unified_buffer_size);
 		merge(u_comm_buf[u_comm_offset+i],rpointers,i);
 	      }
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -176,6 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  scalar_type *pointer;
  scalar_type *vp = (scalar_type *)&vec;

+  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
+
  for(int w=0;w<words;w++){
    for(int i=0;i<Nextr;i++){
      for(int ii=0;ii<s;ii++){
--- a/tests/Test_stencil.cc
+++ b/tests/Test_stencil.cc
@@ -169,12 +169,13 @@ int main (int argc, char ** argv)
 	  ECheck.checkerboard = Odd;
 	  OCheck.checkerboard = Even;
 	}
+
 	// Implement a stencil code that should agree with that darn cshift!
 	for(int i=0;i<OCheck._grid->oSites();i++){
 	  int permute_type;
 	  StencilEntry *SE;
 	  SE = EStencil.GetEntry(permute_type,0,i);
-	  std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
+	  //	  std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;

 	  if ( SE->_is_local && SE->_permute )
 	    permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type);
@@ -187,7 +188,7 @@ int main (int argc, char ** argv)
 	  int permute_type;
 	  StencilEntry *SE;
 	  SE = OStencil.GetEntry(permute_type,0,i);
-	  std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
+	  //	  std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
 	  
 	  if ( SE->_is_local && SE->_permute )
 	    permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);