Fixed the stencil sector; Wilson now agrees between the stencil-based implementation

and the cshift-based implementation. Managed to reduce the volume of code in this sector a little, but further consolidation would be good, perhaps by factoring common logic out into simple helper functions.
2025-07-27 17:57:08 +01:00 · 2015-04-29 06:23:56 +01:00
parent b0485894b3
commit dcc23faa4a
4 changed files with 17 additions and 51 deletions
--- a/lib/Grid_stencil.h
+++ b/lib/Grid_stencil.h
@@ -39,13 +39,6 @@

 namespace Grid {

-  struct CommsRequest { 
-    int words;
-    int unified_buffer_offset;
-    int tag;
-    int to_rank;
-    int from_rank;
-  } ;


  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
@@ -69,7 +62,6 @@ namespace Grid {
      int _unified_buffer_size;
      int _request_count;

-      std::vector<CommsRequest>  CommsRequests;

      CartesianStencil(GridBase *grid,
 		       int npoints,
@@ -90,7 +82,6 @@ namespace Grid {
      template<class vobj,class cobj, class compressor> void 
 	HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
      {
-	std::cout<< "HaloExchange comm_buf.size()="<< u_comm_buf.size()<<" unified_buffer_size"<< _unified_buffer_size<< std::endl;
 	// conformable(source._grid,_grid);
 	assert(source._grid==_grid);
 	if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
@@ -141,7 +132,6 @@ namespace Grid {
 	    }
 	  }
 	}
-	std::cout<< "HaloExchange complete"<< std::endl;
      }

      template<class vobj,class cobj, class compressor> 
@@ -194,24 +184,18 @@ namespace Grid {
 	      _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
 	      assert (xmit_to_rank != _grid->ThisRank());
 	      assert (recv_from_rank != _grid->ThisRank());
+
 	      //      FIXME Implement asynchronous send & also avoid buffer copy
 	      _grid->SendToRecvFrom((void *)&send_buf[0],
 				   xmit_to_rank,
 				   (void *)&recv_buf[0],
 				   recv_from_rank,
 				   bytes);
-	      printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);

-	      printf("GatherStartComms inserting %le to u_comm_offset %d buf size %d for dim %d shift %d\n",
-		     *( (RealF *) &recv_buf[0]),
-		     u_comm_offset,buffer_size,
-		     dimension,shift
-		     ); fflush(stdout);
 	      for(int i=0;i<buffer_size;i++){
 		u_comm_buf[u_comm_offset+i]=recv_buf[i];
 	      }
 	      u_comm_offset+=buffer_size;
-	      printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
 	    }
 	  }
 	}
@@ -248,7 +232,7 @@ namespace Grid {
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 	  int words = sizeof(cobj)/sizeof(vector_type);

-	  /*   FIXME ALTERNATE BUFFER DETERMINATION */
+	  /* FIXME ALTERNATE BUFFER DETERMINATION ; possibly slow to allocate*/
 	  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) ); 
 	  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 	  int bytes = buffer_size*sizeof(scalar_object);
@@ -267,25 +251,21 @@ namespace Grid {
 	  for(int x=0;x<rd;x++){       

 	    int any_offnode = ( ((x+sshift)%fd) >= rd );
-	    std::cout<<"any_offnode ="<<any_offnode<<std::endl;
+
 	    if ( any_offnode ) {
-	      // FIXME call local permute copy if none are offnode.
+
 	      for(int i=0;i<Nsimd;i++){       
 		pointers[i] = &send_buf_extract[i][0];
 	      }
 	      int sx   = (x+sshift)%rd;
 	      
-	      std::cout<< "Gathering "<< x <<std::endl;
 	      Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
-	      std::cout<< "Gathered "<<std::endl;
+
 	      for(int i=0;i<Nsimd;i++){
 		
-		std::vector<int> icoor;
-		_grid->iCoorFromIindex(icoor,i);

 		int inner_bit = (Nsimd>>(permute_type+1));
 		int ic= (i&inner_bit)? 1:0;
-		assert(ic==icoor[dimension]);

 		int my_coor          = rd*ic + x;
 		int nbr_coor         = my_coor+sshift;
@@ -301,12 +281,9 @@ namespace Grid {
 		if (nbr_ic) nbr_lane|=inner_bit;
 		assert (sx == nbr_ox);

-   std::cout<<"nbr_proc "<<nbr_proc<< " x "<<x<<" nbr_x "<<nbr_ox << " lane "<<i << " nbr_lane "<<nbr_lane
-	    << " nbr_ic "<<nbr_ic  << " mycoor "<< my_coor<< " nbr_coor "<<nbr_coor<<std::endl;
 		
 		if(nbr_proc){
 		  
-		  std::cout<< "MPI sending "<<std::endl;
 		  _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 		  
 		  _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
@@ -314,23 +291,20 @@ namespace Grid {
 					(void *)&recv_buf_extract[i][0],
 					recv_from_rank,
 					bytes);
-		  std::cout<< "MPI complete "<<std::endl;
 		  
 		  rpointers[i] = &recv_buf_extract[i][0];
-		  std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
+
 		} else { 
 		  rpointers[i] = &send_buf_extract[nbr_lane][0];
-		  std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
 		}
 	      }

 	      // Here we don't want to scatter, just place into a buffer.
-	      std::cout<< "merging u_comm_offset "<< u_comm_offset<<" comm_buf_size" << u_comm_buf.size() <<std::endl;
-
 	      for(int i=0;i<buffer_size;i++){
 		assert(u_comm_offset+i<_unified_buffer_size);
 		merge(u_comm_buf[u_comm_offset+i],rpointers,i);
 	      }
+
 	      u_comm_offset+=buffer_size;
 	    }
 	  }
--- a/lib/qcd/Grid_qcd_wilson_dop.cc
+++ b/lib/qcd/Grid_qcd_wilson_dop.cc
@@ -24,7 +24,6 @@ const int WilsonMatrix::Tm = 7;
    
    void Point(int p) { 
      mu=p;
-      std::cout << "WilsonCompressor.Point " << mu<<std::endl;
    };

    vHalfSpinColourVector operator () (const vSpinColourVector &in)
@@ -193,7 +192,6 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 	chi_p = &tmp;
      }
    }
-    std::cout<<"Xm for site  "<<ss<<" l "<<local<<" p "<<perm<<" chi "<<Reduce(TensorRemove(innerProduct(*chi_p,*chi_p)))<<std::endl;
    mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
    accumReconXm(result,Uchi);

--- a/lib/stencil/Grid_stencil_common.cc
+++ b/lib/stencil/Grid_stencil_common.cc
@@ -19,7 +19,6 @@ namespace Grid {
      _distances  = distances;
      _unified_buffer_size=0;
      _request_count =0;
-      CommsRequests.resize(0);

      int osites  = _grid->oSites();

@@ -117,6 +116,7 @@ namespace Grid {
      GridBase *grid=_grid;
      
      int fd              = _grid->_fdimensions[dimension];
+      int ld              = _grid->_ldimensions[dimension];
      int rd              = _grid->_rdimensions[dimension];
      int pd              = _grid->_processors[dimension];
      int simd_layout     = _grid->_simd_layout[dimension];
@@ -137,9 +137,10 @@ namespace Grid {
      
      for(int x=0;x<rd;x++){       
 	
-	int comm_proc = ((x+sshift)/rd)%pd;
-	int offnode =   (comm_proc!=0);
-	int sx        = (x+sshift)%rd;
+	int offnode = (((x+sshift)%fd) >= rd ); 
+	//	int comm_proc   = ((x+sshift)/ld)%pd;        
+	//	int offnode     = (comm_proc!=0);
+	int sx          = (x+sshift)%rd;

 	if (!offnode) {
 	  
@@ -157,17 +158,9 @@ namespace Grid {
 	  int recv_from_rank;
 	  int xmit_to_rank;

-	  CommsRequest cr;
-
-	  cr.tag                   = _request_count++;
-	  cr.words                 = words;
-	  cr.unified_buffer_offset = _unified_buffer_size;
+	  int unified_buffer_offset = _unified_buffer_size;
 	  _unified_buffer_size    += words;
-	  grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
-
-	  CommsRequests.push_back(cr);
-
-	  ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
+	  ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset); // permute/extract/merge is done in comms phase
 	  
 	}
      }
--- a/tests/Grid_wilson.cc
+++ b/tests/Grid_wilson.cc
@@ -21,7 +21,7 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);

  std::vector<int> simd_layout({1,1,2,2});
-  std::vector<int> mpi_layout ({2,1,1,2});
+  std::vector<int> mpi_layout ({2,2,2,2});
  std::vector<int> latt_size  ({8,8,8,8});
    
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
@@ -76,7 +76,8 @@ int main (int argc, char ** argv)
  std::cout << "norm result "<< norm2(result)<<std::endl;
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;

-  for(int ss=0;ss<10;ss++ ){
+  //  for(int ss=0;ss<10;ss++ ){
+  for(int ss=0;ss<0;ss++ ){
    for(int i=0;i<Ns;i++){
      for(int j=0;j<Nc;j++){
 	ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);