mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

commit 3906cd2149
parent 5a1fb29db7
Author: paboyle
Date:   2017-02-20 17:51:31 -05:00

    Stencil fix on BNL KNL system

2 changed files with 157 additions and 27 deletions

View File

@@ -29,19 +29,18 @@
 namespace Grid {
 
-void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
-                                        int off,std::vector<std::pair<int,int> > & table)
+void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+                                 int off,std::vector<std::pair<int,int> > & table)
 {
   table.resize(0);
-  int rd = grid->_rdimensions[dimension];
 
   if ( !grid->CheckerBoarded(dimension) ) {
     cbmask = 0x3;
   }
+  int rd = grid->_rdimensions[dimension];
   int so= plane*grid->_ostride[dimension]; // base offset for start of plane
   int e1=grid->_slice_nblock[dimension];
   int e2=grid->_slice_block[dimension];
   int stride=grid->_slice_stride[dimension];
 
   if ( cbmask == 0x3 ) {
     table.resize(e1*e2);
@@ -66,4 +65,5 @@ void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,i
       }
     }
   }
 }
+
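
Note on the idiom above: both files in this commit move the stencil gather onto precomputed index tables. The table maps each slot of the packed face buffer to a (possibly strided) site offset within the plane, so the access pattern is computed once and replayed on every halo exchange. The following standalone sketch uses invented names and a plain array in place of Grid's lattice container, and omits the checkerboard (cbmask) branch:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Build a table mapping packed face-buffer slots to strided plane sites.
    // e1/e2/stride mirror _slice_nblock/_slice_block/_slice_stride in the diff.
    void table_compute(int e1, int e2, int stride,
                       std::vector<std::pair<int,int> > &table) {
      table.resize(e1*e2);
      for (int n = 0; n < e1; n++) {
        for (int b = 0; b < e2; b++) {
          table[n*e2+b] = std::make_pair(n*e2+b,       // slot in the packed buffer
                                         n*stride+b);  // site offset within the plane
        }
      }
    }

    // Consume the cached table: one indirect read per site, contiguous writes,
    // matching the commented-out line  buffer[off+table[i].first] = ...
    template<class T>
    void gather_with_table(const std::vector<std::pair<int,int> > &table,
                           const T *lattice, int so, int off, T *buffer) {
      for (std::size_t i = 0; i < table.size(); i++) {
        buffer[off + table[i].first] = lattice[so + table[i].second];
      }
    }

The payoff is that the inner gather becomes a flat, trivially parallel copy loop, which is exactly what the PARALLEL_FOR_LOOP consumers in the header exploit.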

View File

@@ -29,7 +29,7 @@
 #define GRID_STENCIL_H
 
 #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
-
+#define NEW_XYZT_GATHER
 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
 // gather to a point stencil code. CSHIFT is not the best way, so need
@@ -68,7 +68,10 @@
 namespace Grid {
 
-void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
-                                        int off,std::vector<std::pair<int,int> > & table);
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+                                 int off,std::vector<std::pair<int,int> > & table);
 
 template<class vobj,class cobj,class compressor>
@@ -85,6 +88,95 @@ PARALLEL_FOR_LOOP
   }
 }
 
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+/*
+template<class cobj,class vobj,class compressor> double
+Gather_plane_exchange(const Lattice<vobj> &rhs,
+                      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
+{
+  int rd = rhs._grid->_rdimensions[dimension];
+  double t1,t2;
+
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+
+  int e1 =rhs._grid->_slice_nblock[dimension];
+  int e2 =rhs._grid->_slice_block [dimension];
+  int n1 =rhs._grid->_slice_stride[dimension];
+
+  // Need to switch to a table loop
+  std::vector<std::pair<int,int> > table;
+
+  if ( cbmask ==0x3){
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int o      = n*n1;
+        int offset = b+n*e2;
+        table.push_back(std::pair<int,int> (offset,o+b));
+      }
+    }
+  } else {
+    // Case of SIMD split AND checker dim cannot currently be hit, except in
+    // Test_cshift_red_black code.
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int o=n*n1;
+        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+        int offset = b+n*e2;
+        if ( ocb & cbmask ) {
+          table.push_back(std::pair<int,int> (offset,o+b));
+        }
+      }
+    }
+  }
+
+  assert( (table.size()&0x1)==0);
+  t1=usecond();
+PARALLEL_FOR_LOOP
+  for(int j=0;j<table.size()/2;j++){
+    //  buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
+    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
+    cobj temp3;
+    cobj temp4;
+    exchange(temp3,temp4,temp1,temp2,type);
+    vstream(pointers[0][j],temp3);
+    vstream(pointers[1][j],temp4);
+  }
+  t2=usecond();
+  return t2-t1;
+}
+*/
+
+template<class cobj,class vobj,class compressor>
+void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
+                                 std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
+
+template<class cobj,class vobj,class compressor>
+void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+                                 std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
+                                 compressor &compress,int type)
+{
+  assert( (table.size()&0x1)==0);
+  int num=table.size()/2;
+  int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+PARALLEL_FOR_LOOP
+  for(int j=0;j<num;j++){
+    //  buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
+    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
+    cobj temp3;
+    cobj temp4;
+    exchange(temp3,temp4,temp1,temp2,type);
+    vstream(pointers[0][j],temp3);
+    vstream(pointers[1][j],temp4);
+  }
+}
+
 struct StencilEntry {
   uint64_t _offset;
   uint64_t _byte_offset;
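
The new Gather_plane_exchange_table leans on Grid's exchange(t3,t4,t1,t2,type) primitive: two gathered vector objects are split so that one output carries the combined low SIMD halves and the other the combined high halves, each streamed to its own destination pointer. The stand-in below is illustrative only; the real primitive permutes lanes at a granularity selected by the permute type, and Vec/W are invented names:

    #include <cstddef>
    #include <utility>
    #include <vector>

    const int W = 4;   // stand-in for the SIMD width implied by 'type'

    struct Vec { double lane[W]; };

    // One plausible lane assignment: out_lo gathers the low halves of a and b,
    // out_hi the high halves. Meant only to show the shape of the data movement.
    void exchange_pair(Vec &out_lo, Vec &out_hi, const Vec &a, const Vec &b) {
      for (int l = 0; l < W/2; l++) {
        out_lo.lane[l]       = a.lane[l];
        out_lo.lane[l + W/2] = b.lane[l];
        out_hi.lane[l]       = a.lane[l + W/2];
        out_hi.lane[l + W/2] = b.lane[l + W/2];
      }
    }

    // The table loop pairs consecutive entries, exchanges them, and streams
    // the two halves to the two destination pointers, as in the diff.
    void gather_exchange(const std::vector<Vec> &sites,
                         const std::vector<std::pair<int,int> > &table,
                         int so, Vec *lo_buf, Vec *hi_buf) {
      for (std::size_t j = 0; j < table.size()/2; j++) {
        exchange_pair(lo_buf[j], hi_buf[j],
                      sites[so + table[2*j].second],
                      sites[so + table[2*j+1].second]);
      }
    }

Pairing consecutive table entries is also why the code asserts that the table length is even.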
@@ -129,7 +221,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     p.to_rank  = to;
     p.from_rank= from;
     p.bytes    = bytes;
-    comms_bytes+=2.0*bytes;
     Packets.push_back(p);
   }
 
@@ -138,23 +229,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromBegin(reqs[i],
+      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
                                         Packets[i].send_buf,
                                         Packets[i].to_rank,
                                         Packets[i].recv_buf,
                                         Packets[i].from_rank,
                                         Packets[i].bytes);
+      if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySendrecv ) {
+        _grid->StencilSendToRecvFromComplete(reqs[i]);
+      }
     }
     commtime+=usecond();
   }
 
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     commtime-=usecond();
-    for(int i=0;i<Packets.size();i++){
+    if( _grid->CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicyIsend ) {
+      for(int i=0;i<Packets.size();i++){
         _grid->StencilSendToRecvFromComplete(reqs[i]);
+      }
     }
     _grid->StencilBarrier();// Synch shared memory on a single nodes
     commtime+=usecond();
+    /*
     int dump=1;
     if(dump){
       for(int i=0;i<Packets.size();i++){
@@ -175,7 +272,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
         }
       }
       dump =0;
-
+      */
   }
 
   ///////////////////////////////////////////
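
The effect of the two policy branches above: under CommunicatorPolicySendrecv each packet is completed immediately inside CommunicateBegin, so the transfer is effectively blocking; under CommunicatorPolicyIsend the requests stay in flight and are only drained in CommunicateComplete. In bare MPI terms the two patterns look roughly like the sketch below (an analogy, not Grid's StencilSendToRecvFrom implementation):

    #include <mpi.h>
    #include <vector>

    // Blocking policy: the exchange is complete when the call returns.
    void exchange_blocking(void *snd, void *rcv, int bytes, int to, int from) {
      MPI_Sendrecv(snd, bytes, MPI_CHAR, to,   0,
                   rcv, bytes, MPI_CHAR, from, 0,
                   MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    // Non-blocking policy: begin posts the requests ...
    void exchange_begin(void *snd, void *rcv, int bytes, int to, int from,
                        std::vector<MPI_Request> &reqs) {
      reqs.resize(2);
      MPI_Irecv(rcv, bytes, MPI_CHAR, from, 0, MPI_COMM_WORLD, &reqs[0]);
      MPI_Isend(snd, bytes, MPI_CHAR, to,   0, MPI_COMM_WORLD, &reqs[1]);
    }

    // ... and a later complete step waits on them, overlapping compute.
    void exchange_complete(std::vector<MPI_Request> &reqs) {
      MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
    }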
@@ -310,10 +407,10 @@ PARALLEL_FOR_LOOP
       // depending on comms target
       cobj* u_recv_buf_p;
       cobj* u_send_buf_p;
-      std::vector<scalar_object *> u_simd_send_buf;
-      std::vector<scalar_object *> u_simd_recv_buf;
       std::vector<cobj *> new_simd_send_buf;
       std::vector<cobj *> new_simd_recv_buf;
+      std::vector<scalar_object *> u_simd_send_buf;
+      std::vector<scalar_object *> u_simd_recv_buf;
 
       int u_comm_offset;
       int _unified_buffer_size;
@@ -358,6 +455,10 @@ PARALLEL_FOR_LOOP
   void Report(void) {
 #define PRINTIT(A) \
     std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
+
+    RealD NP = _grid->_Nprocessors;
+    RealD NN = _grid->NodeCount();
+
     if ( calls > 0. ) {
       std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
       PRINTIT(halogtime);
@@ -367,7 +468,8 @@ PARALLEL_FOR_LOOP
       if(comms_bytes>1.0){
         PRINTIT(comms_bytes);
         PRINTIT(commtime);
-        std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl;
+        std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
+        std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
       }
       PRINTIT(jointime);
       PRINTIT(spintime);
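
Units in the two bandwidth lines: commtime accumulates microseconds, so comms_bytes/commtime is bytes per microsecond, i.e. MB/s, and the division by 1000 yields GB/s for one rank; scaling by NP/NN (ranks per node) converts that to a per-node figure. A quick check with made-up numbers:

    #include <cstdio>

    int main() {
      // Hypothetical figures, only to check the units used in Report():
      double comms_bytes = 2.0e9;   // bytes moved by this rank
      double commtime    = 1.0e6;   // accumulated comms time in microseconds
      double NP = 64, NN = 16;      // MPI ranks and nodes (4 ranks per node)

      double per_rank = comms_bytes/commtime/1000.;  // 2000 MB/s -> 2 GB/s
      double per_node = per_rank*NP/NN;              // 4 ranks share a node -> 8 GB/s
      printf("%.1f GB/s per rank, %.1f GB/s per node\n", per_rank, per_node);
      return 0;
    }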
@@ -465,12 +567,17 @@ PARALLEL_FOR_LOOP
       new_simd_recv_buf.resize(Nsimd);
 
       u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
       u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
-      for(int l=0;l<Nsimd;l++){
-        u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
-        u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+#ifdef NEW_XYZT_GATHER
+      for(int l=0;l<2;l++){
         new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
         new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
       }
+#else
+      for(int l=0;l<Nsimd;l++){
+        u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+        u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+      }
+#endif
 
       PrecomputeByteOffsets();
     }
@@ -740,7 +847,11 @@ PARALLEL_FOR_LOOP
           splicetime-=usecond();
           //      GatherSimd(source,dimension,shift,0x3,compress,face_idx);
           //      std::cout << "GatherSimdNew"<<std::endl;
+#ifdef NEW_XYZT_GATHER
           GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
+#else
+          GatherSimd(source,dimension,shift,0x3,compress,face_idx);
+#endif
           splicetime+=usecond();
         } else {
           nosplicetime-=usecond();
@@ -751,8 +862,13 @@ PARALLEL_FOR_LOOP
         if(splice_dim){
           splicetime-=usecond();
           //      std::cout << "GatherSimdNew2calls"<<std::endl;
+#ifdef NEW_XYZT_GATHER
           GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
           GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
+#else
+          GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
+          GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
+#endif
           splicetime+=usecond();
         } else {
           nosplicetime-=usecond();
@@ -829,12 +945,13 @@ PARALLEL_FOR_LOOP
       if ( !face_table_computed ) {
         t_table-=usecond();
         face_table.resize(face_idx+1);
-        Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
+        Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
+        //      std::cout << " face table size "<<face_idx <<" " << face_table[face_idx].size() <<" computed buffer size "<< words <<
+        //                " bytes = " << bytes <<std::endl;
         t_table+=usecond();
       }
 
-      int rank           = _grid->_processor;
+      int rank = _grid->_processor;
       int recv_from_rank;
       int xmit_to_rank;
       _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
@@ -845,8 +962,6 @@ PARALLEL_FOR_LOOP
       /////////////////////////////////////////////////////////
       // try the direct copy if possible
       /////////////////////////////////////////////////////////
-
-
       cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
       if ( send_buf==NULL ) {
         send_buf = u_send_buf_p;
@@ -1003,7 +1118,7 @@ PARALLEL_FOR_LOOP
     assert(simd_layout==maxl);
     assert(shift>=0);
     assert(shift<fd);
 
-      int permute_type=_grid->PermuteType(dimension);
+    int permute_type=_grid->PermuteType(dimension);
 
     //    std::cout << "SimdNew permute type "<<permute_type<<std::endl;
@@ -1015,8 +1130,11 @@ PARALLEL_FOR_LOOP
     assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 
-    int bytes = (buffer_size*sizeof(cobj))/simd_layout;
-    assert(bytes*simd_layout == buffer_size*sizeof(cobj));
+    int reduced_buffer_size = buffer_size;
+    if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1;
+
+    int bytes = (reduced_buffer_size*sizeof(cobj))/simd_layout;
+    assert(bytes*simd_layout == reduced_buffer_size*sizeof(cobj));
 
     std::vector<cobj *> rpointers(maxl);
     std::vector<cobj *> spointers(maxl);
@@ -1034,15 +1152,28 @@ PARALLEL_FOR_LOOP
     int any_offnode = ( ((x+sshift)%fd) >= rd );
 
     if ( any_offnode ) {
 
       for(int i=0;i<maxl;i++){
         spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
       }
 
       int sx   = (x+sshift)%rd;
-      gathermtime+=Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
+
+      //      if ( cbmask==0x3 ) {
+      //      std::vector<std::pair<int,int> > table;
+      t_table-=usecond();
+      if ( !face_table_computed ) {
+        face_table.resize(face_idx+1);
+        Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]);
+        //      std::cout << " face table size "<<face_idx <<" " << face_table[face_idx].size() <<" computed buffer size "<< reduced_buffer_size <<
+        //                " bytes = "<<bytes <<std::endl;
+      }
+      t_table+=usecond();
+
+      gathermtime-=usecond();
+      Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++;
+      gathermtime+=usecond();
 
       //spointers[0] -- low
       //spointers[1] -- high
@@ -1089,13 +1220,12 @@ PARALLEL_FOR_LOOP
         }
       }
 
-      AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type);
+      AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,Packets.size()-1,permute_type);
 
       u_comm_offset     +=buffer_size;
     }
   }
 
 };
 }