Merge branch 'master' of github.com:paboyle/Grid

2026-07-21 19:13:28 +01:00 · 2015-11-04 05:14:26 -06:00
parent f87526a04f 9183920e8b
commit dfc1de6f60
13 changed files with 260 additions and 99 deletions
@@ -124,6 +124,7 @@ namespace Grid {
 	  if ( comm_dim ) {
 	    sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
 	    sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
+	    //	    std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 	    if ( sshift[0] == sshift[1] ) {
 	      if (splice_dim) {
 		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
@@ -164,23 +165,23 @@ namespace Grid {
 	  assert(comm_dim==1);
 	  assert(shift>=0);
 	  assert(shift<fd);
-	  
+
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
-	  
+
 	  std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
 	  std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
-	  
+
 	  int cb= (cbmask==0x2)? Odd : Even;
 	  int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
-	  
+
 	  for(int x=0;x<rd;x++){       
-	    
+
 	    int sx        = (x+sshift)%rd;
 	    int comm_proc = ((x+sshift)/rd)%pd;

 	    if (comm_proc) {
 	      
-	      int words = send_buf.size();
+	      int words = buffer_size;
 	      if (cbmask != 0x3) words=words>>1;
 	    
 	      int bytes = words * sizeof(cobj);
@@ -201,10 +202,11 @@ namespace Grid {
 				   recv_from_rank,
 				   bytes);

-	      for(int i=0;i<buffer_size;i++){
+	      for(int i=0;i<words;i++){
 		u_comm_buf[u_comm_offset+i]=recv_buf[i];
+		//		std::cout << " Halo["<<i<<"] snd "<<send_buf[i]<< " rcv "<<recv_buf[i]<<"  mask 0x"<<cbmask<<std::endl;
 	      }
-	      u_comm_offset+=buffer_size;
+	      u_comm_offset+=words;
 	    }
 	  }
 	}
@@ -241,6 +243,7 @@ namespace Grid {
 	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 	  int words = sizeof(cobj)/sizeof(vector_type);

+	  assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 	  /*
 	   * possibly slow to allocate
 	   * Doesn't matter in this test, but may want to preallocate in the 
@@ -29,14 +29,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
  
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
+  int bo=0;
+    //PARALLEL_NESTED_LOOP2 -- disabled: bo is now a running counter carried across iterations (buffer[bo++])
  for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o  = n*rhs._grid->_slice_stride[dimension];
-      int bo = n*rhs._grid->_slice_block[dimension];
+      //      int bo = n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb &cbmask ) {
-	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
+	buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
      }
    }
  }
@@ -59,7 +60,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_

  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
+  //PARALLEL_NESTED_LOOP2
  for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){

@@ -109,14 +110,15 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
    
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
-PARALLEL_NESTED_LOOP2
+  int bo=0;
+  //PARALLEL_NESTED_LOOP2 -- disabled: bo is now a running counter carried across iterations (buffer[bo++])
  for(int n=0;n<e1;n++){
    for(int b=0;b<e2;b++){
      int o   =n*rhs._grid->_slice_stride[dimension];
-      int bo  =n*rhs._grid->_slice_block[dimension];
+      //      int bo  =n*rhs._grid->_slice_block[dimension];
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
      if ( ocb & cbmask ) {
-	rhs._odata[so+o+b]=buffer[bo+b];
+	rhs._odata[so+o+b]=buffer[bo++];
      }
    }
  }
@@ -9,7 +9,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

-  Lattice<vobj> ret(rhs._grid);
+  Lattice<vobj> ret(rhs._grid); 
  
  int fd = rhs._grid->_fdimensions[dimension];
  int rd = rhs._grid->_rdimensions[dimension];
@@ -26,10 +26,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension


  if ( !comm_dim ) {
+    //    std::cout << "Cshift_local" <<std::endl;
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
  } else if ( splice_dim ) {
+    //    std::cout << "Cshift_comms_simd" <<std::endl;
    Cshift_comms_simd(ret,rhs,dimension,shift);
  } else {
+    //    std::cout << "Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift);
  }
  return ret;
@@ -42,9 +45,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

+  //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+
  if ( sshift[0] == sshift[1] ) {
+    //    std::cout << "Single pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x3);
  } else {
+    //    std::cout << "Two pass Cshift_comms" <<std::endl;
    Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
    Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
  }
@@ -113,12 +120,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

+
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
 			   (void *)&recv_buf[0],
 			   recv_from_rank,
 			   bytes);

+      //      for(int i=0;i<words;i++){
+      //	std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl;
+      //      }
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
    }
  }
@@ -82,7 +82,8 @@ namespace QCD {
  template<class Impl>
  void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
    out.checkerboard = in.checkerboard;
-    out = (4.0+mass)*in;
+    typename FermionField::scalar_type scal(4.0+mass); // build the factor (4.0+mass) in the field's own scalar type
+    out = scal*in; // same scaling of `in` as before, now multiplying by the typed scalar
  }
  
  template<class Impl>
@@ -78,7 +78,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
  }
  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
  accumReconXm(result,Uchi);
-  
+
  // Ym
  SE=st.GetEntry(ptype,Ym,sF);
  if (  SE->_is_local && SE->_permute ) {
@@ -524,16 +524,22 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
  // reunitarise??
  static void LieRandomize(GridParallelRNG     &pRNG,LatticeMatrix &out,double scale=1.0){
    GridBase *grid = out._grid;
+
    LatticeComplex ca (grid);
    LatticeMatrix  lie(grid);
    LatticeMatrix  la (grid);
    Complex ci(0.0,scale);
+    Complex cone(1.0,0.0);
    Matrix ta;

    lie=zero;
    for(int a=0;a<generators();a++){

-      random(pRNG,ca); ca=real(ca)-0.5;
+      random(pRNG,ca); 
+
+      ca = (ca+conjugate(ca))*0.5;
+      ca = ca - 0.5;
+
      generator(a,ta);

      la=ci*ca*ta;
@@ -9,10 +9,10 @@

 #include <immintrin.h>

-#ifndef KNC_ONLY_STORES
-#define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
-#define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
-#endif
+//#ifndef KNC_ONLY_STORES
+//#define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
+//#define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
+//#endif


 namespace Optimization {
@@ -8,7 +8,7 @@ namespace Grid {
 				     int checkerboard,
 				     const std::vector<int> &directions,
 				     const std::vector<int> &distances) 
-    :   _entries(npoints), _permute_type(npoints)
+    :   _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
    {
      _npoints = npoints;
      _grid    = grid;
@@ -61,11 +61,17 @@ namespace Grid {
 	  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 	  if ( sshift[0] == sshift[1] ) {
 	    Comms(point,dimension,shift,0x3);
+	    //	    std::cout<<"Comms 0x3"<<std::endl;
 	  } else {
 	    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 	    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
+	    //	    std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
 	  }
 	}
+	//	for(int ss=0;ss<osites;ss++){
+	  //	  std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
+	  //	    _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
+	//	}
      }
    }

@@ -139,13 +145,14 @@ namespace Grid {
      int cb= (cbmask==0x2)? Odd : Even;
      int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
      
-      for(int x=0;x<rd;x++){       
-	
-	int offnode = (((x+sshift)%fd) >= rd ); 
-	//	int comm_proc   = ((x+sshift)/ld)%pd;        
-	//	int offnode     = (comm_proc!=0);
-	int sx          = (x+sshift)%rd;

+      for(int x=0;x<rd;x++){       
+
+	int sx        =  (x+sshift)%rd;
+	int comm_proc = ((x+sshift)/rd)%pd;
+    	int offnode = (comm_proc!= 0);
+
+	//	std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
 	int wraparound=0;
 	if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 	  wraparound = 1;
@@ -249,7 +256,7 @@ namespace Grid {
 	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
 	int o   = 0;                                      // relative offset to base within plane
 	int bo  = 0;                                      // offset in buffer
-    
+
 	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 	  for(int b=0;b<_grid->_slice_block[dimension];b++){