From 6f5a5cd9b3269932a720804aebe8b7046d4b68fe Mon Sep 17 00:00:00 2001
From: paboyle
Date: Wed, 28 Jun 2017 23:27:02 +0100
Subject: [PATCH] Improved threaded comms benchmark

---
 TODO                          | 11 ++--
 benchmarks/Benchmark_comms.cc | 94 +++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/TODO b/TODO
index 001c6c0c..3d29215e 100644
--- a/TODO
+++ b/TODO
@@ -2,10 +2,13 @@ TODO:
 ---------------
 Large item work list:
 
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+1)- I/O; There appear to be issues with MPI IO and NERSC with large files.
+    Possible 2GB limit reappeared. GPFS driver in Intel MPI.
+
+2)- BG/Q port and check
+
+3)- Christoph's local basis expansion Lanczos; port to use Lattice_transfer features
 
-2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
 4)- Precision conversion and sort out localConvert <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
 5)- Physical propagator interface
@@ -14,6 +17,8 @@ Large item work list:
 8)- HDCR resume
 
 Recent DONE
+
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O. <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location <-- DONE
 -- Scidac and Ildg metadata handling <-- DONE
diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 753b8a58..698f9d25 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -435,6 +435,100 @@ int main (int argc, char ** argv)
     }
   }
 
+
+
+
+  std::cout<
+
+  for(int lat=4;lat<=maxlat;lat+=4){
+  for(int Ls=8;Ls<=8;Ls*=2){
+
+     std::vector<int> latt_size  ({lat*mpi_layout[0],
+                                   lat*mpi_layout[1],
+                                   lat*mpi_layout[2],
+                                   lat*mpi_layout[3]});
+
+     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+     RealD Nrank = Grid._Nprocessors;
+     RealD Nnode = Grid.NodeCount();
+     RealD ppn = Nrank/Nnode;
+
+     std::vector<HalfSpinColourVectorD *> xbuf(8);
+     std::vector<HalfSpinColourVectorD *> rbuf(8);
+     Grid.ShmBufferFreeAll();
+     for(int d=0;d<8;d++){
+       xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+       rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+       bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+       bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+     }
+
+     int ncomm;
+     int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+     double dbytes;
+     for(int i=0;i<Nloop;i++){
+       double start=usecond();
+
+       std::vector<CartesianCommunicator::CommsRequest_t> requests;
+       dbytes=0;
+       ncomm=0;
+
+       parallel_for(int dir=0;dir<8;dir++){
+
+         double tbytes;
+         int mu =dir % 4;
+
+         if (mpi_layout[mu]>1 ) {
+
+           ncomm++;
+           int xmit_to_rank;
+           int recv_from_rank;
+           if ( dir == mu ) {
+             int comm_proc=1;
+             Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+           } else {
+             int comm_proc = mpi_layout[mu]-1;
+             Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+           }
+           tbytes= Grid.StencilSendToRecvFromBegin(requests,
+                                                   (void *)&xbuf[dir][0],
+                                                   xmit_to_rank,
+                                                   (void *)&rbuf[dir][0],
+                                                   recv_from_rank,
+                                                   bytes,dir);
+           Grid.StencilSendToRecvFromComplete(requests,dir);
+           requests.resize(0);
+
+#pragma omp atomic
+           dbytes+=tbytes;
+         }
+       }
+       Grid.Barrier();
+       double stop=usecond();
+       t_time[i] = stop-start; // microseconds
+     }
+
+     timestat.statistics(t_time);
+
+     dbytes=dbytes*ppn;
+     double xbytes    = dbytes*0.5;
+     double rbytes    = dbytes*0.5;
+     double bidibytes = dbytes;
+
+
+     std::cout<
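
Note on the neighbour mapping in the added benchmark loop: each direction dir in [0,8) addresses dimension mu = dir % 4, with dir == mu taking the forward neighbour (shift +1) and dir == mu+4 taking the backward neighbour (shift mpi_layout[mu]-1, i.e. -1 with periodic wrap); dimensions with mpi_layout[mu] == 1 are skipped. The sketch below illustrates only that mapping, outside Grid. It assumes a lexicographic rank ordering on the process grid, which is an assumption; Grid's CartesianCommunicator::ShiftedRanks may order ranks differently.

  #include <cstdio>
  #include <vector>

  // Hypothetical stand-in for CartesianCommunicator::ShiftedRanks: maps a
  // process-grid coordinate to a rank, assuming lexicographic ordering.
  static int rank_of(const std::vector<int> &coor, const std::vector<int> &layout) {
    int rank = 0;
    for (int d = (int)layout.size() - 1; d >= 0; d--) rank = rank * layout[d] + coor[d];
    return rank;
  }

  int main(void) {
    std::vector<int> mpi_layout = {2, 2, 2, 2};  // example 16-rank decomposition
    std::vector<int> me         = {0, 0, 0, 0};  // this rank's coordinate

    for (int dir = 0; dir < 8; dir++) {
      int mu = dir % 4;
      if (mpi_layout[mu] == 1) continue;         // no comms in unpartitioned dims
      // dir < 4: shift by +1; dir >= 4: shift by layout-1, i.e. -1 modulo layout
      int comm_proc = (dir == mu) ? 1 : mpi_layout[mu] - 1;

      std::vector<int> to = me, from = me;
      to[mu]   = (me[mu] + comm_proc) % mpi_layout[mu];                   // xmit_to_rank
      from[mu] = (me[mu] - comm_proc + mpi_layout[mu]) % mpi_layout[mu];  // recv_from_rank

      printf("dir %d: mu %d  send -> rank %d  recv <- rank %d\n",
             dir, mu, rank_of(to, mpi_layout), rank_of(from, mpi_layout));
    }
    return 0;
  }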
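The parallel_for over directions is an OpenMP worksharing loop in Grid, so each direction's send/receive pair is driven by its own thread and the per-thread byte counts are folded into a shared total under #pragma omp atomic. Below is a stripped-down sketch of that timing-and-accumulation pattern; do_halo_exchange is a hypothetical placeholder standing in for StencilSendToRecvFromBegin/Complete, and the byte count is a made-up example.

  #include <cstdio>
  #include <omp.h>

  // Hypothetical stand-in for the per-direction stencil exchange; returns the
  // number of bytes it moved. In the benchmark this role is played by
  // StencilSendToRecvFromBegin/Complete on shared-memory-window buffers.
  static double do_halo_exchange(int dir, double bytes) {
    (void)dir;
    return bytes;
  }

  int main(void) {
    const int    ndir  = 8;       // +/- neighbours in 4 dimensions
    const double bytes = 3.1e6;   // example per-direction message size
    double dbytes = 0;
    int    ncomm  = 0;

    double start = omp_get_wtime();
  #pragma omp parallel for
    for (int dir = 0; dir < ndir; dir++) {
      double tbytes = do_halo_exchange(dir, bytes);  // one thread per direction
  #pragma omp atomic
      dbytes += tbytes;                              // race-free shared accumulation
  #pragma omp atomic
      ncomm++;
    }
    double stop = omp_get_wtime();

    // the benchmark keeps its times in microseconds (usecond()); same here
    printf("%d exchanges, %.0f bytes in %.3f us\n",
           ncomm, dbytes, (stop - start) * 1.0e6);
    return 0;
  }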
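On the reporting: the per-direction message is bytes = lat^3 * Ls * sizeof(HalfSpinColourVectorD); taking a half spinor as 2 spin x 3 colour complex doubles (96 bytes per site) gives roughly 3.1 MB per direction at lat=16, Ls=8. The accumulated dbytes is scaled by ranks-per-node (ppn), half of it is counted as send traffic (xbytes, with rbytes the matching receive half) and the full amount as bidirectional, and the rates come out directly in MB/s because the times are in microseconds. The sketch below reproduces that arithmetic with a stand-in for Grid's time_statistics; the mean/err/min/max members, the standard-error formula, and the error propagation onto the rate are assumptions about its interface, and the timings are made up.

  #include <algorithm>
  #include <cmath>
  #include <cstdio>
  #include <vector>

  // Hypothetical stand-in for Grid's time_statistics: mean, standard error,
  // min and max of the per-iteration timings (microseconds).
  struct time_statistics_sketch {
    double mean, err, min, max;
    void statistics(const std::vector<double> &t) {
      mean = 0; for (double x : t) mean += x; mean /= t.size();
      double var = 0; for (double x : t) var += (x - mean) * (x - mean);
      var /= t.size();
      err = std::sqrt(var / t.size());     // standard error of the mean (assumed)
      min = *std::min_element(t.begin(), t.end());
      max = *std::max_element(t.begin(), t.end());
    }
  };

  int main(void) {
    int    lat = 16, Ls = 8, ncomm = 8;                 // all four dims partitioned
    double ppn = 2.0;                                   // ranks per node (example)
    double bytes = double(lat)*lat*lat*Ls*96;           // 96 B per half-spinor site (assumed)
    std::vector<double> t_time = {1500.0, 1450.0, 1520.0, 1480.0};  // made-up timings, usec

    time_statistics_sketch timestat;
    timestat.statistics(t_time);

    double dbytes    = bytes * ncomm * ppn;  // assumes the stencil call reports `bytes` per direction
    double xbytes    = dbytes * 0.5;         // send half (receive half is the same)
    double bidibytes = dbytes;               // send + receive

    // time is in microseconds, so bytes/usec is numerically MB/s
    printf("MB/s uni  : %.1f (err %.1f, min %.1f, max %.1f)\n",
           xbytes/timestat.mean, xbytes*timestat.err/(timestat.mean*timestat.mean),
           xbytes/timestat.max,  xbytes/timestat.min);
    printf("MB/s bidi : %.1f\n", bidibytes/timestat.mean);
    return 0;
  }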