Benchmark prep

2026-02-15 11:20:54 +00:00 · 2017-08-25 09:25:54 +01:00
parent b49bec0cec
commit c3b1263e75
10 changed files with 494 additions and 71 deletions
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -32,6 +32,19 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
+
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;

 struct time_statistics{
  double mean;
@@ -95,13 +108,15 @@ public:

  static void Comms(void)
  {
-    int Nloop=100;
+    int Nloop=1000;
    int nmu=0;
    int maxlat=32;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

+    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
    std::vector<double> t_time(Nloop);
    time_statistics timestat;

@@ -133,13 +148,14 @@ public:
 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}

-	int ncomm;
 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+	int ncomm;
 	double dbytes;
+	std::vector<double> times(Nloop);
 	for(int i=0;i<Nloop;i++){
+
 	  double start=usecond();

-	  std::vector<CartesianCommunicator::CommsRequest_t> requests;
 	  dbytes=0;
 	  ncomm=0;

@@ -150,7 +166,6 @@ public:

 	    if (mpi_layout[mu]>1 ) {
 	        
-	      ncomm++;
 	      int xmit_to_rank;
 	      int recv_from_rank;
 	      if ( dir == mu ) { 
@@ -160,18 +175,18 @@ public:
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
-#if 1
-	      tbytes= Grid.StencilSendToRecvFromBegin(requests,
-						      (void *)&xbuf[dir][0],
-						      xmit_to_rank,
-						      (void *)&rbuf[dir][0],
-						      recv_from_rank,
-						      bytes,dir);
-	      Grid.StencilSendToRecvFromComplete(requests,dir);
-#endif
-	      requests.resize(0);
-
+	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+						 (void *)&rbuf[dir][0], recv_from_rank,
+						 bytes,dir);
+	  
+#ifdef GRID_OMP
 #pragma omp atomic
+#endif
+	      ncomm++;
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
 	      dbytes+=tbytes;
 	    }
 	  }
@@ -181,13 +196,15 @@ public:
 	}

 	timestat.statistics(t_time);
+	//	for(int i=0;i<t_time.size();i++){
+	  //	  std::cout << i<<" "<<t_time[i]<<std::endl;
+	//	}

 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
 	double rbytes    = dbytes*0.5;
 	double bidibytes = dbytes;

-
 	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
 		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
 		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
@@ -196,7 +213,8 @@ public:
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

 
-      }
+	
+	    }
    }    

    return;
@@ -218,7 +236,7 @@ public:
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  
  uint64_t lmax=48;
-#define NLOOP (50*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){
@@ -253,8 +271,7 @@ public:
    }
  };

-
-  static void DWF(int Ls,int L)
+  static double DWF5(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
@@ -262,6 +279,7 @@ public:
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
+    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
@@ -274,6 +292,189 @@ public:
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+
+    std::vector<int> internal;
+    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
+    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
+    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
+    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
+    else assert(0);
+
+    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
+    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
+    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    std::vector<int> seeds5({5,6,7,8});
+    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+    ///////// Source preparation ////////////
+    LatticeFermion src   (sFGrid); random(RNG5,src);
+    LatticeFermion tmp   (sFGrid);
+
+    RealD N2 = 1.0/::sqrt(norm2(src));
+    src = src*N2;
+    
+    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+
+    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+    LatticeFermion src_e (sFrbGrid);
+    LatticeFermion src_o (sFrbGrid);
+    LatticeFermion r_e   (sFrbGrid);
+    LatticeFermion r_o   (sFrbGrid);
+    LatticeFermion r_eo  (sFGrid);
+    LatticeFermion err   (sFGrid);
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512) 
+      const int num_cases = 6;
+      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
+#else
+      const int num_cases = 4;
+      std::string fmt("U/S ; U/O ; G/S ; G/O ");
+#endif
+      controls Cases [] = {
+#ifdef AVX512
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+#endif
+	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
+      }; 
+
+      for(int c=0;c<num_cases;c++) {
+
+	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
+	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
+	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+	int nwarm = 200;
+	double t0=usecond();
+	sFGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  sDw.DhopEO(src_o,r_e,DaggerNo);
+	}
+	sFGrid->Barrier();
+	double t1=usecond();
+	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	//	if (ncall < 500) ncall = 500;
+	uint64_t ncall = 1000;
+
+	sFGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+	sDw.ZeroCounters();
+
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  sDw.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	sFGrid->Barrier();
+	
+	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1344.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
+
+	sDw.Report();
+
+      }
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    }
+    return mflops_best;
+  }
+
+  static double DWF(int Ls,int L)
+  {
+    RealD mass=0.1;
+    RealD M5  =1.8;
+
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    std::vector<int> local({L,L,L,L});
+
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
@@ -364,13 +565,15 @@ public:

 #if defined(AVX512) 
      const int num_cases = 6;
+      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
      const int num_cases = 4;
+      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
      controls Cases [] = {
 #ifdef AVX512
-	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
@@ -394,7 +597,7 @@ public:
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

-	int nwarm = 10;
+	int nwarm = 200;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -402,7 +605,10 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	//	if (ncall < 500) ncall = 500;
+	uint64_t ncall = 1000;
+
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));

 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
@@ -428,7 +634,7 @@ public:
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;

 	mflops = flops/timestat.mean;
-
+	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
@@ -450,12 +656,20 @@ public:

      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
+    return mflops_best;
  }

 };
@@ -493,26 +707,66 @@ int main (int argc, char ** argv)
    // empty for now
  }

+  int sel=2;
+  std::vector<int> L_list({8,12,16,24});
+  std::vector<double> wilson;
+  std::vector<double> dwf4;
+  std::vector<double> dwf5;
+
  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
-    Benchmark::DWF(Ls,32);
+    for(int l=0;l<L_list.size();l++){
+      wilson.push_back(Benchmark::DWF(1,L_list[l]));
+    }
  }

+  int Ls=16;
  if ( do_dwf ) {
-    int Ls=16;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,8);
-    Benchmark::DWF(Ls,12);
-    Benchmark::DWF(Ls,16);
-    Benchmark::DWF(Ls,24);
+    for(int l=0;l<L_list.size();l++){
+      dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
+    }
  }

+  if ( do_dwf ) {
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
+    }
+
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
+  }
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  int NN=NN_global;
+  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
+  }
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Comparison point result: "  << dwf4[sel]/NN <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+
+  }
+
+
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;

-      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);

      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }

      for(int i=0;i<Nloop;i++){
      double start=usecond();
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
-	    
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.SendToRecvFromBegin(requests,
 				   (void *)&xbuf[mu][0],
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;

-      Vector<Vector<HalfSpinColourVectorD> > xbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-      Vector<Vector<HalfSpinColourVectorD> > rbuf(8,Vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);

+      for(int mu=0;mu<8;mu++){
+	xbuf[mu].resize(lat*lat*lat*Ls);
+	rbuf[mu].resize(lat*lat*lat*Ls);
+	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
+      }

      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -493,14 +502,9 @@ int main (int argc, char ** argv)
 	      int comm_proc = mpi_layout[mu]-1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    }
-	    tbytes= Grid.StencilSendToRecvFromBegin(requests,
-						    (void *)&xbuf[dir][0],
-						    xmit_to_rank,
-						    (void *)&rbuf[dir][0],
-						    recv_from_rank,
-						    bytes,dir);
-	    Grid.StencilSendToRecvFromComplete(requests,dir);
-	    requests.resize(0);
+
+	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);

 #pragma omp atomic
 	    dbytes+=tbytes;