Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2025-08-02 12:47:07 +01:00 · 2018-04-26 23:00:46 +01:00
parent 7ecc47ac89 fa0d8feff4
commit 3e0eff6468
9 changed files with 249 additions and 148 deletions
--- a/benchmarks/Benchmark_memory_bandwidth.cc
+++ b/benchmarks/Benchmark_memory_bandwidth.cc
@@ -55,7 +55,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  uint64_t lmax=96;
+  uint64_t lmax=64;
 #define NLOOP (10*lmax*lmax*lmax*lmax/vol)
  for(int lat=8;lat<=lmax;lat+=8){

--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -35,7 +35,7 @@ using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
-#define LMAX (16)
+#define LMAX (32)
 #define LMIN (16)
 #define LINC (4)

@@ -46,14 +46,14 @@ int main (int argc, char ** argv)

  int64_t threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-#if 1
+
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  x= x*y"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=4;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -85,7 +85,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -116,7 +116,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -147,7 +147,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

-  for(int lat=2;lat<=LMAX;lat+=LINC){
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
    
    std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
    int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@@ -171,8 +171,6 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
    
  }
-  
-#endif

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl;
@@ -193,19 +191,63 @@ int main (int argc, char ** argv)
      LatticeColourMatrix y(&Grid); random(pRNG,y);

      for(int mu=0;mu<4;mu++){
+	      double start=usecond();
+	      for(int64_t i=0;i<Nloop;i++){
+	        z = PeriodicBC::CovShiftForward(x,mu,y);
+	    }
+	    double stop=usecond();
+	    double time = (stop-start)/Nloop*1000.0;
+	
+	
+	    double bytes=3*vol*Nc*Nc*sizeof(Complex);
+	    double flops=Nc*Nc*(6+8+8)*vol;
+	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
+      }
+  }
+#if 1
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl;
+  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
+  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
+
+  for(int lat=LMIN;lat<=LMAX;lat+=LINC){
+      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
+      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
+      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
+
+      LatticeColourMatrix z(&Grid); random(pRNG,z);
+      LatticeColourMatrix x(&Grid); random(pRNG,x);
+      LatticeColourMatrix y(&Grid); random(pRNG,y);
+      LatticeColourMatrix tmp(&Grid);
+
+      for(int mu=0;mu<4;mu++){
+	double tshift=0;
+	double tmult =0;
+
 	double start=usecond();
 	for(int64_t i=0;i<Nloop;i++){
-	  z = PeriodicBC::CovShiftForward(x,mu,y);
+	  tshift-=usecond();
+	  tmp = Cshift(y,mu,-1);
+	  tshift+=usecond();
+	  tmult-=usecond();
+	  z   = x*tmp;
+	  tmult+=usecond();
 	}
 	double stop=usecond();
-	double time = (stop-start)/Nloop*1000.0;
-	
+	double time = (stop-start)/Nloop;
+	tshift = tshift/Nloop;
+	tmult  = tmult /Nloop;
 	
 	double bytes=3*vol*Nc*Nc*sizeof(Complex);
 	double flops=Nc*Nc*(6+8+8)*vol;
+	std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
+	time = time * 1000; // convert to NS for GB/s
 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
      }
-  }
-
+    }
+#endif
  Grid_finalize();
 }
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -277,7 +277,9 @@ public:
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
+#ifdef GRID_OMP
 #pragma omp parallel for schedule(static)
+#endif
      for(size_type n=0;n<bytes;n+=4096){
 	cp[n]=0;
      }
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -45,31 +45,33 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
  int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
+  int ent = 0;
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);

  int stride=rhs._grid->_slice_stride[dimension];
  if ( cbmask == 0x3 ) { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o  = n*stride;
 	int bo = n*e2;
-	buffer[off+bo+b]=rhs._odata[so+o+b];
+	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
      }
    }
  } else { 
     int bo=0;
-     std::vector<std::pair<int,int> > table;
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
 	 int o  = n*stride;
 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	 if ( ocb &cbmask ) {
-	   table.push_back(std::pair<int,int> (bo++,o+b));
+	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
 	 }
       }
     }
-     parallel_for(int i=0;i<table.size();i++){
-       buffer[off+table[i].first]=rhs._odata[so+table[i].second];
-     }
+  }
+  parallel_for(int i=0;i<ent;i++){
+    buffer[table[i].first]=rhs._odata[table[i].second];
  }
 }

@@ -140,31 +142,35 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int stride=rhs._grid->_slice_stride[dimension];
-  
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent    =0;
+
  if ( cbmask ==0x3 ) {
-    parallel_for_nest2(int n=0;n<e1;n++){
+
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int bo  =n*rhs._grid->_slice_block[dimension];
-	rhs._odata[so+o+b]=buffer[bo+b];
+	table[ent++] = std::pair<int,int>(so+o+b,bo);
      }
    }
+
  } else { 
-    std::vector<std::pair<int,int> > table;
    int bo=0;
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
 	int o   =n*rhs._grid->_slice_stride[dimension];
 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
-	  table.push_back(std::pair<int,int> (so+o+b,bo++));
+	  table[ent++]=std::pair<int,int> (so+o+b,bo++);
 	}
      }
    }
-    parallel_for(int i=0;i<table.size();i++){
-       //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
-       rhs._odata[table[i].first]=buffer[table[i].second];
-     }
+  }
+
+  parallel_for(int i=0;i<ent;i++){
+    rhs._odata[table[i].first]=buffer[table[i].second];
  }
 }

@@ -228,29 +234,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
  int e2=rhs._grid->_slice_block[dimension];
  int stride = rhs._grid->_slice_stride[dimension];
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent=0;
+
  if(cbmask == 0x3 ){
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
- 
        int o =n*stride+b;
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
  } else { 
-    parallel_for_nest2(int n=0;n<e1;n++){
+    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
- 
        int o =n*stride+b;
        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
        if ( ocb&cbmask ) {
-  	//lhs._odata[lo+o]=rhs._odata[ro+o];
-	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
+	  table[ent++] = std::pair<int,int>(lo+o,ro+o);
 	}
      }
    }
  }
-  
+
+  parallel_for(int i=0;i<ent;i++){
+    lhs._odata[table[i].first]=rhs._odata[table[i].second];
+  }
+
 }

 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
@@ -269,16 +278,28 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int e2=rhs._grid->_slice_block [dimension];
  int stride = rhs._grid->_slice_stride[dimension];

-  parallel_for_nest2(int n=0;n<e1;n++){
-  for(int b=0;b<e2;b++){
+  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
+  int ent=0;

+  double t_tab,t_perm;
+  if ( cbmask == 0x3 ) {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
+      int o  =n*stride;
+      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  } else {
+    for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
      int o  =n*stride;
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
-      if ( ocb&cbmask ) {
-	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
-      }
+      if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }}
+  }

-  }}
+  parallel_for(int i=0;i<ent;i++){
+    permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
+  }
 }

 //////////////////////////////////////////////////////
@@ -291,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);

+  double t_local;
+  
  if ( sshift[0] == sshift[1] ) {
    Cshift_local(ret,rhs,dimension,shift,0x3);
  } else {
@@ -299,7 +322,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
  }
 }

-template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid = rhs._grid;
  int fd = grid->_fdimensions[dimension];
@@ -325,11 +348,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice

    int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
    int sx     = (x+sshift)%rd;
-
-    // FIXME : This must change where we have a 
-    // Rotate slice.
    
-    // Document how this works ; why didn't I do this when I first wrote it...
    // wrap is whether sshift > rd.
    //  num is sshift mod rd.
    // 
@@ -365,10 +384,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice

    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
  
  }
-  return ret;
 }
 }
 #endif
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -256,13 +256,42 @@ public:
      _odata[ss]=r._odata[ss];
    }  	
  }
-  
+
  Lattice(Lattice&& r){ // move constructor
    _grid = r._grid;
    checkerboard = r.checkerboard;
    _odata=std::move(r._odata);
  }
  
+  inline Lattice<vobj> & operator = (Lattice<vobj> && r)
+  {
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata       =std::move(r._odata);
+    return *this;
+  }
+
+  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata.resize(_grid->oSites());// essential
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
+    }  	
+    return *this;
+  }
+
+  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+    this->checkerboard = r.checkerboard;
+    conformable(*this,r);
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      this->_odata[ss]=r._odata[ss];
+    }
+    return *this;
+  }
+
  virtual ~Lattice(void) = default;
    
  void reset(GridBase* grid) {
@@ -281,33 +310,6 @@ public:
    return *this;
  }
  
-  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
-    this->checkerboard = r.checkerboard;
-    conformable(*this,r);
-    
-    parallel_for(int ss=0;ss<_grid->oSites();ss++){
-      this->_odata[ss]=r._odata[ss];
-    }
-    return *this;
-  }
-
-  strong_inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
-    _grid        = r._grid;
-    checkerboard = r.checkerboard;
-    _odata.resize(_grid->oSites());// essential
-    
-    parallel_for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss]=r._odata[ss];
-    }  	
-    return *this;
-  }
-  strong_inline Lattice<vobj> & operator = (Lattice<vobj> && r)
-  {
-    _grid        = r._grid;
-    checkerboard = r.checkerboard;
-    _odata       =std::move(r._odata);
-    return *this;
-  }
  
  // *=,+=,-= operators inherit behvour from correspond */+/- operation
  template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -110,11 +110,11 @@ class BinaryIO {
      lsites = 1;
    }

-    #pragma omp parallel
+PARALLEL_REGION
    {
      uint32_t nersc_csum_thr = 0;

-      #pragma omp for
+PARALLEL_FOR_LOOP_INTERN
      for (uint64_t local_site = 0; local_site < lsites; local_site++)
      {
        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
@@ -124,7 +124,7 @@ class BinaryIO {
        }
      }

-      #pragma omp critical
+PARALLEL_CRITICAL
      {
        nersc_csum += nersc_csum_thr;
      }
@@ -146,14 +146,14 @@ class BinaryIO {
    std::vector<int> local_start =grid->LocalStarts();
    std::vector<int> global_vol  =grid->FullDimensions();

-#pragma omp parallel
+PARALLEL_REGION
    { 
      std::vector<int> coor(nd);
      uint32_t scidac_csuma_thr=0;
      uint32_t scidac_csumb_thr=0;
      uint32_t site_crc=0;

-#pragma omp for
+PARALLEL_FOR_LOOP_INTERN
      for(uint64_t local_site=0;local_site<lsites;local_site++){

 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
@@ -183,7 +183,7 @@ class BinaryIO {
 	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
      }

-#pragma omp critical
+PARALLEL_CRITICAL
      {
 	scidac_csuma^= scidac_csuma_thr;
 	scidac_csumb^= scidac_csumb_thr;
--- a/lib/qcd/hmc/integrators/Integrator.h
+++ b/lib/qcd/hmc/integrators/Integrator.h
@@ -114,18 +114,26 @@ class Integrator {
    // input U actually not used in the fundamental case
    // Fundamental updates, include smearing

-   for (int a = 0; a < as[level].actions.size(); ++a) {
+    for (int a = 0; a < as[level].actions.size(); ++a) {
+      double start_full = usecond();
      Field force(U._grid);
      conformable(U._grid, Mom._grid);
+
      Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
+      double start_force = usecond();
      as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta

      std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
      if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
      force = FieldImplementation::projectForce(force); // Ta for gauge fields
+      double end_force = usecond();
      Real force_abs = std::sqrt(norm2(force)/U._grid->gSites());
-      std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
      Mom -= force * ep; 
+      double end_full = usecond();
+      double time_full  = (end_full - start_full) / 1e3;
+      double time_force = (end_force - start_force) / 1e3;
+      std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl;
    }

    // Force from the other representations
--- a/lib/qcd/smearing/GaugeConfiguration.h
+++ b/lib/qcd/smearing/GaugeConfiguration.h
@@ -6,30 +6,33 @@
 #ifndef GAUGE_CONFIG_
 #define GAUGE_CONFIG_

-namespace Grid {
+namespace Grid
+{

-namespace QCD {
+namespace QCD
+{

-  //trivial class for no smearing
-  template< class Impl >
-class NoSmearing {
+//trivial class for no smearing
+template <class Impl>
+class NoSmearing
+{
 public:
  INHERIT_FIELD_TYPES(Impl);

-  Field* ThinField;
+  Field *ThinField;

-  NoSmearing(): ThinField(NULL) {}
+  NoSmearing() : ThinField(NULL) {}

-  void set_Field(Field& U) { ThinField = &U; }
+  void set_Field(Field &U) { ThinField = &U; }

-  void smeared_force(Field&) const {}
+  void smeared_force(Field &) const {}

-  Field& get_SmearedU() { return *ThinField; }
+  Field &get_SmearedU() { return *ThinField; }

-  Field& get_U(bool smeared = false) {
+  Field &get_U(bool smeared = false)
+  {
    return *ThinField;
  }
-
 };

 /*!
@@ -44,32 +47,36 @@ public:
  It stores a list of smeared configurations.
 */
 template <class Gimpl>
-class SmearedConfiguration {
- public:
+class SmearedConfiguration
+{
+public:
  INHERIT_GIMPL_TYPES(Gimpl);

- private:
+private:
  const unsigned int smearingLevels;
  Smear_Stout<Gimpl> StoutSmearing;
  std::vector<GaugeField> SmearedSet;

  // Member functions
  //====================================================================
-  void fill_smearedSet(GaugeField& U) {
-    ThinLinks = &U;  // attach the smearing routine to the field U
+  void fill_smearedSet(GaugeField &U)
+  {
+    ThinLinks = &U; // attach the smearing routine to the field U

    // check the pointer is not null
    if (ThinLinks == NULL)
      std::cout << GridLogError
                << "[SmearedConfiguration] Error in ThinLinks pointer\n";

-    if (smearingLevels > 0) {
+    if (smearingLevels > 0)
+    {
      std::cout << GridLogDebug
                << "[SmearedConfiguration] Filling SmearedSet\n";
      GaugeField previous_u(ThinLinks->_grid);

      previous_u = *ThinLinks;
-      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
+      for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
+      {
        StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
        previous_u = SmearedSet[smearLvl];

@@ -81,9 +88,10 @@ class SmearedConfiguration {
    }
  }
  //====================================================================
-  GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
-                                  const GaugeField& GaugeK) const {
-    GridBase* grid = GaugeK._grid;
+  GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime,
+                                  const GaugeField &GaugeK) const
+  {
+    GridBase *grid = GaugeK._grid;
    GaugeField C(grid), SigmaK(grid), iLambda(grid);
    GaugeLinkField iLambda_mu(grid);
    GaugeLinkField iQ(grid), e_iQ(grid);
@@ -94,7 +102,8 @@ class SmearedConfiguration {
    SigmaK = zero;
    iLambda = zero;

-    for (int mu = 0; mu < Nd; mu++) {
+    for (int mu = 0; mu < Nd; mu++)
+    {
      Cmu = peekLorentz(C, mu);
      GaugeKmu = peekLorentz(GaugeK, mu);
      SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
@@ -104,20 +113,22 @@ class SmearedConfiguration {
      pokeLorentz(iLambda, iLambda_mu, mu);
    }
    StoutSmearing.derivative(SigmaK, iLambda,
-                             GaugeK);  // derivative of SmearBase
+                             GaugeK); // derivative of SmearBase
    return SigmaK;
  }

  /*! @brief Returns smeared configuration at level 'Level' */
-  const GaugeField& get_smeared_conf(int Level) const {
+  const GaugeField &get_smeared_conf(int Level) const
+  {
    return SmearedSet[Level];
  }

  //====================================================================
-  void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
-                   const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
-                   const GaugeLinkField& GaugeK) const {
-    GridBase* grid = iQ._grid;
+  void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ,
+                   const GaugeLinkField &iQ, const GaugeLinkField &Sigmap,
+                   const GaugeLinkField &GaugeK) const
+  {
+    GridBase *grid = iQ._grid;
    GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
    GaugeLinkField unity(grid);
    unity = 1.0;
@@ -206,15 +217,15 @@ class SmearedConfiguration {
  }

  //====================================================================
- public:
-  GaugeField*
-      ThinLinks; /*!< @brief Pointer to the thin
-                                                         links configuration */
+public:
+  GaugeField *
+      ThinLinks; /* Pointer to the thin links configuration */

-  /*! @brief Standard constructor */
-  SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
-                       Smear_Stout<Gimpl>& Stout)
-      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
+  /* Standard constructor */
+  SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear,
+                       Smear_Stout<Gimpl> &Stout)
+      : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
+  {
    for (unsigned int i = 0; i < smearingLevels; ++i)
      SmearedSet.push_back(*(new GaugeField(UGrid)));
  }
@@ -223,21 +234,29 @@ class SmearedConfiguration {
  SmearedConfiguration()
      : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}

-
-  
  // attach the smeared routines to the thin links U and fill the smeared set
-  void set_Field(GaugeField& U) { fill_smearedSet(U); }
+  void set_Field(GaugeField &U)
+  {
+    double start = usecond();
+    fill_smearedSet(U);
+    double end = usecond();
+    double time = (end - start)/ 1e3;
+    std::cout << GridLogMessage << "Smearing in " << time << " ms" << std::endl;  
+  }

  //====================================================================
-  void smeared_force(GaugeField& SigmaTilde) const {
-    if (smearingLevels > 0) {
+  void smeared_force(GaugeField &SigmaTilde) const
+  {
+    if (smearingLevels > 0)
+    {
+      double start = usecond();
      GaugeField force = SigmaTilde; // actually = U*SigmaTilde
      GaugeLinkField tmp_mu(SigmaTilde._grid);

-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
        // to get just SigmaTilde
-        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
-                 peekLorentz(force, mu);
+        tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu);
        pokeLorentz(force, tmp_mu, mu);
      }

@@ -246,33 +265,43 @@ class SmearedConfiguration {

      force = AnalyticSmearedForce(force, *ThinLinks);

-      for (int mu = 0; mu < Nd; mu++) {
+      for (int mu = 0; mu < Nd; mu++)
+      {
        tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
        pokeLorentz(SigmaTilde, tmp_mu, mu);
      }
-    }  // if smearingLevels = 0 do nothing
+      double end = usecond();
+      double time = (end - start)/ 1e3;
+      std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;  
+    } // if smearingLevels = 0 do nothing
  }
  //====================================================================

-  GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
+  GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; }

-  GaugeField& get_U(bool smeared = false) {
+  GaugeField &get_U(bool smeared = false)
+  {
    // get the config, thin links by default
-    if (smeared) {
-      if (smearingLevels) {
+    if (smeared)
+    {
+      if (smearingLevels)
+      {
        RealD impl_plaq =
            WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
        std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
                  << std::endl;
        return get_SmearedU();
-
-      } else {
+      }
+      else
+      {
        RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
        std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                  << std::endl;
        return *ThinLinks;
      }
-    } else {
+    }
+    else
+    {
      RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
      std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
                << std::endl;
--- a/tests/Test_dwf_mixedcg_prec.cc
+++ b/tests/Test_dwf_mixedcg_prec.cc
@@ -49,6 +49,8 @@ int main (int argc, char ** argv)

  const int Ls=8;

+  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
+
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -90,24 +92,23 @@ int main (int argc, char ** argv)
  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);

-  std::cout << "Starting mixed CG" << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
  MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
  mCG(src_o,result_o);

-  std::cout << "Starting regular CG" << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
  CG(HermOpEO,src_o,result_o_2);

  LatticeFermionD diff_o(FrbGrid);
  RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);

-  std::cout << "Diff between mixed and regular CG: " << diff << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;

  #ifdef HAVE_LIME
  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
  
  std::string file1("./Propagator1");
-  std::string file2("./Propagator2");
  emptyUserRecord record;
  uint32_t nersc_csum;
  uint32_t scidac_csuma;
@@ -121,12 +122,12 @@ int main (int argc, char ** argv)
  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
 						   nersc_csum,scidac_csuma,scidac_csumb);

-  std::cout << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
+  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;

  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
 						   nersc_csum,scidac_csuma,scidac_csumb);

-  std::cout << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
+  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
  }
  #endif