diff --git a/Grid/.clang-format b/Grid/.clang-format
index 9156d50..9d54a25 100644
--- a/Grid/.clang-format
+++ b/Grid/.clang-format
@@ -6,7 +6,7 @@
BreakBeforeBraces: Allman,
AllowShortIfStatementsOnASingleLine: false,
IndentCaseLabels: false,
- ColumnLimit: 0,
+ ColumnLimit: 90,
AccessModifierOffset: -4,
NamespaceIndentation: All,
FixNamespaceComments: false,
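Note: ColumnLimit: 0 disables line wrapping in clang-format entirely; raising it
to 90 is what drives every reflowed statement in the hunks below.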
diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp
index fd2056e..841f26e 100644
--- a/Grid/Benchmark_Grid.cpp
+++ b/Grid/Benchmark_Grid.cpp
@@ -18,6 +18,7 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include "Common.hpp"
#include <Grid/Grid.h>
using namespace Grid;
@@ -44,8 +45,7 @@ struct time_statistics
mean = sum / v.size();
std::vector<double> diff(v.size());
- std::transform(v.begin(), v.end(), diff.begin(), [=](double x)
- { return x - mean; });
+ std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
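For reference, the statistics helper above computes the sample mean and the
standard error of the mean. A minimal standalone sketch (hypothetical names,
not part of the patch):

#include <cmath>
#include <numeric>
#include <vector>

// Mean and standard error of the mean over repeated timing samples, matching
// the err = sqrt(sq_sum / (N * (N - 1))) expression above.
void time_stats(const std::vector<double> &v, double &mean, double &err)
{
  mean = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
  double sq_sum = 0.;
  for (double x : v)
    sq_sum += (x - mean) * (x - mean); // sum of squared deviations
  err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
}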
@@ -64,11 +64,8 @@ void comms_header()
<< "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
};
-Gamma::Algebra Gmu[] = {
- Gamma::Algebra::GammaX,
- Gamma::Algebra::GammaY,
- Gamma::Algebra::GammaZ,
- Gamma::Algebra::GammaT};
+Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
+ Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
struct controls
{
@@ -84,26 +81,52 @@ class Benchmark
{
int threads = GridThread::GetThreads();
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
+ std::cout << GridLogMessage << "= Grid is setup to use " << threads << " threads"
+ << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << "Grid Default Decomposition patterns\n";
- std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads() << std::endl;
- std::cout << GridLogMessage << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi()) << std::endl;
- std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd())) << std::endl;
- std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd())) << std::endl;
- std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd())) << std::endl;
- std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd())) << std::endl;
- std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd())) << std::endl;
- std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8 << "bits ; " << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd())) << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage << "\tOpenMP threads : " << GridThread::GetThreads()
+ << std::endl;
+ std::cout << GridLogMessage
+ << "\tMPI tasks : " << GridCmdVectorIntToString(GridDefaultMpi())
+ << std::endl;
+ std::cout << GridLogMessage << "\tvReal : " << sizeof(vReal) * 8 << "bits ; "
+ << GridCmdVectorIntToString(GridDefaultSimd(4, vReal::Nsimd()))
+ << std::endl;
+ std::cout << GridLogMessage << "\tvRealF : " << sizeof(vRealF) * 8
+ << "bits ; "
+ << GridCmdVectorIntToString(GridDefaultSimd(4, vRealF::Nsimd()))
+ << std::endl;
+ std::cout << GridLogMessage << "\tvRealD : " << sizeof(vRealD) * 8
+ << "bits ; "
+ << GridCmdVectorIntToString(GridDefaultSimd(4, vRealD::Nsimd()))
+ << std::endl;
+ std::cout << GridLogMessage << "\tvComplex : " << sizeof(vComplex) * 8
+ << "bits ; "
+ << GridCmdVectorIntToString(GridDefaultSimd(4, vComplex::Nsimd()))
+ << std::endl;
+ std::cout << GridLogMessage << "\tvComplexF : " << sizeof(vComplexF) * 8
+ << "bits ; "
+ << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexF::Nsimd()))
+ << std::endl;
+ std::cout << GridLogMessage << "\tvComplexD : " << sizeof(vComplexD) * 8
+ << "bits ; "
+ << GridCmdVectorIntToString(GridDefaultSimd(4, vComplexD::Nsimd()))
+ << std::endl;
}
static void Comms(void)
{
int Nloop = 200;
int nmu = 0;
- int maxlat = 32;
+ int maxlat = 48;
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
@@ -115,94 +138,89 @@ class Benchmark
std::vector<double> t_time(Nloop);
time_statistics timestat;
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
- std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in " << nmu << " dimensions" << std::endl;
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
- comms_header();
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "============================="
+ << std::endl;
+ std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
+ << nmu << " dimensions" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "============================="
+ << std::endl;
+ grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
+ "time (usec)", "rate (GB/s)", "std dev", "max");
for (int lat = 16; lat <= maxlat; lat += 8)
{
- // for(int Ls=8;Ls<=8;Ls*=2){
+ int Ls = 12;
+
+ Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+ lat * mpi_layout[3]});
+
+ GridCartesian Grid(latt_size, simd_layout, mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank / Nnode;
+
+ std::vector<HalfSpinColourVectorD *> xbuf(8);
+ std::vector<HalfSpinColourVectorD *> rbuf(8);
+ uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
+ for (int d = 0; d < 8; d++)
{
- int Ls = 12;
+ xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+ rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
+ }
- Coordinate latt_size({lat * mpi_layout[0],
- lat * mpi_layout[1],
- lat * mpi_layout[2],
- lat * mpi_layout[3]});
+ double dbytes;
- GridCartesian Grid(latt_size, simd_layout, mpi_layout);
- RealD Nrank = Grid._Nprocessors;
- RealD Nnode = Grid.NodeCount();
- RealD ppn = Nrank / Nnode;
-
- std::vector<HalfSpinColourVectorD *> xbuf(8);
- std::vector<HalfSpinColourVectorD *> rbuf(8);
- // Grid.ShmBufferFreeAll();
- uint64_t bytes = lat * lat * lat * Ls * sizeof(HalfSpinColourVectorD);
- for (int d = 0; d < 8; d++)
+ for (int dir = 0; dir < 8; dir++)
+ {
+ int mu = dir % 4;
+ if (mpi_layout[mu] > 1)
{
- xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
- rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
- // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
- // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
- }
- // int ncomm;
- double dbytes;
-
- for (int dir = 0; dir < 8; dir++)
- {
- int mu = dir % 4;
- if (mpi_layout[mu] > 1)
+ std::vector<double> times(Nloop);
+ for (int i = 0; i < Nloop; i++)
{
- std::vector<double> times(Nloop);
- for (int i = 0; i < Nloop; i++)
+ dbytes = 0;
+ double start = usecond();
+ int xmit_to_rank;
+ int recv_from_rank;
+
+ if (dir == mu)
{
-
- dbytes = 0;
- double start = usecond();
- int xmit_to_rank;
- int recv_from_rank;
-
- if (dir == mu)
- {
- int comm_proc = 1;
- Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
- }
- else
- {
- int comm_proc = mpi_layout[mu] - 1;
- Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
- }
- Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
- (void *)&rbuf[dir][0], recv_from_rank,
- bytes);
- dbytes += bytes;
-
- double stop = usecond();
- t_time[i] = stop - start; // microseconds
+ int comm_proc = 1;
+ Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
}
- timestat.statistics(t_time);
+ else
+ {
+ int comm_proc = mpi_layout[mu] - 1;
+ Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
+ }
+ Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+ (void *)&rbuf[dir][0], recv_from_rank, bytes);
+ dbytes += bytes;
- dbytes = dbytes * ppn;
- double xbytes = dbytes * 0.5;
- double bidibytes = dbytes;
-
- std::cout << GridLogMessage << lat << "\t" << Ls << "\t "
- << bytes << " \t "
- << xbytes / timestat.mean << " \t " << xbytes * timestat.err / (timestat.mean * timestat.mean) << " \t "
- << xbytes / timestat.max << " " << xbytes / timestat.min
- << "\t\t" << bidibytes / timestat.mean << " " << bidibytes * timestat.err / (timestat.mean * timestat.mean) << " "
- << bidibytes / timestat.max << " " << bidibytes / timestat.min << std::endl;
+ double stop = usecond();
+ t_time[i] = stop - start; // microseconds
}
+ timestat.statistics(t_time);
+
+ dbytes = dbytes * ppn;
+ double bidibytes = 2. * dbytes;
+ double rate = bidibytes / (timestat.mean / 1.e6) / 1024. / 1024. / 1024.;
+ double rate_err = rate * timestat.err / timestat.mean;
+ double rate_max = rate * timestat.mean / timestat.min;
+ grid_printf("%5d %5d %15d %15.2f %15.2f %15.1f %15.2f\n", lat, dir, bytes,
+ timestat.mean, rate, rate_err, rate_max);
}
- for (int d = 0; d < 8; d++)
- {
- acceleratorFreeDevice(xbuf[d]);
- acceleratorFreeDevice(rbuf[d]);
- }
+ }
+ for (int d = 0; d < 8; d++)
+ {
+ acceleratorFreeDevice(xbuf[d]);
+ acceleratorFreeDevice(rbuf[d]);
}
}
return;
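As a numbers check on the new Comms table: the payload is one face worth of
half spinors and the printed rate is bidirectional, per node, in GiB/s. A
sketch of the arithmetic under those assumptions (function names hypothetical):

#include <cstdint>

// One-directional payload: lat^3 * Ls sites of a half spinor; for Nc = 3,
// sizeof(HalfSpinColourVectorD) = 2 spin x 3 colour x 16 B complex = 96 B.
uint64_t payload_bytes(int lat, int Ls, uint64_t site_bytes = 96)
{
  return uint64_t(lat) * lat * lat * Ls * site_bytes;
}

// Rate as formed above: scale by ranks per node (ppn), double for
// send + receive, divide by the mean time converted from usec to seconds.
double rate_GBps(double bytes, double ppn, double mean_usec)
{
  double bidibytes = 2. * bytes * ppn;
  return bidibytes / (mean_usec / 1.e6) / 1024. / 1024. / 1024.;
}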
@@ -217,9 +235,15 @@ class Benchmark
Coordinate simd_layout = GridDefaultSimd(Nd, vReal::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << "= Benchmarking a*x + y bandwidth" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
@@ -229,7 +253,9 @@ class Benchmark
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
- std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
+ std::cout << GridLogMessage
+ << "----------------------------------------------------------"
+ << std::endl;
// uint64_t NP;
uint64_t NN;
@@ -242,7 +268,8 @@ class Benchmark
for (int lat = 8; lat <= lmax; lat += 8)
{
- Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
+ Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+ lat * mpi_layout[3]});
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@@ -273,9 +300,10 @@ class Benchmark
double flops = vol * Nvec * 2; // mul,add
double bytes = 3.0 * vol * Nvec * sizeof(Real);
- std::cout << GridLogMessage << std::setprecision(3)
- << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
- << "\t\t" << bytes / time / NN << std::endl;
+ std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
+ << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
+ << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
+ << std::endl;
}
};
@@ -287,9 +315,15 @@ class Benchmark
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << " L "
<< "\t\t"
<< "bytes"
@@ -299,7 +333,9 @@ class Benchmark
<< "Gflop/s"
<< "\t\t seconds"
<< "\t\tGB/s / node" << std::endl;
- std::cout << GridLogMessage << "----------------------------------------------------------" << std::endl;
+ std::cout << GridLogMessage
+ << "----------------------------------------------------------"
+ << std::endl;
uint64_t NN;
@@ -310,7 +346,8 @@ class Benchmark
for (int lat = 8; lat <= lmax; lat += 8)
{
- Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2], lat * mpi_layout[3]});
+ Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
+ lat * mpi_layout[3]});
int64_t vol = latt_size[0] * latt_size[1] * latt_size[2] * latt_size[3];
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@@ -337,9 +374,10 @@ class Benchmark
double flops = vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8); // mul,add
double bytes = 3.0 * vol * Nc4 * Nc4 * 2 * sizeof(RealF);
- std::cout << GridLogMessage << std::setprecision(3)
- << lat << "\t\t" << bytes << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t" << (stop - start) / 1000. / 1000.
- << "\t\t" << bytes / time / NN << std::endl;
+ std::cout << GridLogMessage << std::setprecision(3) << lat << "\t\t" << bytes
+ << " \t\t" << bytes / time << "\t\t" << flops / time << "\t\t"
+ << (stop - start) / 1000. / 1000. << "\t\t" << bytes / time / NN
+ << std::endl;
}
};
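The SU(4) flop count in this hunk follows the usual complex matrix-multiply
accounting (6 flops per complex multiply, 2 per complex add); a sketch of the
per-site term:

// Per output element of an Nc4 x Nc4 complex product: one complex multiply
// (6 flops) plus (Nc4 - 1) multiply-adds (8 flops each), giving the
// vol * Nc4 * Nc4 * (6 + (Nc4 - 1) * 8) expression above.
double su4_flops_per_site(int Nc4)
{
  return double(Nc4) * Nc4 * (6. + (Nc4 - 1) * 8.);
}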
@@ -360,31 +398,41 @@ class Benchmark
Coordinate mpi = GridDefaultMpi();
assert(mpi.size() == 4);
Coordinate local({L, L, L, L});
- Coordinate latt4({local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
+ Coordinate latt4(
+ {local[0] * mpi[0], local[1] * mpi[1], local[2] * mpi[2], local[3] * mpi[3]});
- GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
- GridDefaultSimd(Nd, vComplex::Nsimd()),
- GridDefaultMpi());
+ GridCartesian *TmpGrid = SpaceTimeGrid::makeFourDimGrid(
+ latt4, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume " << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
+ std::cout << GridLogMessage << "Benchmark DWF on " << L << "^4 local volume "
+ << std::endl;
std::cout << GridLogMessage << "* Nc : " << Nc << std::endl;
- std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
+ std::cout << GridLogMessage
+ << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
std::cout << GridLogMessage << "* Ls : " << Ls << std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
- std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl;
+ std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
+ << std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
///////// Lattice Init ////////////
- GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+ GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
+ latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
@@ -423,10 +471,14 @@ class Benchmark
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases[] = {
- {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
- {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
- {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
- {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
+ {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute,
+ CartesianCommunicator::CommunicatorPolicyConcurrent},
+ {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
+ CartesianCommunicator::CommunicatorPolicyConcurrent},
+ {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsThenCompute,
+ CartesianCommunicator::CommunicatorPolicySequential},
+ {WilsonKernelsStatic::OptGeneric, WilsonKernelsStatic::CommsAndCompute,
+ CartesianCommunicator::CommunicatorPolicySequential}};
for (int c = 0; c < num_cases; c++)
{
@@ -435,7 +487,10 @@ class Benchmark
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "==================================================================="
+ "==============="
+ << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
@@ -443,7 +498,10 @@ class Benchmark
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "==================================================================="
+ "==============="
+ << std::endl;
int nwarm = 10;
double t0 = usecond();
@@ -458,7 +516,8 @@ class Benchmark
FGrid->Broadcast(0, &ncall, sizeof(ncall));
- // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global = NN;
uint64_t SHM = NP / NN;
///////// Welcome message ////////////
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L << "^4 local volume " << std::endl;
- std::cout << GridLogMessage << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
+ std::cout << GridLogMessage << "Benchmark ImprovedStaggered on " << L
+ << "^4 local volume " << std::endl;
+ std::cout << GridLogMessage
+ << "* Global volume : " << GridCmdVectorIntToString(latt4) << std::endl;
std::cout << GridLogMessage << "* ranks : " << NP << std::endl;
std::cout << GridLogMessage << "* nodes : " << NN << std::endl;
std::cout << GridLogMessage << "* ranks/node : " << SHM << std::endl;
- std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi) << std::endl;
+ std::cout << GridLogMessage << "* ranks geom : " << GridCmdVectorIntToString(mpi)
+ << std::endl;
std::cout << GridLogMessage << "* Using " << threads << " threads" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
///////// Lattice Init ////////////
- GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+ GridCartesian *FGrid = SpaceTimeGrid::makeFourDimGrid(
+ latt4, GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
@@ -604,10 +688,14 @@ class Benchmark
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases[] = {
- {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
- {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicyConcurrent},
- {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute, CartesianCommunicator::CommunicatorPolicySequential},
- {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute, CartesianCommunicator::CommunicatorPolicySequential}};
+ {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute,
+ CartesianCommunicator::CommunicatorPolicyConcurrent},
+ {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute,
+ CartesianCommunicator::CommunicatorPolicyConcurrent},
+ {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsThenCompute,
+ CartesianCommunicator::CommunicatorPolicySequential},
+ {StaggeredKernelsStatic::OptGeneric, StaggeredKernelsStatic::CommsAndCompute,
+ CartesianCommunicator::CommunicatorPolicySequential}};
for (int c = 0; c < num_cases; c++)
{
@@ -616,15 +704,22 @@ class Benchmark
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "==================================================================="
+ "==============="
+ << std::endl;
if (StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric)
- std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels" << std::endl;
+ std::cout << GridLogMessage << "* Using GENERIC Nc StaggeredKernels"
+ << std::endl;
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute)
std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute)
std::cout << GridLogMessage << "* Using sequential Comms/Compute" << std::endl;
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "==================================================================="
+ "==============="
+ << std::endl;
int nwarm = 10;
double t0 = usecond();
@@ -639,7 +734,8 @@ class Benchmark
FGrid->Broadcast(0, &ncall, sizeof(ncall));
- // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<({8, 2, 2, 2});
#else
@@ -709,9 +820,11 @@ int main(int argc, char **argv)
#endif
Benchmark::Decomposition();
- int do_su4 = 1;
- int do_memory = 1;
+ int do_su4 = 0;
+ int do_memory = 0;
int do_comms = 1;
+ int do_flops = 0;
+ int Ls = 1;
int sel = 4;
std::vector<int> L_list({8, 12, 16, 24, 32});
@@ -721,84 +834,170 @@ int main(int argc, char **argv)
std::vector<double> dwf4;
std::vector<double> staggered;
- int Ls = 1;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- for (int l = 0; l < L_list.size(); l++)
+ if (do_flops)
{
- wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
- }
+ Ls = 1;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage << " Wilson dslash 4D vectorised" << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ for (int l = 0; l < L_list.size(); l++)
+ {
+ wilson.push_back(Benchmark::DWF(Ls, L_list[l]));
+ }
- Ls = 12;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- for (int l = 0; l < L_list.size(); l++)
- {
- double result = Benchmark::DWF(Ls, L_list[l]);
- dwf4.push_back(result);
- }
+ Ls = 12;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage << " Domain wall dslash 4D vectorised" << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ for (int l = 0; l < L_list.size(); l++)
+ {
+ double result = Benchmark::DWF(Ls, L_list[l]);
+ dwf4.push_back(result);
+ }
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised" << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- for (int l = 0; l < L_list.size(); l++)
- {
- double result = Benchmark::Staggered(L_list[l]);
- staggered.push_back(result);
- }
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage << " Improved Staggered dslash 4D vectorised"
+ << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ for (int l = 0; l < L_list.size(); l++)
+ {
+ double result = Benchmark::Staggered(L_list[l]);
+ staggered.push_back(result);
+ }
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
- for (int l = 0; l < L_list.size(); l++)
- {
- std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t " << dwf4[l] << " \t\t " << staggered[l] << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
+ for (int l = 0; l < L_list.size(); l++)
+ {
+ std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
+ << dwf4[l] << " \t\t " << staggered[l] << std::endl;
+ }
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
}
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
int NN = NN_global;
if (do_memory)
{
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << " Memory benchmark " << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
Benchmark::Memory();
}
if (do_su4)
{
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
Benchmark::SU4();
}
if (do_comms)
{
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
std::cout << GridLogMessage << " Communications benchmark " << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "======================================================================="
+ "==========="
+ << std::endl;
Benchmark::Comms();
}
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
- for (int l = 0; l < L_list.size(); l++)
+ if (do_flops)
{
- std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t " << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
- }
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
+ for (int l = 0; l < L_list.size(); l++)
+ {
+ std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
+ << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
+ }
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
- std::cout << GridLogMessage << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN << " Mflop/s per node" << std::endl;
- std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+" << dwf4[selm1] / NN << ") " << std::endl;
- std::cout << std::setprecision(3);
- std::cout << GridLogMessage << "==================================================================================" << std::endl;
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ std::cout << GridLogMessage
+ << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
+ << " Mflop/s per node" << std::endl;
+ std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
+ << dwf4[selm1] / NN << ") " << std::endl;
+ std::cout << std::setprecision(3);
+ std::cout
+ << GridLogMessage
+ << "========================================================================="
+ "========="
+ << std::endl;
+ }
Grid_finalize();
}
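On the comparison point printed at the end of main: assuming selm1 = sel - 1
(it is declared outside the hunks shown, as in Grid's Benchmark_ITT), the
figure is the per-node average of two DWF volumes. A sketch:

#include <vector>

// Average of the DWF results at indices sel and sel - 1, divided by the
// node count NN, i.e. 0.5 * (dwf4[sel] + dwf4[selm1]) / NN as printed above.
double comparison_point(const std::vector<double> &dwf4, int sel, int NN)
{
  int selm1 = sel - 1; // assumed definition
  return 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
}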
diff --git a/Grid/Benchmark_IO.cpp b/Grid/Benchmark_IO.cpp
index da6b78f..96ef3e3 100644
--- a/Grid/Benchmark_IO.cpp
+++ b/Grid/Benchmark_IO.cpp
@@ -32,23 +32,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#ifdef HAVE_LIME
using namespace Grid;
-std::string filestem(const int l)
-{
- return "iobench_l" + std::to_string(l);
-}
+std::string filestem(const int l) { return "iobench_l" + std::to_string(l); }
-int vol(const int i)
-{
- return BENCH_IO_LMIN + 2 * i;
-}
+int vol(const int i) { return BENCH_IO_LMIN + 2 * i; }
-int volInd(const int l)
-{
- return (l - BENCH_IO_LMIN) / 2;
-}
+int volInd(const int l) { return (l - BENCH_IO_LMIN) / 2; }
-template <typename Mat>
-void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
+template <typename Mat> void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
{
auto nr = data[0].rows(), nc = data[0].cols();
Eigen::MatrixXd sqSum(nr, nc);
@@ -66,11 +56,11 @@ void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data)
mean /= n;
}
-#define grid_printf(...) \
- { \
- char _buf[1024]; \
- sprintf(_buf, __VA_ARGS__); \
- MSG << _buf; \
+#define grid_printf(...) \
+ { \
+ char _buf[1024]; \
+ sprintf(_buf, __VA_ARGS__); \
+ MSG << _buf; \
}
enum
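One note on the realigned grid_printf macro: it still formats into a fixed
1024-byte buffer with sprintf. If overflow were ever a concern, a
bounds-checked variant could look like this (a sketch, not part of the patch):

#include <cstdio> // snprintf

#define grid_printf_safe(...)                  \
  {                                            \
    char _buf[1024];                           \
    snprintf(_buf, sizeof(_buf), __VA_ARGS__); \
    MSG << _buf;                               \
  }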
@@ -173,47 +163,49 @@ int main(int argc, char **argv)
MSG << "SUMMARY" << std::endl;
MSG << BIGSEP << std::endl;
MSG << "Summary of individual results (all results in MB/s)." << std::endl;
- MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+ MSG << "Every second colum gives the standard deviation of the previous column."
+ << std::endl;
MSG << std::endl;
- grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n",
- "L", "std read", "std dev", "std write", "std dev",
- "Grid read", "std dev", "Grid write", "std dev");
+ grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", "L", "std read", "std dev",
+ "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
- grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
- l, mean(volInd(l), sRead), stdDev(volInd(l), sRead),
- mean(volInd(l), sWrite), stdDev(volInd(l), sWrite),
- mean(volInd(l), gRead), stdDev(volInd(l), gRead),
- mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
+ grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", l,
+ mean(volInd(l), sRead), stdDev(volInd(l), sRead), mean(volInd(l), sWrite),
+ stdDev(volInd(l), sWrite), mean(volInd(l), gRead),
+ stdDev(volInd(l), gRead), mean(volInd(l), gWrite),
+ stdDev(volInd(l), gWrite));
}
MSG << std::endl;
- MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
+ MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)"
+ << std::endl;
MSG << std::endl;
- grid_printf("%4s %12s %12s %12s %12s\n",
- "L", "std read", "std write", "Grid read", "Grid write");
+ grid_printf("%4s %12s %12s %12s %12s\n", "L", "std read", "std write", "Grid read",
+ "Grid write");
for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2)
{
- grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n",
- l, rob(volInd(l), sRead), rob(volInd(l), sWrite),
- rob(volInd(l), gRead), rob(volInd(l), gWrite));
+ grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", l, rob(volInd(l), sRead),
+ rob(volInd(l), sWrite), rob(volInd(l), gRead), rob(volInd(l), gWrite));
}
MSG << std::endl;
- MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl;
- MSG << "Every second colum gives the standard deviation of the previous column." << std::endl;
+ MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX
+ << "^4 (all results in MB/s)." << std::endl;
+ MSG << "Every second colum gives the standard deviation of the previous column."
+ << std::endl;
MSG << std::endl;
- grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n",
- "std read", "std dev", "std write", "std dev",
- "Grid read", "std dev", "Grid write", "std dev");
- grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n",
- avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
- avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
+ grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "std read", "std dev",
+ "std write", "std dev", "Grid read", "std dev", "Grid write", "std dev");
+ grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", avMean(sRead),
+ avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), avMean(gRead),
+ avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
MSG << std::endl;
- MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
+ MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)"
+ << std::endl;
MSG << std::endl;
- grid_printf("%12s %12s %12s %12s\n",
- "std read", "std write", "Grid read", "Grid write");
- grid_printf("%12.1f %12.1f %12.1f %12.1f\n",
- avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite));
+ grid_printf("%12s %12s %12s %12s\n", "std read", "std write", "Grid read",
+ "Grid write");
+ grid_printf("%12.1f %12.1f %12.1f %12.1f\n", avRob(sRead), avRob(sWrite), avRob(gRead),
+ avRob(gWrite));
Grid_finalize();
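The robustness figure quoted in the summaries is defined in the output text
itself (rob = 100% - std dev / mean, so 100 means perfectly reproducible); as
a one-line sketch:

// Robustness in percent: 100 minus the relative standard deviation.
double robustness(double mean, double stdDev) { return 100. - 100. * stdDev / mean; }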
diff --git a/Grid/Benchmark_IO.hpp b/Grid/Benchmark_IO.hpp
index 24e2214..d71e943 100644
--- a/Grid/Benchmark_IO.hpp
+++ b/Grid/Benchmark_IO.hpp
@@ -20,9 +20,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <Grid/Grid.h>
#define MSG std::cout << GridLogMessage
-#define SEP \
+#define SEP \
"-----------------------------------------------------------------------------"
-#define BIGSEP \
+#define BIGSEP \
"============================================================================="
#ifdef HAVE_LIME
@@ -36,16 +36,15 @@ namespace Grid
// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API
//
- // template <typename Field>
- // void stdWrite(const std::string filestem, Field &vec)
+ // template <typename Field> void stdWrite(const std::string filestem, Field &vec)
// {
- // std::string rankStr = std::to_string(vec.Grid()->ThisRank());
- // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
- // size_t size;
- // uint32_t crc;
+ // std::string rankStr = std::to_string(vec.Grid()->ThisRank());
+ // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb");
+ // size_t size;
+ // uint32_t crc;
// GridStopWatch ioWatch, crcWatch;
- // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+ // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
// autoView(vec_v, vec, CpuRead);
// crcWatch.Start();
// crc = GridChecksum::crc32(vec_v.cpu_ptr, size);
@@ -53,36 +52,39 @@ namespace Grid
// crcWatch.Stop();
// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl;
// ioWatch.Start();
- // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+ // std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+ // vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
- // p.size = size;
- // p.time = ioWatch.useconds();
- // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
+ // p.size = size;
+ // p.time = ioWatch.useconds();
+ // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
- // << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+ // << ",
+ // "
+ // << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
- //
- // template <typename Field>
- // void stdRead(Field &vec, const std::string filestem)
+
+ // template <typename Field> void stdRead(Field &vec, const std::string filestem)
// {
- // std::string rankStr = std::to_string(vec.Grid()->ThisRank());
- // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
- // size_t size;
- // uint32_t crcRead, crcData;
+ // std::string rankStr = std::to_string(vec.Grid()->ThisRank());
+ // std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb");
+ // size_t size;
+ // uint32_t crcRead, crcData;
// GridStopWatch ioWatch, crcWatch;
- // size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object);
+ // size = vec.Grid()->lSites() * sizeof(typename Field::scalar_object);
// crcWatch.Start();
// std::fread(&crcRead, sizeof(uint32_t), 1, file);
// crcWatch.Stop();
// {
// autoView(vec_v, vec, CpuWrite);
// ioWatch.Start();
- // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file);
+ // std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object),
+ // vec.Grid()->lSites(), file);
// ioWatch.Stop();
// std::fclose(file);
// }
@@ -96,19 +98,19 @@ namespace Grid
// assert(crcData == crcRead);
// size *= vec.Grid()->ProcessorCount();
// auto &p = BinaryIO::lastPerf;
- // p.size = size;
- // p.time = ioWatch.useconds();
- // p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6);
- // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
- // << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+ // p.size = size;
+ // p.time = ioWatch.useconds();
+ // p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
+ // MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+ // << p.mbytesPerSecond << " MB/s" << std::endl;
// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
// }
- template <typename Field>
- void stdWrite(const std::string filestem, Field &vec)
+ template <typename Field> void stdWrite(const std::string filestem, Field &vec)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
- std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary);
+ std::ofstream file(filestem + "." + rankStr + ".bin",
+ std::ios::out | std::ios::binary);
size_t size, sizec;
uint32_t crc;
GridStopWatch ioWatch, crcWatch;
@@ -130,16 +132,16 @@ namespace Grid
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
- MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()
- << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+ MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+ << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
- template <typename Field>
- void stdRead(Field &vec, const std::string filestem)
+ template <typename Field> void stdRead(Field &vec, const std::string filestem)
{
std::string rankStr = std::to_string(vec.Grid()->ThisRank());
- std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary);
+ std::ifstream file(filestem + "." + rankStr + ".bin",
+ std::ios::in | std::ios::binary);
size_t size, sizec;
uint32_t crcRead, crcData;
GridStopWatch ioWatch, crcWatch;
@@ -168,13 +170,12 @@ namespace Grid
p.size = size;
p.time = ioWatch.useconds();
p.mbytesPerSecond = size / 1024. / 1024. / (ioWatch.useconds() / 1.e6);
- MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed()
- << ", " << p.mbytesPerSecond << " MB/s" << std::endl;
+ MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() << ", "
+ << p.mbytesPerSecond << " MB/s" << std::endl;
MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl;
}
- template <typename Field>
- void limeWrite(const std::string filestem, Field &vec)
+ template <typename Field> void limeWrite(const std::string filestem, Field &vec)
{
emptyUserRecord record;
ScidacWriter binWriter(vec.Grid()->IsBoss());
@@ -184,8 +185,7 @@ namespace Grid
binWriter.close();
}
- template <typename Field>
- void limeRead(Field &vec, const std::string filestem)
+ template <typename Field> void limeRead(Field &vec, const std::string filestem)
{
emptyUserRecord record;
ScidacReader binReader;
@@ -225,12 +225,13 @@ namespace Grid
template <typename Field>
void writeBenchmark(const Coordinate &latt, const std::string filename,
- const WriterFn<Field> &write,
- const unsigned int Ls = 1, const bool rb = false)
+ const WriterFn<Field> &write, const unsigned int Ls = 1,
+ const bool rb = false)
{
auto mpi = GridDefaultMpi();
auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
- std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
+ std::shared_ptr<GridCartesian> gBasePt(
+ SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
std::shared_ptr<GridBase> gPt;
std::random_device rd;
@@ -251,12 +252,13 @@ namespace Grid
template <typename Field>
void readBenchmark(const Coordinate &latt, const std::string filename,
- const ReaderFn<Field> &read,
- const unsigned int Ls = 1, const bool rb = false)
+ const ReaderFn<Field> &read, const unsigned int Ls = 1,
+ const bool rb = false)
{
auto mpi = GridDefaultMpi();
auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd());
- std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
+ std::shared_ptr<GridCartesian> gBasePt(
+ SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
std::shared_ptr<GridBase> gPt;
makeGrid(gPt, gBasePt, Ls, rb);
diff --git a/Grid/Benchmark_comms_host_device.cpp b/Grid/Benchmark_comms_host_device.cpp
index 5a4aae4..e213859 100644
--- a/Grid/Benchmark_comms_host_device.cpp
+++ b/Grid/Benchmark_comms_host_device.cpp
@@ -34,8 +34,7 @@ struct time_statistics
mean = sum / v.size();
std::vector<double> diff(v.size());
- std::transform(v.begin(), v.end(), diff.begin(), [=](double x)
- { return x - mean; });
+ std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size() * (v.size() - 1)));
@@ -50,8 +49,7 @@ void header()
std::cout << GridLogMessage << " L "
<< "\t"
<< " Ls "
- << "\t"
- << std::setw(11) << "bytes\t\t"
+ << "\t" << std::setw(11) << "bytes\t\t"
<< "MB/s uni"
<< "\t"
<< "MB/s bidi" << std::endl;
@@ -64,7 +62,8 @@ int main(int argc, char **argv)
Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
- std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
+ std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads"
+ << std::endl;
int Nloop = 250;
int nmu = 0;
@@ -73,13 +72,21 @@ int main(int argc, char **argv)
if (mpi_layout[mu] > 1)
nmu++;
- std::cout << GridLogMessage << "Number of iterations to average: " << Nloop << std::endl;
+ std::cout << GridLogMessage << "Number of iterations to average: " << Nloop
+ << std::endl;
std::vector<double> t_time(Nloop);
// time_statistics timestat;
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
- std::cout << GridLogMessage << "= Benchmarking sequential halo exchange from host memory " << std::endl;
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "========================================================================="
+ "==========================="
+ << std::endl;
+ std::cout << GridLogMessage
+ << "= Benchmarking sequential halo exchange from host memory " << std::endl;
+ std::cout << GridLogMessage
+ << "========================================================================="
+ "==========================="
+ << std::endl;
header();
for (int lat = 8; lat <= maxlat; lat += 4)
@@ -87,9 +94,7 @@ int main(int argc, char **argv)
for (int Ls = 8; Ls <= 8; Ls *= 2)
{
- Coordinate latt_size({lat * mpi_layout[0],
- lat * mpi_layout[1],
- lat * mpi_layout[2],
+ Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@@ -127,22 +132,16 @@ int main(int argc, char **argv)
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
- Grid.SendToRecvFrom((void *)&xbuf[mu][0],
- xmit_to_rank,
- (void *)&rbuf[mu][0],
- recv_from_rank,
- bytes);
+ Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
+ (void *)&rbuf[mu][0], recv_from_rank, bytes);
}
comm_proc = mpi_layout[mu] - 1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
- Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0],
- xmit_to_rank,
- (void *)&rbuf[mu + 4][0],
- recv_from_rank,
- bytes);
+ Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
+ (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
}
}
Grid.Barrier();
@@ -154,17 +153,24 @@ int main(int argc, char **argv)
double bidibytes = xbytes + rbytes;
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
- << std::setw(11) << bytes << std::fixed << std::setprecision(1) << std::setw(7) << " "
- << std::right << xbytes / mean << " "
+ << std::setw(11) << bytes << std::fixed << std::setprecision(1)
+ << std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
}
}
}
}
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
- std::cout << GridLogMessage << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "========================================================================="
+ "==========================="
+ << std::endl;
+ std::cout << GridLogMessage
+ << "= Benchmarking sequential halo exchange from GPU memory " << std::endl;
+ std::cout << GridLogMessage
+ << "========================================================================="
+ "==========================="
+ << std::endl;
header();
for (int lat = 8; lat <= maxlat; lat += 4)
@@ -172,9 +178,7 @@ int main(int argc, char **argv)
for (int Ls = 8; Ls <= 8; Ls *= 2)
{
- Coordinate latt_size({lat * mpi_layout[0],
- lat * mpi_layout[1],
- lat * mpi_layout[2],
+ Coordinate latt_size({lat * mpi_layout[0], lat * mpi_layout[1], lat * mpi_layout[2],
lat * mpi_layout[3]});
GridCartesian Grid(latt_size, simd_layout, mpi_layout);
@@ -212,22 +216,16 @@ int main(int argc, char **argv)
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
- Grid.SendToRecvFrom((void *)&xbuf[mu][0],
- xmit_to_rank,
- (void *)&rbuf[mu][0],
- recv_from_rank,
- bytes);
+ Grid.SendToRecvFrom((void *)&xbuf[mu][0], xmit_to_rank,
+ (void *)&rbuf[mu][0], recv_from_rank, bytes);
}
comm_proc = mpi_layout[mu] - 1;
{
std::vector<CommsRequest_t> requests;
Grid.ShiftedRanks(mu, comm_proc, xmit_to_rank, recv_from_rank);
- Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0],
- xmit_to_rank,
- (void *)&rbuf[mu + 4][0],
- recv_from_rank,
- bytes);
+ Grid.SendToRecvFrom((void *)&xbuf[mu + 4][0], xmit_to_rank,
+ (void *)&rbuf[mu + 4][0], recv_from_rank, bytes);
}
}
Grid.Barrier();
@@ -239,8 +237,8 @@ int main(int argc, char **argv)
double bidibytes = xbytes + rbytes;
std::cout << GridLogMessage << std::setw(4) << lat << "\t" << Ls << "\t"
- << std::setw(11) << bytes << std::fixed << std::setprecision(1) << std::setw(7) << " "
- << std::right << xbytes / mean << " "
+ << std::setw(11) << bytes << std::fixed << std::setprecision(1)
+ << std::setw(7) << " " << std::right << xbytes / mean << " "
<< "\t\t" << std::setw(7) << bidibytes / mean << std::endl;
}
}
@@ -253,9 +251,15 @@ int main(int argc, char **argv)
}
}
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "========================================================================="
+ "==========================="
+ << std::endl;
std::cout << GridLogMessage << "= All done; Bye Bye" << std::endl;
- std::cout << GridLogMessage << "====================================================================================================" << std::endl;
+ std::cout << GridLogMessage
+ << "========================================================================="
+ "==========================="
+ << std::endl;
Grid_finalize();
}
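A units note on the uni/bidi columns in this file: timings are in
microseconds, so dividing bytes by the mean time directly yields MB/s
(decimal) with no further scaling; a sketch:

// 1 byte / usec = 1e6 bytes / s, i.e. 1 MB/s (decimal), which is why the
// table prints xbytes / mean and bidibytes / mean as-is.
double mb_per_s(double bytes, double mean_usec) { return bytes / mean_usec; }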
diff --git a/Grid/Benchmark_dwf_fp32.cpp b/Grid/Benchmark_dwf_fp32.cpp
index ee2dd2d..c0fcf7c 100644
--- a/Grid/Benchmark_dwf_fp32.cpp
+++ b/Grid/Benchmark_dwf_fp32.cpp
@@ -30,8 +30,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
using namespace std;
using namespace Grid;
-template <class d>
-struct scal
+template <class d> struct scal
{
d internal;
};
@@ -69,13 +68,11 @@ int main(int argc, char **argv)
json["single_site_flops"] = single_site_flops;
GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
- GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()),
- GridDefaultMpi());
- GridRedBlackCartesian *UrbGrid =
- SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+ GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
+ GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
- GridRedBlackCartesian *FrbGrid =
- SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
+ GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
json["grid"] = FGrid->FullDimensions().toVector();
json["local_grid"] = FGrid->LocalDimensions().toVector();
@@ -83,11 +80,10 @@ int main(int argc, char **argv)
std::cout << GridLogMessage << "Making s innermost grids" << std::endl;
GridCartesian *sUGrid =
SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(), GridDefaultMpi());
- GridRedBlackCartesian *sUrbGrid =
- SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+
+ GridRedBlackCartesian *sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian *sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls, UGrid);
- GridRedBlackCartesian *sFrbGrid =
- SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
+ GridRedBlackCartesian *sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls, UGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
@@ -150,8 +146,7 @@ int main(int argc, char **argv)
{
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
- std::cout << GridLogMessage << "Setting up Cshift based reference "
- << std::endl;
+ std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
@@ -200,54 +195,45 @@ int main(int argc, char **argv)
json["ranks"] = NP;
json["nodes"] = NN;
- std::cout
- << GridLogMessage
- << "*****************************************************************"
- << std::endl;
- std::cout
- << GridLogMessage
- << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
- << std::endl;
- std::cout
- << GridLogMessage
- << "*****************************************************************"
- << std::endl;
- std::cout
- << GridLogMessage
- << "*****************************************************************"
- << std::endl;
std::cout << GridLogMessage
- << "* Benchmarking DomainWallFermionR::Dhop "
+ << "*****************************************************************"
<< std::endl;
- std::cout << GridLogMessage << "* Vectorising space-time by "
- << vComplexF::Nsimd() << std::endl;
- std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF)
- << " B" << std::endl;
+ std::cout << GridLogMessage
+ << "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm"
+ << std::endl;
+ std::cout << GridLogMessage
+ << "*****************************************************************"
+ << std::endl;
+ std::cout << GridLogMessage
+ << "*****************************************************************"
+ << std::endl;
+ std::cout << GridLogMessage
+ << "* Benchmarking DomainWallFermionR::Dhop " << std::endl;
+ std::cout << GridLogMessage << "* Vectorising space-time by " << vComplexF::Nsimd()
+ << std::endl;
+ std::cout << GridLogMessage << "* VComplexF size is " << sizeof(vComplexF) << " B"
+ << std::endl;
+
if (sizeof(RealF) == 4)
std::cout << GridLogMessage << "* SINGLE precision " << std::endl;
if (sizeof(RealF) == 8)
std::cout << GridLogMessage << "* DOUBLE precision " << std::endl;
#ifdef GRID_OMP
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute)
- std::cout << GridLogMessage << "* Using Overlapped Comms/Compute"
- << std::endl;
+ std::cout << GridLogMessage << "* Using Overlapped Comms/Compute" << std::endl;
if (WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute)
- std::cout << GridLogMessage << "* Using sequential comms compute"
- << std::endl;
+ std::cout << GridLogMessage << "* Using sequential comms compute" << std::endl;
#endif
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric)
- std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels"
- << std::endl;
+ std::cout << GridLogMessage << "* Using GENERIC Nc WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
- std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels"
- << std::endl;
+ std::cout << GridLogMessage << "* Using Nc=3 WilsonKernels" << std::endl;
if (WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm)
- std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels"
- << std::endl;
- std::cout
- << GridLogMessage
- << "*****************************************************************"
- << std::endl;
+ std::cout << GridLogMessage << "* Using Asm Nc=3 WilsonKernels" << std::endl;
+ std::cout << GridLogMessage
+ << "*****************************************************************"
+ << std::endl;
DomainWallFermionF Dw(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
int ncall = 300;
@@ -277,16 +263,13 @@ int main(int argc, char **argv)
auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
- double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) *
- simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
+ double data_rf = volume * ((2 * Nd + 1) * Nd * Nc + 2 * Nd * Nc * Nc) * simdwidth /
+ nsimd * ncall / (1024. * 1024. * 1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem =
- (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) *
- simdwidth / nsimd * ncall / (1024. * 1024. * 1024.);
-
- std::cout << GridLogMessage << "Called Dw " << ncall << " times in "
- << t1 - t0 << " us" << std::endl;
+ (volume * (2 * Nd + 1) * Nd * Nc + (volume / Ls) * 2 * Nd * Nc * Nc) * simdwidth /
+ nsimd * ncall / (1024. * 1024. * 1024.);
json["Dw"]["calls"] = ncall;
json["Dw"]["time"] = t1 - t0;
@@ -296,15 +279,16 @@ int main(int argc, char **argv)
json["Dw"]["RF"] = 1000000. * data_rf / ((t1 - t0));
json["Dw"]["mem"] = 1000000. * data_mem / ((t1 - t0));
- // std::cout<Barrier();
@@ -478,12 +455,12 @@ int main(int argc, char **argv)
json["Deo"]["mflops_per_rank"] = flops / (t1 - t0) / NP;
json["Deo"]["mflops_per_node"] = flops / (t1 - t0) / NN;
- std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0)
+ std::cout << GridLogMessage << "Deo mflop/s = " << flops / (t1 - t0) << std::endl;
+ std::cout << GridLogMessage << "Deo mflop/s per rank " << flops / (t1 - t0) / NP
<< std::endl;
- std::cout << GridLogMessage << "Deo mflop/s per rank "
- << flops / (t1 - t0) / NP << std::endl;
- std::cout << GridLogMessage << "Deo mflop/s per node "
- << flops / (t1 - t0) / NN << std::endl;
+ std::cout << GridLogMessage << "Deo mflop/s per node " << flops / (t1 - t0) / NN
+ << std::endl;
+
Dw.Report();
}
Dw.DhopEO(src_o, r_e, DaggerNo);
@@ -510,18 +487,16 @@ int main(int argc, char **argv)
pickCheckerboard(Even, src_e, err);
pickCheckerboard(Odd, src_o, err);
- std::cout << GridLogMessage << "norm diff even " << norm2(src_e)
- << std::endl;
- std::cout << GridLogMessage << "norm diff odd " << norm2(src_o)
- << std::endl;
+ std::cout << GridLogMessage << "norm diff even " << norm2(src_e) << std::endl;
+ std::cout << GridLogMessage << "norm diff odd " << norm2(src_o) << std::endl;
assert(norm2(src_e) < 1.0e-4);
assert(norm2(src_o) < 1.0e-4);
if (!json_filename.empty())
{
- std::cout << GridLogMessage << "writing benchmark results to "
- << json_filename << std::endl;
+ std::cout << GridLogMessage << "writing benchmark results to " << json_filename
+ << std::endl;
int me = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &me);
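A note on the `data_rf` and `data_mem` estimates reflowed above: both count the bytes moved by `ncall` applications of `Dw`, assuming `2 * Nd + 1` spinor accesses of `Nd * Nc` complex numbers per site (the neighbours plus the result) and `2 * Nd` gauge links of `Nc * Nc` complex numbers, with the gauge field counted per 5d site in the register-file ("RF") figure but only per 4d site (`volume / Ls`) in the memory figure. A standalone sketch with invented lattice parameters, replacing `simdwidth / nsimd` by the per-scalar-complex byte count it represents:

```cpp
#include <cstdio>

int main()
{
  // invented example parameters, not taken from the benchmark
  const double Nd = 4., Nc = 3., Ls = 12.;
  const double volume = 24. * 24. * 24. * 48. * Ls; // 5d site count
  const double ncall = 300.;
  const double complex_B = 8.; // one single-precision complex: 2 * 4 B

  // register-file traffic: gauge links counted for every 5d site
  double data_rf = volume * ((2. * Nd + 1.) * Nd * Nc + 2. * Nd * Nc * Nc) *
                   complex_B * ncall / (1024. * 1024. * 1024.);
  // memory traffic: gauge links counted once per 4d site only
  double data_mem =
      (volume * (2. * Nd + 1.) * Nd * Nc + (volume / Ls) * 2. * Nd * Nc * Nc) *
      complex_B * ncall / (1024. * 1024. * 1024.);

  std::printf("RF  traffic estimate: %.1f GiB\n", data_rf);
  std::printf("mem traffic estimate: %.1f GiB\n", data_mem);
  return 0;
}
```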
diff --git a/Grid/Common.hpp b/Grid/Common.hpp
new file mode 100644
index 0000000..ccae02c
--- /dev/null
+++ b/Grid/Common.hpp
@@ -0,0 +1,36 @@
+/*
+Copyright © 2022 Antonin Portelli
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#ifndef Grid_Benchmarks_Common_hpp_
+#define Grid_Benchmarks_Common_hpp_
+
+#ifndef GRID_MSG
+#define GRID_MSG std::cout << GridLogMessage
+#endif
+
+#ifndef GRID_MSG_MAXSIZE
+#define GRID_MSG_MAXSIZE 1024
+#endif
+
+#define grid_printf(...) \
+ { \
+ char _buf[GRID_MSG_MAXSIZE]; \
+ snprintf(_buf, GRID_MSG_MAXSIZE, __VA_ARGS__); \
+ GRID_MSG << _buf; \
+ }
+
+#endif // Grid_Benchmarks_Common_hpp_
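The `grid_printf` macro introduced in `Common.hpp` formats into a fixed-size stack buffer with `snprintf` and streams the result through `GRID_MSG`, i.e. Grid's `GridLogMessage` logger. Since the header includes nothing itself, the including translation unit must provide both `GridLogMessage` and `snprintf`. A minimal usage sketch (the `report` function and its format string are illustrative, not part of the benchmarks):

```cpp
#include <Grid/Grid.h> // provides GridLogMessage
#include <cstdio>      // provides snprintf, used inside grid_printf

#include "Common.hpp"

using namespace Grid;

// print a result line with printf-style formatting through Grid's logger
void report(const double mflops)
{
  grid_printf("Deo mflop/s = %.1f\n", mflops);
}
```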
diff --git a/Grid/Readme.md b/Grid/Readme.md
new file mode 100644
index 0000000..10d6bbd
--- /dev/null
+++ b/Grid/Readme.md
@@ -0,0 +1,69 @@
+# Grid benchmarks
+
+This folder contains benchmarks for the [Grid](https://github.com/aportelli/) library.
+The benchmarks can be summarised as follows:
+
+- `Benchmark_Grid`: This benchmark measures floating-point performance for various
+fermion matrices, as well as bandwidth for different operations. Measurements are
+performed for a fixed range of problem sizes.
+
+## TL;DR
+Build and install Grid, all dependencies, and the benchmark with
+```bash
+systems/<system>/bootstrap-env.sh <env_dir> # build dependencies, takes a long time
+./build-grid.sh <env_dir> <config>          # build Grid
+./build-benchmark.sh <env_dir> <config>     # build benchmarks
+```
+where `<env_dir>` is an arbitrary directory where every product will be stored,
+`<system>` is a sub-directory of `systems` containing system-specific scripts
+(an existing preset or your own), and finally `<config>` is the name of a build config
+in `systems/<system>/grid-config.json`. After a successful execution the benchmark
+binaries will be in `<env_dir>/prefix/gridbench_<config>`.
+
+## Environment setup
+A complete runtime environment can be deployed using scripts from this repository.
+System-specific scripts are in the `systems` directory.
+
+You should first deploy the environment for the specific system you are using, for example
+```bash
+systems/tursa/bootstrap-env.sh ./env
+```
+will deploy the relevant environment for the
+[Tursa](https://www.epcc.ed.ac.uk/hpc-services/dirac-tursa-gpu) supercomputer in `./env`.
+This step might compile a large set of packages from source and can take a long time
+to complete.
+
+After that, the environment directory (`./env` in the example above) will contain an
+`env.sh` file that needs to be sourced to activate the environment
+```bash
+source ./env/env.sh
+```
+Additional `env-*.sh` scripts can be sourced afterwards to activate more specific
+environments; this should always be done after sourcing `env.sh` as above.
+
+## Building the benchmarks
+The environment directory contains a `grid-config.json` file specifying compilation flag
+configurations for Grid (please see Grid's repository for documentation). All entries have
+the form
+```json
+{
+ "name": "foo", // name of the configuration
+ "env-script": "bar.sh", // script to source before building
+ // (path relative to the environment directory)
+ "commit": "...", // Grid commit to use
+ // (anything that can be an argument of git checkout)
+ "config-options": "..." // options to pass to the configure script,
+ "env" : { // environment variables
+ "VAR": "value" // export VAR="value" before building
+ }
+}
+```
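+As an illustration, a hypothetical entry (the name, commit, and options below are
+invented for the example) could look like
+```json
+{
+  "name": "gpu",
+  "env-script": "env-gpu.sh",
+  "commit": "develop",
+  "config-options": "--enable-simd=GPU --enable-comms=mpi",
+  "env": {
+    "CXX": "nvcc"
+  }
+}
+```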
+Grid can then be built with
+```bash
+./build-grid.sh <env_dir> <config>
+```
+where `<env_dir>` is the environment directory and `<config>` is the build config name in
+`grid-config.json`. Similarly, the benchmarks can then be built with
+```bash
+./build-benchmark.sh <env_dir> <config>
+```
+
+## Running the benchmarks
+After building the benchmarks as above you can find the binaries in
+`<env_dir>/prefix/gridbench_<config>`.
\ No newline at end of file
diff --git a/Grid/build-benchmark.sh b/Grid/build-benchmark.sh
index 4b973d5..92ce696 100755
--- a/Grid/build-benchmark.sh
+++ b/Grid/build-benchmark.sh
@@ -16,18 +16,16 @@ cd "${env_dir}"
env_dir=$(pwd -P)
cd "${call_dir}"
build_dir="${env_dir}/build/Grid-benchmarks/${cfg}"
-if [ -d "${build_dir}" ]; then
- echo "error: directory '${build_dir}' exists"
- exit 1
-fi
mkdir -p "${build_dir}"
source "${env_dir}/env.sh"
entry=$(jq ".configs[]|select(.name==\"${cfg}\")" "${env_dir}"/grid-config.json)
env_script=$(echo "${entry}" | jq -r ".\"env-script\"")
cd "${build_dir}" || return
source "${env_dir}/${env_script}"
-"${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
- --prefix="${env_dir}/prefix/gridbench_${cfg}"
+if [ ! -f Makefile ]; then
+ "${script_dir}/configure" --with-grid="${env_dir}/prefix/grid_${cfg}" \
+ --prefix="${env_dir}/prefix/gridbench_${cfg}"
+fi
make -j 128
make install
cd "${call_dir}"
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..b3aefae
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,8 @@
+# Lattice benchmarks
+
+This repository is an attempt at packaging benchmarks for various libraries used for
+lattice field theory simulations. It currently features only the Grid library, but
+more will be added later.
+
+Libraries:
+- [Grid](https://github.com/aportelli/) - [Documentation](Grid/Readme.md)
\ No newline at end of file