diff --git a/Grid/Benchmark_Grid.cpp b/Grid/Benchmark_Grid.cpp
index 3878c1a..a4db975 100644
--- a/Grid/Benchmark_Grid.cpp
+++ b/Grid/Benchmark_Grid.cpp
@@ -1,6 +1,7 @@
 /*
 Copyright © 2015 Peter Boyle
 Copyright © 2022 Antonin Portelli
+Copyright © 2022 Simon Buerger
 
 This is a fork of Benchmark_ITT.cpp from Grid
 
@@ -24,13 +25,6 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 using namespace Grid;
 
-std::vector<int> L_list;
-std::vector<int> Ls_list;
-std::vector<double> mflop_list;
-
-double mflop_ref;
-double mflop_ref_err;
-
 int NN_global;
 
 nlohmann::json json_results;
@@ -58,18 +52,6 @@ struct time_statistics
   }
 };
 
-void comms_header()
-{
-  std::cout << GridLogMessage << " L "
-            << "\t"
-            << " Ls "
-            << "\t"
-            << "bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)" << std::endl;
-};
-
-Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
-                        Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT};
-
 struct controls
 {
   int Opt;
@@ -133,10 +115,9 @@ class Benchmark
     std::vector<double> t_time(Nloop);
     time_statistics timestat;
 
-    grid_big_sep();
-    std::cout << GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "
+    std::cout << GridLogMessage << "Benchmarking threaded STENCIL halo exchange in "
               << nmu << " dimensions" << std::endl;
-    grid_big_sep();
+    grid_small_sep();
     grid_printf("%5s %5s %15s %15s %15s %15s %15s\n", "L", "dir", "payload (B)",
                 "time (usec)", "rate (GB/s)", "std dev", "max");
@@ -368,10 +349,10 @@ class Benchmark
     RealD mass = 0.1;
     RealD M5 = 1.8;
 
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;
 
     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size
@@ -486,8 +467,6 @@ class Benchmark
     FGrid->Broadcast(0, &ncall, sizeof(ncall));
 
-    // std::cout << GridLogMessage << " Estimate " << ncall << " calls per
-    // second"<<std::endl;
 
-      mflops = flops / timestat.mean;
-      mflops_all.push_back(mflops);
-      if (mflops_best == 0)
-        mflops_best = mflops;
-      if (mflops_worst == 0)
-        mflops_worst = mflops;
-      if (mflops > mflops_best)
-        mflops_best = mflops;
-      if (mflops < mflops_worst)
-        mflops_worst = mflops;
+      gflops = flops / timestat.mean / 1000.;
+      gflops_all.push_back(gflops);
+      if (gflops_best == 0)
+        gflops_best = gflops;
+      if (gflops_worst == 0)
+        gflops_worst = gflops;
+      if (gflops > gflops_best)
+        gflops_best = gflops;
+      if (gflops < gflops_worst)
+        gflops_worst = gflops;
 
       std::cout << GridLogMessage << "Deo FlopsPerSite is " << fps << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                << "-" << mf_hi << std::endl;
+                << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                << "-" << gf_hi << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                << "Deo Gflop/s per rank " << gflops / NP << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per node " << mflops / NN << std::endl;
+                << "Deo Gflop/s per node " << gflops / NN << std::endl;
     }
 
     grid_small_sep();
     std::cout << GridLogMessage << L << "^4 x " << Ls
-              << " Deo Best mflop/s = " << mflops_best << " ; "
-              << mflops_best / NN << " per node " << std::endl;
+              << " Deo Best Gflop/s = " << gflops_best << " ; "
+              << gflops_best / NN << " per node " << std::endl;
     std::cout << GridLogMessage << L << "^4 x " << Ls
-              << " Deo Worst mflop/s = " << mflops_worst << " ; "
-              << mflops_worst / NN << " per node " << std::endl;
+              << " Deo Worst Gflop/s = " << gflops_worst << " ; "
+              << gflops_worst / NN << " per node " << std::endl;
     std::cout << GridLogMessage << fmt << std::endl;
     std::cout << GridLogMessage;
-    for (int i = 0; i < mflops_all.size(); i++)
+    for (int i = 0; i < gflops_all.size(); i++)
     {
-      std::cout << mflops_all[i] / NN << " ; ";
+      std::cout << gflops_all[i] / NN << " ; ";
     }
     std::cout << std::endl;
   }
 
-    return mflops_best;
+    return gflops_best;
   }
 
   static double Staggered(int L)
   {
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst = 0;
-    std::vector<double> mflops_all;
+    double gflops;
+    double gflops_best = 0;
+    double gflops_worst = 0;
+    std::vector<double> gflops_all;
 
     ///////////////////////////////////////////////////////
     // Set/Get the layout & grid size
@@ -700,51 +679,51 @@ class Benchmark
       double volume = 1;
       for (int mu = 0; mu < Nd; mu++)
        volume = volume * latt4[mu];
-      double flops = (1146.0 * volume) / 2;
-      double mf_hi, mf_lo, mf_err;
+      double flops = (1146.0 * volume) / 2.;
+      double gf_hi, gf_lo, gf_err;
 
       timestat.statistics(t_time);
-      mf_hi = flops / timestat.min;
-      mf_lo = flops / timestat.max;
-      mf_err = flops / timestat.min * timestat.err / timestat.mean;
+      gf_hi = flops / timestat.min / 1000.;
+      gf_lo = flops / timestat.max / 1000.;
+      gf_err = flops / timestat.min * timestat.err / timestat.mean / 1000.;
 
-      mflops = flops / timestat.mean;
-      mflops_all.push_back(mflops);
-      if (mflops_best == 0)
-        mflops_best = mflops;
-      if (mflops_worst == 0)
-        mflops_worst = mflops;
-      if (mflops > mflops_best)
-        mflops_best = mflops;
-      if (mflops < mflops_worst)
-        mflops_worst = mflops;
+      gflops = flops / timestat.mean / 1000.;
+      gflops_all.push_back(gflops);
+      if (gflops_best == 0)
+        gflops_best = gflops;
+      if (gflops_worst == 0)
+        gflops_worst = gflops;
+      if (gflops > gflops_best)
+        gflops_best = gflops;
+      if (gflops < gflops_worst)
+        gflops_worst = gflops;
 
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s = " << mflops << " (" << mf_err << ") " << mf_lo
-                << "-" << mf_hi << std::endl;
+                << "Deo Gflop/s = " << gflops << " (" << gf_err << ") " << gf_lo
+                << "-" << gf_hi << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per rank " << mflops / NP << std::endl;
+                << "Deo Gflop/s per rank " << gflops / NP << std::endl;
       std::cout << GridLogMessage << std::fixed << std::setprecision(1)
-                << "Deo mflop/s per node " << mflops / NN << std::endl;
+                << "Deo Gflop/s per node " << gflops / NN << std::endl;
     }
 
     grid_small_sep();
     std::cout << GridLogMessage << L
-              << "^4 Deo Best mflop/s = " << mflops_best << " ; "
-              << mflops_best / NN << " per node " << std::endl;
+              << "^4 Deo Best Gflop/s = " << gflops_best << " ; "
+              << gflops_best / NN << " per node " << std::endl;
     std::cout << GridLogMessage << L
-              << "^4 Deo Worst mflop/s = " << mflops_worst << " ; "
-              << mflops_worst / NN << " per node " << std::endl;
+              << "^4 Deo Worst Gflop/s = " << gflops_worst << " ; "
+              << gflops_worst / NN << " per node " << std::endl;
     std::cout << GridLogMessage << fmt << std::endl;
     std::cout << GridLogMessage;
-    for (int i = 0; i < mflops_all.size(); i++)
+    for (int i = 0; i < gflops_all.size(); i++)
     {
-      std::cout << mflops_all[i] / NN << " ; ";
+      std::cout << gflops_all[i] / NN << " ; ";
     }
     std::cout << std::endl;
   }
 
-    return mflops_best;
+    return gflops_best;
   }
 };
 
@@ -782,6 +761,30 @@ int main(int argc, char **argv)
   std::vector<double> dwf4;
   std::vector<double> staggered;
 
+  if (do_memory)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Memory();
+  }
+
+  if (do_su4)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::SU4();
+  }
+
+  if (do_comms)
+  {
+    grid_big_sep();
+    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
+    grid_big_sep();
+    Benchmark::Comms();
+  }
+
   if (do_flops)
   {
     Ls = 1;
@@ -810,68 +813,35 @@ int main(int argc, char **argv)
       staggered.push_back(result);
     }
 
+    int NN = NN_global;
+
     grid_big_sep();
-    std::cout << GridLogMessage << " Summary table Ls=" << Ls << std::endl;
+    std::cout << GridLogMessage << "Gflop/s/node Summary table Ls=" << Ls << std::endl;
     grid_big_sep();
-    std::cout << GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" << std::endl;
+    grid_printf("%5s %12s %12s %12s\n", "L", "Wilson", "DWF", "Staggered");
+    nlohmann::json tmp_flops;
     for (int l = 0; l < L_list.size(); l++)
     {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] << " \t\t "
-                << dwf4[l] << " \t\t " << staggered[l] << std::endl;
+      grid_printf("%5d %12.2f %12.2f %12.2f\n", L_list[l], wilson[l] / NN, dwf4[l] / NN,
+                  staggered[l] / NN);
+
       nlohmann::json tmp;
       tmp["L"] = L_list[l];
-      tmp["Mflops_wilson"] = wilson[l];
-      tmp["Mflops_dwf4"] = dwf4[l];
-      tmp["Mflops_staggered"] = staggered[l];
-      json_results["flops"].push_back(tmp);
-    }
-  }
-
-  int NN = NN_global;
-  if (do_memory)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Memory benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Memory();
-  }
-
-  if (do_su4)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " SU(4) benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::SU4();
-  }
-
-  if (do_comms)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Communications benchmark " << std::endl;
-    grid_big_sep();
-    Benchmark::Comms();
-  }
-
-  if (do_flops)
-  {
-    grid_big_sep();
-    std::cout << GridLogMessage << " Per Node Summary table Ls=" << Ls << std::endl;
-    grid_big_sep();
-    std::cout << GridLogMessage << " L \t\t Wilson\t\t DWF4\t\t Staggered " << std::endl;
-    for (int l = 0; l < L_list.size(); l++)
-    {
-      std::cout << GridLogMessage << L_list[l] << " \t\t " << wilson[l] / NN << " \t "
-                << dwf4[l] / NN << " \t " << staggered[l] / NN << std::endl;
+      tmp["Gflops_wilson"] = wilson[l] / NN;
+      tmp["Gflops_dwf4"] = dwf4[l] / NN;
+      tmp["Gflops_staggered"] = staggered[l] / NN;
+      tmp_flops["results"].push_back(tmp);
     }
 
     grid_big_sep();
     std::cout << GridLogMessage
              << " Comparison point result: " << 0.5 * (dwf4[sel] + dwf4[selm1]) / NN
-              << " Mflop/s per node" << std::endl;
+              << " Gflop/s per node" << std::endl;
    std::cout << GridLogMessage << " Comparison point is 0.5*(" << dwf4[sel] / NN << "+"
              << dwf4[selm1] / NN << ") " << std::endl;
    std::cout << std::setprecision(3);
    grid_big_sep();
-    json_results["comp_point_Mflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    tmp_flops["comparison_point_Gflops"] = 0.5 * (dwf4[sel] + dwf4[selm1]) / NN;
+    json_results["flops"] = tmp_flops;
   }
 
   if (!json_filename.empty())
diff --git a/Grid/systems/tursa/files/run.cpu.template.sh b/Grid/systems/tursa/files/run.cpu.template.sh
new file mode 100644
index 0000000..6e339ac
--- /dev/null
+++ b/Grid/systems/tursa/files/run.cpu.template.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+## This set of Slurm settings assumes that the AMD chips are using the BIOS setting NPS4 (4 MPI tasks per socket).
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=32
+#SBATCH --partition=@partition@
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=16
+export OMP_DISPLAY_AFFINITY=true
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=rc,sm,self
+export UCX_RNDV_THRESH=16384
+export UCX_MEMTYPE_CACHE=n
+export UCX_NET_DEVICES=mlx5_0:1
+
+export OMPI_MCA_BTL_SM_USE_KNEM=1
+export OMPI_MCA_coll_hcoll_enable=1
+export OMPI_MCA_coll_hcoll_np=0
+
+# IO environment ###############################################################
+if [ @nnodes@ -eq 1 ]; then
+    export OMPI_MCA_io=ompio
+else
+    export OMPI_MCA_io=romio321
+fi
+
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3 # are these needed here?
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'cpu' ]; then
+    source "${env_dir}/env-cpu.sh"
+else
+    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+    exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt=('--comms-overlap' '--comms-concurrent')
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date                     > "${job_info_dir}/start-date"
+set                      > "${job_info_dir}/env"
+ldd ${app}               > "${job_info_dir}/ldd"
+md5sum ${app}            > "${job_info_dir}/app-hash"
+readelf -a ${app}        > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+    ./cpu-mpi-wrapper.sh \
+    ${app} "${par}" "${opt[@]}" \
+    --mpi @mpi-geom@ \
+    --grid @grid-geom@ \
+    --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach this point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
diff --git a/Grid/systems/tursa/files/run.gpu.template.sh b/Grid/systems/tursa/files/run.gpu.template.sh
new file mode 100644
index 0000000..860c856
--- /dev/null
+++ b/Grid/systems/tursa/files/run.gpu.template.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2050,SC2170
+
+# using options from https://github.com/paboyle/Grid/tree/develop/systems/Tursa
+
+#SBATCH -J @job-name@
+#SBATCH -A @budget@
+#SBATCH -t 48:00:00
+#SBATCH --nodes=@nnodes@
+#SBATCH --ntasks=@ntasks@
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=@partition@
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+#SBATCH --qos=standard
+#SBATCH --no-requeue
+
+set -e
+
+# OpenMP/OpenMPI/UCX environment ###############################################
+export OMP_NUM_THREADS=8
+export OMPI_MCA_btl=^uct,openib
+export OMPI_MCA_pml=ucx
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+# IO environment ###############################################################
+
+if [ @nnodes@ -eq 1 ]; then
+    export OMPI_MCA_io=ompio
+else
+    export OMPI_MCA_io=romio321
+fi
+export OMPI_MCA_btl_openib_allow_ib=true
+export OMPI_MCA_btl_openib_device_type=infiniband
+export OMPI_MCA_btl_openib_if_exclude=mlx5_1,mlx5_2,mlx5_3
+
+# load environment #############################################################
+env_dir="$(readlink -f @env-dir@)"
+source "${env_dir}/env-base.sh"
+if [ "${SLURM_JOB_PARTITION}" = 'gpu' ]; then
+    source "${env_dir}/env-gpu.sh"
+else
+    echo "error: partition ${SLURM_JOB_PARTITION} not supported for this template" 1>&2
+    exit 1
+fi
+
+# application and parameters ###################################################
+app='@application@'
+opt=('--comms-overlap' '--comms-concurrent')
+par='@par@'
+
+# collect job information ######################################################
+job_info_dir=job/${SLURM_JOB_NAME}.${SLURM_JOB_ID}
+mkdir -p "${job_info_dir}"
+
+date                     > "${job_info_dir}/start-date"
+set                      > "${job_info_dir}/env"
+ldd ${app}               > "${job_info_dir}/ldd"
+md5sum ${app}            > "${job_info_dir}/app-hash"
+readelf -a ${app}        > "${job_info_dir}/elf"
+echo "${SLURM_JOB_NODELIST}" > "${job_info_dir}/nodes"
+cp "${BASH_SOURCE[0]}" "${job_info_dir}/script"
+if [ -n "${par}" ]; then cp "${par}" "${job_info_dir}/par"; fi
+
+# run! #########################################################################
+mpirun -np "${SLURM_NTASKS}" -x LD_LIBRARY_PATH --bind-to none \
+    ./gpu-mpi-wrapper.sh \
+    ${app} "${par}" "${opt[@]}" \
+    --mpi @mpi-geom@ \
+    --accelerator-threads 8 \
+    --grid @grid-geom@ \
+    --shm 2048 &> "${job_info_dir}/log"
+
+# if we reach this point the application exited successfully ###################
+touch "${job_info_dir}/success"
+date > "${job_info_dir}/end-date"
+
+################################################################################
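
Note on the JSON output after this change: the flop/s summary is now collected under a single "flops" object (per-node Gflop/s, one entry per lattice size, plus the comparison point) instead of top-level "flops" entries and "comp_point_Mflops". A minimal sketch of the resulting layout, assuming two lattice sizes in L_list; the numeric values are placeholders, not measurements:

{
  "flops": {
    "results": [
      { "L": 16, "Gflops_wilson": 0.0, "Gflops_dwf4": 0.0, "Gflops_staggered": 0.0 },
      { "L": 24, "Gflops_wilson": 0.0, "Gflops_dwf4": 0.0, "Gflops_staggered": 0.0 }
    ],
    "comparison_point_Gflops": 0.0
  }
}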